diff options
author | Jan Zielinski <jan.zielinski@intel.com> | 2021-06-09 13:19:44 +0200 |
---|---|---|
committer | Marge Bot <emma+marge@anholt.net> | 2021-12-06 23:37:50 +0000 |
commit | 855793c6c6bd372ea96681ecbd3f318ad71da223 (patch) | |
tree | cbd8efc0c9df58d3bdc2ba774cf46dcdcad21162 | |
parent | d22d328859e4a67e6ff738fbd22eaf1d5a09376a (diff) |
gallium/swr: Remove driver source
The OpenSWR will be maintained on a classic/LTS branch.
Reviewed-by: Dylan Baker <dylan@pnwbakers.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11264>
178 files changed, 0 insertions, 85594 deletions
diff --git a/src/gallium/drivers/swr/.clang-format b/src/gallium/drivers/swr/.clang-format deleted file mode 100644 index 0ec65a5de88..00000000000 --- a/src/gallium/drivers/swr/.clang-format +++ /dev/null @@ -1,64 +0,0 @@ ---- -Language: Cpp -AccessModifierOffset: -3 -AlignAfterOpenBracket: true -AlignEscapedNewlinesLeft: false -AlignOperands: false -AlignTrailingComments: false -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false -AllowShortCaseLabelsOnASingleLine: false -AllowShortIfStatementsOnASingleLine: false -AllowShortLoopsOnASingleLine: false -AllowShortFunctionsOnASingleLine: All -AlwaysBreakAfterDefinitionReturnType: true -AlwaysBreakTemplateDeclarations: false -AlwaysBreakBeforeMultilineStrings: false -BreakBeforeBinaryOperators: NonAssignment -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: true -BinPackParameters: false -BinPackArguments: false -ColumnLimit: 78 -ConstructorInitializerAllOnOneLineOrOnePerLine: false -ConstructorInitializerIndentWidth: 3 -DerivePointerAlignment: false -ExperimentalAutoDetectBinPacking: false -IndentCaseLabels: false -IndentWrappedFunctionNames: false -IndentFunctionDeclarationAfterType: false -MaxEmptyLinesToKeep: 2 -KeepEmptyLinesAtTheStartOfBlocks: true -NamespaceIndentation: Inner -ObjCBlockIndentWidth: 3 -ObjCSpaceAfterProperty: true -ObjCSpaceBeforeProtocolList: true -PenaltyBreakBeforeFirstCallParameter: 19 -PenaltyBreakComment: 300 -PenaltyBreakString: 1000 -PenaltyBreakFirstLessLess: 120 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 0 -PointerAlignment: Right -SpacesBeforeTrailingComments: 1 -Cpp11BracedListStyle: true -Standard: Cpp11 -IndentWidth: 3 -TabWidth: 8 -UseTab: Never -BreakBeforeBraces: Linux -SpacesInParentheses: false -SpacesInSquareBrackets: false -SpacesInAngles: false -SpaceInEmptyParentheses: false -SpacesInCStyleCastParentheses: false -SpaceAfterCStyleCast: false -SpacesInContainerLiterals: true 
-SpaceBeforeAssignmentOperators: true -ContinuationIndentWidth: 3 -CommentPragmas: '^ IWYU pragma:' -ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] -SpaceBeforeParens: ControlStatements -DisableFormat: false -... - diff --git a/src/gallium/drivers/swr/meson.build b/src/gallium/drivers/swr/meson.build deleted file mode 100644 index ac712d80461..00000000000 --- a/src/gallium/drivers/swr/meson.build +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright © 2017-2020 Intel Corporation - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
- -files_swr_common = files( - 'rasterizer/common/formats.cpp', - 'rasterizer/common/formats.h', - 'rasterizer/common/intrin.h', - 'rasterizer/common/isa.hpp', - 'rasterizer/common/os.cpp', - 'rasterizer/common/os.h', - 'rasterizer/common/rdtsc_buckets.cpp', - 'rasterizer/common/rdtsc_buckets.h', - 'rasterizer/common/rdtsc_buckets_shared.h', - 'rasterizer/common/rdtsc_buckets_shared.h', - 'rasterizer/common/simd16intrin.h', - 'rasterizer/common/simdintrin.h', - 'rasterizer/common/simdlib.hpp', - 'rasterizer/common/simdlib_interface.hpp', - 'rasterizer/common/simdlib_types.hpp', - 'rasterizer/common/swr_assert.cpp', - 'rasterizer/common/swr_assert.h', -) - -files_swr_mesa = files( - 'swr_loader.cpp', - 'swr_clear.cpp', - 'swr_context.cpp', - 'swr_context.h', - 'swr_draw.cpp', - 'swr_public.h', - 'swr_resource.h', - 'swr_screen.cpp', - 'swr_screen.h', - 'swr_state.cpp', - 'swr_state.h', - 'swr_tex_sample.cpp', - 'swr_tex_sample.h', - 'swr_scratch.h', - 'swr_scratch.cpp', - 'swr_shader.cpp', - 'swr_shader.h', - 'swr_memory.h', - 'swr_fence.h', - 'swr_fence.cpp', - 'swr_fence_work.h', - 'swr_fence_work.cpp', - 'swr_query.h', - 'swr_query.cpp', - 'rasterizer/jitter/blend_jit.cpp', - 'rasterizer/jitter/blend_jit.h', - 'rasterizer/jitter/builder.cpp', - 'rasterizer/jitter/builder.h', - 'rasterizer/jitter/builder_math.h', - 'rasterizer/jitter/builder_mem.cpp', - 'rasterizer/jitter/builder_mem.h', - 'rasterizer/jitter/builder_gfx_mem.cpp', - 'rasterizer/jitter/builder_gfx_mem.h', - 'rasterizer/jitter/builder_misc.cpp', - 'rasterizer/jitter/builder_misc.h', - 'rasterizer/jitter/fetch_jit.cpp', - 'rasterizer/jitter/fetch_jit.h', - 'rasterizer/jitter/jit_api.h', - 'rasterizer/jitter/JitManager.cpp', - 'rasterizer/jitter/JitManager.h', - 'rasterizer/jitter/streamout_jit.cpp', - 'rasterizer/jitter/streamout_jit.h', - 'rasterizer/jitter/shader_lib/DebugOutput.cpp', - 'rasterizer/jitter/shader_lib/Scatter.cpp', - 'rasterizer/jitter/functionpasses/lower_x86.cpp', - 
'rasterizer/memory/SurfaceState.h' -) - -files_swr_arch = files( - 'rasterizer/archrast/archrast.cpp', - 'rasterizer/archrast/archrast.h', - 'rasterizer/archrast/eventmanager.h', - 'rasterizer/core/api.cpp', - 'rasterizer/core/api.h', - 'rasterizer/core/arena.h', - 'rasterizer/core/backend.cpp', - 'rasterizer/core/backend_clear.cpp', - 'rasterizer/core/backend_sample.cpp', - 'rasterizer/core/backend_singlesample.cpp', - 'rasterizer/core/backend.h', - 'rasterizer/core/backend_impl.h', - 'rasterizer/core/binner.cpp', - 'rasterizer/core/binner.h', - 'rasterizer/core/blend.h', - 'rasterizer/core/clip.cpp', - 'rasterizer/core/clip.h', - 'rasterizer/core/conservativeRast.h', - 'rasterizer/core/context.h', - 'rasterizer/core/depthstencil.h', - 'rasterizer/core/fifo.hpp', - 'rasterizer/core/format_conversion.h', - 'rasterizer/core/format_traits.h', - 'rasterizer/core/format_types.h', - 'rasterizer/core/format_utils.h', - 'rasterizer/core/frontend.cpp', - 'rasterizer/core/frontend.h', - 'rasterizer/core/knobs.h', - 'rasterizer/core/knobs_init.h', - 'rasterizer/core/multisample.h', - 'rasterizer/core/pa_avx.cpp', - 'rasterizer/core/pa.h', - 'rasterizer/core/rasterizer.cpp', - 'rasterizer/core/rasterizer.h', - 'rasterizer/core/rasterizer_impl.h', - 'rasterizer/core/rdtsc_core.cpp', - 'rasterizer/core/rdtsc_core.h', - 'rasterizer/core/ringbuffer.h', - 'rasterizer/core/state.h', - 'rasterizer/core/state_funcs.h', - 'rasterizer/core/tessellator.h', - 'rasterizer/core/tessellator.hpp', - 'rasterizer/core/tessellator.cpp', - 'rasterizer/core/threads.cpp', - 'rasterizer/core/threads.h', - 'rasterizer/core/tilemgr.cpp', - 'rasterizer/core/tilemgr.h', - 'rasterizer/core/tileset.h', - 'rasterizer/core/utils.h', - 'rasterizer/memory/ClearTile.cpp', - 'rasterizer/memory/Convert.h', - 'rasterizer/memory/LoadTile.cpp', - 'rasterizer/memory/LoadTile.h', - 'rasterizer/memory/LoadTile_Linear.cpp', - 'rasterizer/memory/LoadTile_TileX.cpp', - 'rasterizer/memory/LoadTile_TileY.cpp', - 
'rasterizer/memory/StoreTile.cpp', - 'rasterizer/memory/StoreTile.h', - 'rasterizer/memory/StoreTile_Linear2.cpp', - 'rasterizer/memory/StoreTile_Linear.cpp', - 'rasterizer/memory/StoreTile_TileW.cpp', - 'rasterizer/memory/StoreTile_TileX2.cpp', - 'rasterizer/memory/StoreTile_TileX.cpp', - 'rasterizer/memory/StoreTile_TileY2.cpp', - 'rasterizer/memory/StoreTile_TileY.cpp', - 'rasterizer/memory/TilingFunctions.h', - 'rasterizer/memory/tilingtraits.h', - 'rasterizer/memory/InitMemory.h', - 'rasterizer/memory/InitMemory.cpp', - 'rasterizer/memory/SurfaceState.h' -) - -swr_context_files = files('swr_context.h') -swr_state_files = files('rasterizer/core/state.h') -swr_surf_state_files = files('rasterizer/memory/SurfaceState.h') -swr_event_proto_files = files('rasterizer/archrast/events.proto') -swr_event_pproto_files = files('rasterizer/archrast/events_private.proto') -swr_gen_backend_files = files('rasterizer/codegen/templates/gen_backend.cpp') -swr_gen_rasterizer_files = files('rasterizer/codegen/templates/gen_rasterizer.cpp') -swr_gen_header_init_files = files('rasterizer/codegen/templates/gen_header_init.hpp') - -swr_gen_llvm_ir_macros_py = files('rasterizer/codegen/gen_llvm_ir_macros.py') -swr_gen_backends_py = files('rasterizer/codegen/gen_backends.py') - -swr_gen_builder_depends = files( - 'rasterizer/codegen/templates/gen_builder.hpp', - 'rasterizer/codegen/gen_common.py' - ) - - -subdir('rasterizer/jitter') -subdir('rasterizer/codegen') -subdir('rasterizer/core/backends') - -swr_incs = include_directories( - 'rasterizer/codegen', 'rasterizer/core', 'rasterizer/jitter', - 'rasterizer/archrast', 'rasterizer', -) - -swr_cpp_args = [] -if cpp.has_argument('-fno-strict-aliasing') - swr_cpp_args += '-fno-strict-aliasing' -endif -if cpp.has_argument('-Wno-aligned-new') - swr_cpp_args += '-Wno-aligned-new' -endif - - -swr_arch_libs = [] -swr_defines = [] - -swr_avx_args = cpp.first_supported_argument( - '-target-cpu=sandybridge', '-mavx', '-march=core-avx', 
'-tp=sandybridge', - '/arch:AVX', -) -if swr_avx_args == [] - error('Cannot find AVX support for swr. (these are required for SWR an all architectures.)') -endif - -shared_swr = get_option('shared-swr') -if not shared_swr - if with_swr_arches.length() > 1 - error('When SWR is linked statically only one architecture is allowed.') - endif - swr_defines += '-DHAVE_SWR_BUILTIN' -endif - -if with_swr_arches.contains('skx') - swr_skx_args = cpp.first_supported_argument( - '-march=skylake-avx512', '-target-cpu=x86-skylake', '-xCORE-AVX512', - ) - if swr_skx_args == [] - error('Cannot find SKX support for swr.') - endif - - swr_defines += '-DHAVE_SWR_SKX' - if shared_swr - swr_arch_libs += shared_library( - 'swrSKX', - [files_swr_common, files_swr_arch], - cpp_args : [ - cpp_msvc_compat_args, swr_cpp_args, swr_skx_args, - '-DKNOB_ARCH=KNOB_ARCH_AVX512', - ], - gnu_symbol_visibility : 'hidden', - link_args : [ld_args_gc_sections], - include_directories : [swr_incs], - dependencies : [dep_thread, dep_llvm], - version : '0.0.0', - soversion : host_machine.system() == 'windows' ? '' : '0', - install : true, - name_prefix : host_machine.system() == 'windows' ? 
'' : 'lib', - ) - else - swr_arch_libs += static_library( - 'swrSKX', - [files_swr_common, files_swr_arch], - cpp_args : [ - cpp_msvc_compat_args, swr_cpp_args, swr_skx_args, - '-DKNOB_ARCH=KNOB_ARCH_AVX512', - ], - gnu_symbol_visibility : 'hidden', - link_args : [ld_args_gc_sections], - include_directories : [swr_incs], - dependencies : [dep_thread, dep_llvm], - ) - endif -endif - -if with_swr_arches.contains('knl') - swr_knl_args = cpp.first_supported_argument( - '-march=knl', '-target-cpu=mic-knl', '-xMIC-AVX512', - ) - if swr_knl_args == [] - error('Cannot find KNL support for swr.') - endif - - swr_defines += '-DHAVE_SWR_KNL' - if shared_swr - swr_arch_libs += shared_library( - 'swrKNL', - [files_swr_common, files_swr_arch], - cpp_args : [ - cpp_msvc_compat_args, swr_cpp_args, swr_knl_args, - '-DKNOB_ARCH=KNOB_ARCH_AVX512', '-DSIMD_ARCH_KNIGHTS', - ], - gnu_symbol_visibility : 'hidden', - link_args : [ld_args_gc_sections], - include_directories : [swr_incs], - dependencies : [dep_thread, dep_llvm], - version : '0.0.0', - soversion : host_machine.system() == 'windows' ? '' : '0', - install : true, - name_prefix : host_machine.system() == 'windows' ? 
'' : 'lib', - ) - else - swr_arch_libs += static_library( - 'swrKNL', - [files_swr_common, files_swr_arch], - cpp_args : [ - cpp_msvc_compat_args, swr_cpp_args, swr_knl_args, - '-DKNOB_ARCH=KNOB_ARCH_AVX512', '-DSIMD_ARCH_KNIGHTS', - ], - gnu_symbol_visibility : 'hidden', - link_args : [ld_args_gc_sections], - include_directories : [swr_incs], - dependencies : [dep_thread, dep_llvm], - ) - endif -endif - - -if with_swr_arches.contains('avx2') - swr_avx2_args = cpp.first_supported_argument( - '-target-cpu=haswell', '-march=core-avx2', '-tp=haswell', '/arch:AVX2', - ) - if swr_avx2_args == [] - if cpp.has_argument(['-mavx2', '-mfma', '-mbmi2', '-mf16c']) - swr_avx2_args = ['-mavx2', '-mfma', '-mbmi2', '-mf16c'] - else - error('Cannot find AVX2 support for swr.') - endif - endif - - swr_defines += '-DHAVE_SWR_AVX2' - if shared_swr - swr_arch_libs += shared_library( - 'swrAVX2', - [files_swr_common, files_swr_arch], - cpp_args : [ - cpp_msvc_compat_args, swr_cpp_args, swr_avx2_args, - '-DKNOB_ARCH=KNOB_ARCH_AVX2', - ], - gnu_symbol_visibility : 'hidden', - link_args : [ld_args_gc_sections], - include_directories : [swr_incs], - dependencies : [dep_thread, dep_llvm], - version : '0.0.0', - soversion : host_machine.system() == 'windows' ? '' : '0', - install : true, - name_prefix : host_machine.system() == 'windows' ? 
'' : 'lib', - ) - else - swr_arch_libs += static_library( - 'swrAVX2', - [files_swr_common, files_swr_arch], - cpp_args : [ - cpp_msvc_compat_args, swr_cpp_args, swr_avx2_args, - '-DKNOB_ARCH=KNOB_ARCH_AVX2', - ], - gnu_symbol_visibility : 'hidden', - link_args : [ld_args_gc_sections], - include_directories : [swr_incs], - dependencies : [dep_thread, dep_llvm], - ) - endif -endif - -if with_swr_arches.contains('avx') - swr_defines += '-DHAVE_SWR_AVX' - if shared_swr - swr_arch_libs += shared_library( - 'swrAVX', - [files_swr_common, files_swr_arch], - cpp_args : [ - cpp_msvc_compat_args, swr_cpp_args, swr_avx_args, - '-DKNOB_ARCH=KNOB_ARCH_AVX', - ], - gnu_symbol_visibility : 'hidden', - link_args : [ld_args_gc_sections], - include_directories : [swr_incs], - dependencies : [dep_thread, dep_llvm], - version : '0.0.0', - soversion : host_machine.system() == 'windows' ? '' : '0', - install : true, - name_prefix : host_machine.system() == 'windows' ? '' : 'lib', - ) - else - swr_arch_libs += static_library( - 'swrAVX', - [files_swr_common, files_swr_arch], - cpp_args : [ - cpp_msvc_compat_args, swr_cpp_args, swr_avx_args, - '-DKNOB_ARCH=KNOB_ARCH_AVX', - ], - gnu_symbol_visibility : 'hidden', - link_args : [ld_args_gc_sections], - include_directories : [swr_incs], - dependencies : [dep_thread, dep_llvm], - ) - endif -endif - - -if swr_arch_libs == [] - error('SWR configured, but no SWR architectures configured') -endif - -# The swr_avx_args are needed for intrensic usage in swr api headers. 
-libmesaswr = static_library( - 'mesaswr', - [files_swr_mesa, files_swr_common, gen_knobs_h, gen_knobs_cpp, - gen_builder_hpp, gen_builder_meta_hpp, gen_builder_intrin_hpp], - cpp_args : [ - cpp_msvc_compat_args, swr_cpp_args, swr_avx_args, - swr_defines, - ], - gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, swr_incs], - dependencies : [dep_llvm, idep_mesautil], -) - -link_libs = [libmesaswr] -if not shared_swr - link_libs += swr_arch_libs -endif - -driver_swr = declare_dependency( - compile_args : '-DGALLIUM_SWR', - link_with : link_libs -) diff --git a/src/gallium/drivers/swr/rasterizer/.dir-locals.el b/src/gallium/drivers/swr/rasterizer/.dir-locals.el deleted file mode 100644 index 2b04c18a9bb..00000000000 --- a/src/gallium/drivers/swr/rasterizer/.dir-locals.el +++ /dev/null @@ -1,8 +0,0 @@ -((prog-mode - (c-basic-offset . 4) - (c-file-style . "k&r") - (fill-column . 78) - (indent-tabs-mode . nil) - (show-trailing-whitespace . 
t) - ) - ) diff --git a/src/gallium/drivers/swr/rasterizer/_clang-format b/src/gallium/drivers/swr/rasterizer/_clang-format deleted file mode 100644 index ed4b9b409d8..00000000000 --- a/src/gallium/drivers/swr/rasterizer/_clang-format +++ /dev/null @@ -1,114 +0,0 @@ ---- -Language: Cpp -# BasedOnStyle: LLVM -AccessModifierOffset: -4 -AlignAfterOpenBracket: Align -AlignConsecutiveAssignments: true -AlignConsecutiveDeclarations: true -AlignEscapedNewlines: Left -AlignOperands: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: Inline -AllowShortIfStatementsOnASingleLine: false -AllowShortLoopsOnASingleLine: false -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakTemplateDeclarations: true -BinPackArguments: false -BinPackParameters: false -BraceWrapping: - AfterClass: true - AfterControlStatement: true - AfterEnum: true - AfterFunction: true - AfterNamespace: true - AfterObjCDeclaration: true - AfterStruct: true - AfterUnion: true - #AfterExternBlock: false - BeforeCatch: true - BeforeElse: true - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakBeforeBinaryOperators: None -BreakBeforeBraces: Custom -BreakBeforeInheritanceComma: false -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakConstructorInitializers: AfterColon -BreakAfterJavaFieldAnnotations: false -BreakStringLiterals: true -ColumnLimit: 100 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false -ConstructorInitializerAllOnOneLineOrOnePerLine: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DerivePointerAlignment: false -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -ForEachMacros: - - 
foreach - - Q_FOREACH - - BOOST_FOREACH -#IncludeBlocks: Preserve -IncludeCategories: - - Regex: '^"(llvm|llvm-c|clang|clang-c)/' - Priority: 2 - - Regex: '^(<|"(gtest|gmock|isl|json)/)' - Priority: 3 - - Regex: '.*' - Priority: 1 -IncludeIsMainRegex: '(Test)?$' -IndentCaseLabels: false -#IndentPPDirectives: AfterHash -IndentWidth: 4 -IndentWrappedFunctionNames: false -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: false -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: All -ObjCBlockIndentWidth: 4 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 19 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 60 -PointerAlignment: Left -#RawStringFormats: -# - Delimiter: pb -# Language: TextProto -# BasedOnStyle: google -ReflowComments: true -SortIncludes: false -SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeParens: ControlStatements -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 1 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -Standard: Cpp11 -TabWidth: 4 -UseTab: Never -... diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp deleted file mode 100644 index bcdc6d01358..00000000000 --- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp +++ /dev/null @@ -1,708 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file archrast.cpp - * - * @brief Implementation for archrast. 
- * - ******************************************************************************/ -#include <sys/stat.h> - -#include <atomic> -#include <map> - -#include "common/os.h" -#include "archrast/archrast.h" -#include "archrast/eventmanager.h" -#include "gen_ar_event.hpp" -#include "gen_ar_eventhandlerfile.hpp" - -namespace ArchRast -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief struct that keeps track of depth and stencil event information - struct DepthStencilStats - { - uint32_t earlyZTestPassCount = 0; - uint32_t earlyZTestFailCount = 0; - uint32_t lateZTestPassCount = 0; - uint32_t lateZTestFailCount = 0; - uint32_t earlyStencilTestPassCount = 0; - uint32_t earlyStencilTestFailCount = 0; - uint32_t lateStencilTestPassCount = 0; - uint32_t lateStencilTestFailCount = 0; - }; - - struct CStats - { - uint32_t trivialRejectCount; - uint32_t trivialAcceptCount; - uint32_t mustClipCount; - }; - - struct TEStats - { - uint32_t inputPrims = 0; - //@todo:: Change this to numPatches. Assumed: 1 patch per prim. If holds, its fine. - }; - - struct GSStateInfo - { - uint32_t inputPrimCount; - uint32_t primGeneratedCount; - uint32_t vertsInput; - }; - - struct RastStats - { - uint32_t rasterTiles = 0; - }; - - struct CullStats - { - uint32_t degeneratePrimCount = 0; - uint32_t backfacePrimCount = 0; - }; - - struct AlphaStats - { - uint32_t alphaTestCount = 0; - uint32_t alphaBlendCount = 0; - }; - - - ////////////////////////////////////////////////////////////////////////// - /// @brief Event handler that handles API thread events. This is shared - /// between the API and its caller (e.g. driver shim) but typically - /// there is only a single API thread per context. So you can save - /// information in the class to be used for other events. 
- class EventHandlerApiStats : public EventHandlerFile - { - public: - EventHandlerApiStats(uint32_t id) : EventHandlerFile(id) - { -#if defined(_WIN32) - // Attempt to copy the events.proto file to the ArchRast output dir. It's common for - // tools to place the events.proto file in the DEBUG_OUTPUT_DIR when launching AR. If it - // exists, this will attempt to copy it the first time we get here to package it with - // the stats. Otherwise, the user would need to specify the events.proto location when - // parsing the stats in post. - std::stringstream eventsProtoSrcFilename, eventsProtoDstFilename; - eventsProtoSrcFilename << KNOB_DEBUG_OUTPUT_DIR << "\\events.proto" << std::ends; - eventsProtoDstFilename << mOutputDir.substr(0, mOutputDir.size() - 1) - << "\\events.proto" << std::ends; - - // If event.proto already exists, we're done; else do the copy - struct stat buf; // Use a Posix stat for file existence check - if (!stat(eventsProtoDstFilename.str().c_str(), &buf) == 0) - { - // Now check to make sure the events.proto source exists - if (stat(eventsProtoSrcFilename.str().c_str(), &buf) == 0) - { - std::ifstream srcFile; - srcFile.open(eventsProtoSrcFilename.str().c_str(), std::ios::binary); - if (srcFile.is_open()) - { - // Just do a binary buffer copy - std::ofstream dstFile; - dstFile.open(eventsProtoDstFilename.str().c_str(), std::ios::binary); - dstFile << srcFile.rdbuf(); - dstFile.close(); - } - srcFile.close(); - } - } -#endif - } - - virtual void Handle(const DrawInstancedEvent& event) - { - DrawInfoEvent e(event.data.drawId, - ArchRast::Instanced, - event.data.topology, - event.data.numVertices, - 0, - 0, - event.data.startVertex, - event.data.numInstances, - event.data.startInstance, - event.data.tsEnable, - event.data.gsEnable, - event.data.soEnable, - event.data.soTopology, - event.data.splitId); - - EventHandlerFile::Handle(e); - } - - virtual void Handle(const DrawIndexedInstancedEvent& event) - { - DrawInfoEvent e(event.data.drawId, - 
ArchRast::IndexedInstanced, - event.data.topology, - 0, - event.data.numIndices, - event.data.indexOffset, - event.data.baseVertex, - event.data.numInstances, - event.data.startInstance, - event.data.tsEnable, - event.data.gsEnable, - event.data.soEnable, - event.data.soTopology, - event.data.splitId); - - EventHandlerFile::Handle(e); - } - }; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Event handler that handles worker thread events. There is one - /// event handler per thread. The python script will need to sum - /// up counters across all of the threads. - class EventHandlerWorkerStats : public EventHandlerFile - { - public: - EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), mNeedFlush(false) - { - memset(mShaderStats, 0, sizeof(mShaderStats)); - } - - virtual void Handle(const EarlyDepthStencilInfoSingleSample& event) - { - // earlyZ test compute - mDSSingleSample.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSSingleSample.earlyZTestFailCount += - _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - - // earlyStencil test compute - mDSSingleSample.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSSingleSample.earlyStencilTestFailCount += - _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); - - // earlyZ test single and multi sample - mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSCombined.earlyZTestFailCount += - _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - - // earlyStencil test single and multi sample - mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSCombined.earlyStencilTestFailCount += - _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); - - mNeedFlush = true; - } - - virtual void Handle(const EarlyDepthStencilInfoSampleRate& event) - { - // earlyZ test compute - 
mDSSampleRate.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSSampleRate.earlyZTestFailCount += - _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - - // earlyStencil test compute - mDSSampleRate.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSSampleRate.earlyStencilTestFailCount += - _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); - - // earlyZ test single and multi sample - mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSCombined.earlyZTestFailCount += - _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - - // earlyStencil test single and multi sample - mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSCombined.earlyStencilTestFailCount += - _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); - - mNeedFlush = true; - } - - virtual void Handle(const EarlyDepthStencilInfoNullPS& event) - { - // earlyZ test compute - mDSNullPS.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSNullPS.earlyZTestFailCount += - _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - - // earlyStencil test compute - mDSNullPS.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSNullPS.earlyStencilTestFailCount += - _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); - mNeedFlush = true; - } - - virtual void Handle(const LateDepthStencilInfoSingleSample& event) - { - // lateZ test compute - mDSSingleSample.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSSingleSample.lateZTestFailCount += - _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - - // lateStencil test compute - mDSSingleSample.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSSingleSample.lateStencilTestFailCount += - _mm_popcnt_u32((!event.data.stencilPassMask) & 
event.data.coverageMask); - - // lateZ test single and multi sample - mDSCombined.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSCombined.lateZTestFailCount += - _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - - // lateStencil test single and multi sample - mDSCombined.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSCombined.lateStencilTestFailCount += - _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); - - mNeedFlush = true; - } - - virtual void Handle(const LateDepthStencilInfoSampleRate& event) - { - // lateZ test compute - mDSSampleRate.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSSampleRate.lateZTestFailCount += - _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - - // lateStencil test compute - mDSSampleRate.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSSampleRate.lateStencilTestFailCount += - _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); - - // lateZ test single and multi sample - mDSCombined.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSCombined.lateZTestFailCount += - _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - - // lateStencil test single and multi sample - mDSCombined.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSCombined.lateStencilTestFailCount += - _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); - - mNeedFlush = true; - } - - virtual void Handle(const LateDepthStencilInfoNullPS& event) - { - // lateZ test compute - mDSNullPS.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask); - mDSNullPS.lateZTestFailCount += - _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask); - - // lateStencil test compute - mDSNullPS.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask); - mDSNullPS.lateStencilTestFailCount += 
- _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask); - mNeedFlush = true; - } - - virtual void Handle(const EarlyDepthInfoPixelRate& event) - { - // earlyZ test compute - mDSPixelRate.earlyZTestPassCount += event.data.depthPassCount; - mDSPixelRate.earlyZTestFailCount += - (_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount); - mNeedFlush = true; - } - - - virtual void Handle(const LateDepthInfoPixelRate& event) - { - // lateZ test compute - mDSPixelRate.lateZTestPassCount += event.data.depthPassCount; - mDSPixelRate.lateZTestFailCount += - (_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount); - mNeedFlush = true; - } - - - virtual void Handle(const ClipInfoEvent& event) - { - mClipper.mustClipCount += _mm_popcnt_u32(event.data.clipMask); - mClipper.trivialRejectCount += - event.data.numInvocations - _mm_popcnt_u32(event.data.validMask); - mClipper.trivialAcceptCount += - _mm_popcnt_u32(event.data.validMask & ~event.data.clipMask); - } - - void UpdateStats(SWR_SHADER_STATS* pStatTotals, const SWR_SHADER_STATS* pStatUpdate) - { - pStatTotals->numInstExecuted += pStatUpdate->numInstExecuted; - pStatTotals->numSampleExecuted += pStatUpdate->numSampleExecuted; - pStatTotals->numSampleLExecuted += pStatUpdate->numSampleLExecuted; - pStatTotals->numSampleBExecuted += pStatUpdate->numSampleBExecuted; - pStatTotals->numSampleCExecuted += pStatUpdate->numSampleCExecuted; - pStatTotals->numSampleCLZExecuted += pStatUpdate->numSampleCLZExecuted; - pStatTotals->numSampleCDExecuted += pStatUpdate->numSampleCDExecuted; - pStatTotals->numGather4Executed += pStatUpdate->numGather4Executed; - pStatTotals->numGather4CExecuted += pStatUpdate->numGather4CExecuted; - pStatTotals->numGather4CPOExecuted += pStatUpdate->numGather4CPOExecuted; - pStatTotals->numGather4CPOCExecuted += pStatUpdate->numGather4CPOCExecuted; - pStatTotals->numLodExecuted += pStatUpdate->numLodExecuted; - } - - virtual void Handle(const VSStats& event) - { - 
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats; - UpdateStats(&mShaderStats[SHADER_VERTEX], pStats); - } - - virtual void Handle(const GSStats& event) - { - SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats; - UpdateStats(&mShaderStats[SHADER_GEOMETRY], pStats); - } - - virtual void Handle(const DSStats& event) - { - SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats; - UpdateStats(&mShaderStats[SHADER_DOMAIN], pStats); - } - - virtual void Handle(const HSStats& event) - { - SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats; - UpdateStats(&mShaderStats[SHADER_HULL], pStats); - } - - virtual void Handle(const PSStats& event) - { - SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats; - UpdateStats(&mShaderStats[SHADER_PIXEL], pStats); - mNeedFlush = true; - } - - virtual void Handle(const CSStats& event) - { - SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats; - UpdateStats(&mShaderStats[SHADER_COMPUTE], pStats); - mNeedFlush = true; - } - - // Flush cached events for this draw - virtual void FlushDraw(uint32_t drawId) - { - if (mNeedFlush == false) - return; - - EventHandlerFile::Handle(PSInfo(drawId, - mShaderStats[SHADER_PIXEL].numInstExecuted, - mShaderStats[SHADER_PIXEL].numSampleExecuted, - mShaderStats[SHADER_PIXEL].numSampleLExecuted, - mShaderStats[SHADER_PIXEL].numSampleBExecuted, - mShaderStats[SHADER_PIXEL].numSampleCExecuted, - mShaderStats[SHADER_PIXEL].numSampleCLZExecuted, - mShaderStats[SHADER_PIXEL].numSampleCDExecuted, - mShaderStats[SHADER_PIXEL].numGather4Executed, - mShaderStats[SHADER_PIXEL].numGather4CExecuted, - mShaderStats[SHADER_PIXEL].numGather4CPOExecuted, - mShaderStats[SHADER_PIXEL].numGather4CPOCExecuted, - mShaderStats[SHADER_PIXEL].numLodExecuted)); - EventHandlerFile::Handle(CSInfo(drawId, - mShaderStats[SHADER_COMPUTE].numInstExecuted, - mShaderStats[SHADER_COMPUTE].numSampleExecuted, - mShaderStats[SHADER_COMPUTE].numSampleLExecuted, - 
mShaderStats[SHADER_COMPUTE].numSampleBExecuted, - mShaderStats[SHADER_COMPUTE].numSampleCExecuted, - mShaderStats[SHADER_COMPUTE].numSampleCLZExecuted, - mShaderStats[SHADER_COMPUTE].numSampleCDExecuted, - mShaderStats[SHADER_COMPUTE].numGather4Executed, - mShaderStats[SHADER_COMPUTE].numGather4CExecuted, - mShaderStats[SHADER_COMPUTE].numGather4CPOExecuted, - mShaderStats[SHADER_COMPUTE].numGather4CPOCExecuted, - mShaderStats[SHADER_COMPUTE].numLodExecuted)); - - // singleSample - EventHandlerFile::Handle(EarlyZSingleSample( - drawId, mDSSingleSample.earlyZTestPassCount, mDSSingleSample.earlyZTestFailCount)); - EventHandlerFile::Handle(LateZSingleSample( - drawId, mDSSingleSample.lateZTestPassCount, mDSSingleSample.lateZTestFailCount)); - EventHandlerFile::Handle( - EarlyStencilSingleSample(drawId, - mDSSingleSample.earlyStencilTestPassCount, - mDSSingleSample.earlyStencilTestFailCount)); - EventHandlerFile::Handle( - LateStencilSingleSample(drawId, - mDSSingleSample.lateStencilTestPassCount, - mDSSingleSample.lateStencilTestFailCount)); - - // sampleRate - EventHandlerFile::Handle(EarlyZSampleRate( - drawId, mDSSampleRate.earlyZTestPassCount, mDSSampleRate.earlyZTestFailCount)); - EventHandlerFile::Handle(LateZSampleRate( - drawId, mDSSampleRate.lateZTestPassCount, mDSSampleRate.lateZTestFailCount)); - EventHandlerFile::Handle( - EarlyStencilSampleRate(drawId, - mDSSampleRate.earlyStencilTestPassCount, - mDSSampleRate.earlyStencilTestFailCount)); - EventHandlerFile::Handle(LateStencilSampleRate(drawId, - mDSSampleRate.lateStencilTestPassCount, - mDSSampleRate.lateStencilTestFailCount)); - - // combined - EventHandlerFile::Handle( - EarlyZ(drawId, mDSCombined.earlyZTestPassCount, mDSCombined.earlyZTestFailCount)); - EventHandlerFile::Handle( - LateZ(drawId, mDSCombined.lateZTestPassCount, mDSCombined.lateZTestFailCount)); - EventHandlerFile::Handle(EarlyStencil(drawId, - mDSCombined.earlyStencilTestPassCount, - mDSCombined.earlyStencilTestFailCount)); - 
EventHandlerFile::Handle(LateStencil(drawId, - mDSCombined.lateStencilTestPassCount, - mDSCombined.lateStencilTestFailCount)); - - // pixelRate - EventHandlerFile::Handle(EarlyZPixelRate( - drawId, mDSPixelRate.earlyZTestPassCount, mDSPixelRate.earlyZTestFailCount)); - EventHandlerFile::Handle(LateZPixelRate( - drawId, mDSPixelRate.lateZTestPassCount, mDSPixelRate.lateZTestFailCount)); - - - // NullPS - EventHandlerFile::Handle( - EarlyZNullPS(drawId, mDSNullPS.earlyZTestPassCount, mDSNullPS.earlyZTestFailCount)); - EventHandlerFile::Handle(EarlyStencilNullPS( - drawId, mDSNullPS.earlyStencilTestPassCount, mDSNullPS.earlyStencilTestFailCount)); - - // Rasterized Subspans - EventHandlerFile::Handle(RasterTiles(drawId, rastStats.rasterTiles)); - - // Alpha Subspans - EventHandlerFile::Handle( - AlphaEvent(drawId, mAlphaStats.alphaTestCount, mAlphaStats.alphaBlendCount)); - - // Primitive Culling - EventHandlerFile::Handle( - CullEvent(drawId, mCullStats.backfacePrimCount, mCullStats.degeneratePrimCount)); - - mDSSingleSample = {}; - mDSSampleRate = {}; - mDSCombined = {}; - mDSPixelRate = {}; - mDSNullPS = {}; - - rastStats = {}; - mCullStats = {}; - mAlphaStats = {}; - - mShaderStats[SHADER_PIXEL] = {}; - mShaderStats[SHADER_COMPUTE] = {}; - - mNeedFlush = false; - } - - virtual void Handle(const FrontendDrawEndEvent& event) - { - // Clipper - EventHandlerFile::Handle(ClipperEvent(event.data.drawId, - mClipper.trivialRejectCount, - mClipper.trivialAcceptCount, - mClipper.mustClipCount)); - - // Tesselator - EventHandlerFile::Handle(TessPrims(event.data.drawId, mTS.inputPrims)); - - // Geometry Shader - EventHandlerFile::Handle(GSInputPrims(event.data.drawId, mGS.inputPrimCount)); - EventHandlerFile::Handle(GSPrimsGen(event.data.drawId, mGS.primGeneratedCount)); - EventHandlerFile::Handle(GSVertsInput(event.data.drawId, mGS.vertsInput)); - - EventHandlerFile::Handle(VSInfo(event.data.drawId, - mShaderStats[SHADER_VERTEX].numInstExecuted, - 
mShaderStats[SHADER_VERTEX].numSampleExecuted, - mShaderStats[SHADER_VERTEX].numSampleLExecuted, - mShaderStats[SHADER_VERTEX].numSampleBExecuted, - mShaderStats[SHADER_VERTEX].numSampleCExecuted, - mShaderStats[SHADER_VERTEX].numSampleCLZExecuted, - mShaderStats[SHADER_VERTEX].numSampleCDExecuted, - mShaderStats[SHADER_VERTEX].numGather4Executed, - mShaderStats[SHADER_VERTEX].numGather4CExecuted, - mShaderStats[SHADER_VERTEX].numGather4CPOExecuted, - mShaderStats[SHADER_VERTEX].numGather4CPOCExecuted, - mShaderStats[SHADER_VERTEX].numLodExecuted)); - EventHandlerFile::Handle(HSInfo(event.data.drawId, - mShaderStats[SHADER_HULL].numInstExecuted, - mShaderStats[SHADER_HULL].numSampleExecuted, - mShaderStats[SHADER_HULL].numSampleLExecuted, - mShaderStats[SHADER_HULL].numSampleBExecuted, - mShaderStats[SHADER_HULL].numSampleCExecuted, - mShaderStats[SHADER_HULL].numSampleCLZExecuted, - mShaderStats[SHADER_HULL].numSampleCDExecuted, - mShaderStats[SHADER_HULL].numGather4Executed, - mShaderStats[SHADER_HULL].numGather4CExecuted, - mShaderStats[SHADER_HULL].numGather4CPOExecuted, - mShaderStats[SHADER_HULL].numGather4CPOCExecuted, - mShaderStats[SHADER_HULL].numLodExecuted)); - EventHandlerFile::Handle(DSInfo(event.data.drawId, - mShaderStats[SHADER_DOMAIN].numInstExecuted, - mShaderStats[SHADER_DOMAIN].numSampleExecuted, - mShaderStats[SHADER_DOMAIN].numSampleLExecuted, - mShaderStats[SHADER_DOMAIN].numSampleBExecuted, - mShaderStats[SHADER_DOMAIN].numSampleCExecuted, - mShaderStats[SHADER_DOMAIN].numSampleCLZExecuted, - mShaderStats[SHADER_DOMAIN].numSampleCDExecuted, - mShaderStats[SHADER_DOMAIN].numGather4Executed, - mShaderStats[SHADER_DOMAIN].numGather4CExecuted, - mShaderStats[SHADER_DOMAIN].numGather4CPOExecuted, - mShaderStats[SHADER_DOMAIN].numGather4CPOCExecuted, - mShaderStats[SHADER_DOMAIN].numLodExecuted)); - EventHandlerFile::Handle(GSInfo(event.data.drawId, - mShaderStats[SHADER_GEOMETRY].numInstExecuted, - 
mShaderStats[SHADER_GEOMETRY].numSampleExecuted, - mShaderStats[SHADER_GEOMETRY].numSampleLExecuted, - mShaderStats[SHADER_GEOMETRY].numSampleBExecuted, - mShaderStats[SHADER_GEOMETRY].numSampleCExecuted, - mShaderStats[SHADER_GEOMETRY].numSampleCLZExecuted, - mShaderStats[SHADER_GEOMETRY].numSampleCDExecuted, - mShaderStats[SHADER_GEOMETRY].numGather4Executed, - mShaderStats[SHADER_GEOMETRY].numGather4CExecuted, - mShaderStats[SHADER_GEOMETRY].numGather4CPOExecuted, - mShaderStats[SHADER_GEOMETRY].numGather4CPOCExecuted, - mShaderStats[SHADER_GEOMETRY].numLodExecuted)); - - mShaderStats[SHADER_VERTEX] = {}; - mShaderStats[SHADER_HULL] = {}; - mShaderStats[SHADER_DOMAIN] = {}; - mShaderStats[SHADER_GEOMETRY] = {}; - - // Reset Internal Counters - mClipper = {}; - mTS = {}; - mGS = {}; - } - - virtual void Handle(const GSPrimInfo& event) - { - mGS.inputPrimCount += event.data.inputPrimCount; - mGS.primGeneratedCount += event.data.primGeneratedCount; - mGS.vertsInput += event.data.vertsInput; - } - - virtual void Handle(const TessPrimCount& event) { mTS.inputPrims += event.data.primCount; } - - virtual void Handle(const RasterTileCount& event) - { - rastStats.rasterTiles += event.data.rasterTiles; - } - - virtual void Handle(const CullInfoEvent& event) - { - mCullStats.degeneratePrimCount += _mm_popcnt_u32( - event.data.validMask ^ (event.data.validMask & ~event.data.degeneratePrimMask)); - mCullStats.backfacePrimCount += _mm_popcnt_u32( - event.data.validMask ^ (event.data.validMask & ~event.data.backfacePrimMask)); - } - - virtual void Handle(const AlphaInfoEvent& event) - { - mAlphaStats.alphaTestCount += event.data.alphaTestEnable; - mAlphaStats.alphaBlendCount += event.data.alphaBlendEnable; - } - - protected: - bool mNeedFlush; - // Per draw stats - DepthStencilStats mDSSingleSample = {}; - DepthStencilStats mDSSampleRate = {}; - DepthStencilStats mDSPixelRate = {}; - DepthStencilStats mDSCombined = {}; - DepthStencilStats mDSNullPS = {}; - DepthStencilStats 
mDSOmZ = {}; - CStats mClipper = {}; - TEStats mTS = {}; - GSStateInfo mGS = {}; - RastStats rastStats = {}; - CullStats mCullStats = {}; - AlphaStats mAlphaStats = {}; - - SWR_SHADER_STATS mShaderStats[NUM_SHADER_TYPES]; - - }; - - static EventManager* FromHandle(HANDLE hThreadContext) - { - return reinterpret_cast<EventManager*>(hThreadContext); - } - - // Construct an event manager and associate a handler with it. - HANDLE CreateThreadContext(AR_THREAD type) - { - // Can we assume single threaded here? - static std::atomic<uint32_t> counter(0); - uint32_t id = counter.fetch_add(1); - - EventManager* pManager = new EventManager(); - - if (pManager) - { - EventHandlerFile* pHandler = nullptr; - - if (type == AR_THREAD::API) - { - pHandler = new EventHandlerApiStats(id); - pManager->Attach(pHandler); - pHandler->Handle(ThreadStartApiEvent()); - } - else - { - pHandler = new EventHandlerWorkerStats(id); - pManager->Attach(pHandler); - pHandler->Handle(ThreadStartWorkerEvent()); - } - - pHandler->MarkHeader(); - - return pManager; - } - - SWR_INVALID("Failed to register thread."); - return nullptr; - } - - void DestroyThreadContext(HANDLE hThreadContext) - { - EventManager* pManager = FromHandle(hThreadContext); - SWR_ASSERT(pManager != nullptr); - - delete pManager; - } - - // Dispatch event for this thread. - void Dispatch(HANDLE hThreadContext, const Event& event) - { - if (event.IsEnabled()) - { - EventManager* pManager = reinterpret_cast<EventManager*>(hThreadContext); - SWR_ASSERT(pManager != nullptr); - pManager->Dispatch(event); - } - } - - // Flush for this thread. 
- void FlushDraw(HANDLE hThreadContext, uint32_t drawId) - { - EventManager* pManager = FromHandle(hThreadContext); - SWR_ASSERT(pManager != nullptr); - - pManager->FlushDraw(drawId); - } -} // namespace ArchRast diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h b/src/gallium/drivers/swr/rasterizer/archrast/archrast.h deleted file mode 100644 index a247443f54b..00000000000 --- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h +++ /dev/null @@ -1,49 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file archrast.h - * - * @brief Definitions for archrast. 
- * - ******************************************************************************/ -#pragma once - -#include "common/os.h" -#include "gen_ar_event.hpp" -#include "eventmanager.h" - -namespace ArchRast -{ - enum class AR_THREAD - { - API = 0, - WORKER = 1 - }; - - HANDLE CreateThreadContext(AR_THREAD type); - void DestroyThreadContext(HANDLE hThreadContext); - - // Dispatch event for this thread. - void Dispatch(HANDLE hThreadContext, const Event& event); - - void FlushDraw(HANDLE hThreadContext, uint32_t drawId); -}; // namespace ArchRast diff --git a/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h b/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h deleted file mode 100644 index 118a100e850..00000000000 --- a/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h +++ /dev/null @@ -1,88 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file archrast.h - * - * @brief Definitions for the event manager. - * - ******************************************************************************/ -#pragma once - -#include "common/os.h" - -#include "gen_ar_event.hpp" -#include "gen_ar_eventhandler.hpp" - -#include <vector> - -namespace ArchRast -{ - ////////////////////////////////////////////////////////////////////////// - /// EventManager - interface to dispatch events to handlers. - /// Event handling occurs only on a single thread. - ////////////////////////////////////////////////////////////////////////// - class EventManager - { - public: - EventManager() {} - - ~EventManager() - { - // Event manager owns destroying handler objects once attached. - ///@note See comment for Detach. - for (auto pHandler : mHandlers) - { - delete pHandler; - } - } - - void Attach(EventHandler* pHandler) - { - SWR_ASSERT(pHandler != nullptr); - mHandlers.push_back(pHandler); - } - - void Dispatch(const Event& event) - { - ///@todo Add event filter check here. 
- - for (auto pHandler : mHandlers) - { - event.Accept(pHandler); - } - } - - void FlushDraw(uint32_t drawId) - { - for (auto pHandler : mHandlers) - { - pHandler->FlushDraw(drawId); - } - } - - private: - // Handlers stay registered for life - void Detach(EventHandler* pHandler) { SWR_INVALID("Should not be called"); } - - std::vector<EventHandler*> mHandlers; - }; -}; // namespace ArchRast diff --git a/src/gallium/drivers/swr/rasterizer/archrast/events.proto b/src/gallium/drivers/swr/rasterizer/archrast/events.proto deleted file mode 100644 index 24739293a30..00000000000 --- a/src/gallium/drivers/swr/rasterizer/archrast/events.proto +++ /dev/null @@ -1,427 +0,0 @@ -# Copyright (C) 2016 Intel Corporation. All Rights Reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice (including the next -# paragraph) shall be included in all copies or substantial portions of the -# Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. -# -# Provides definitions for events. 
- -enum AR_DRAW_TYPE -{ - Instanced = 0, - IndexedInstanced = 1, - InstancedSplit = 2, - IndexedInstancedSplit = 3 -}; - -event Framework::ThreadStartApiEvent -{ -}; - -event Framework::ThreadStartWorkerEvent -{ -}; - -///@brief Used as a helper event to indicate end of frame. Does not guarantee to capture end of frame on all APIs -event ApiSwr::FrameEndEvent -{ - uint32_t frameId; // current frame id - uint32_t nextDrawId; // next draw id (always incremental - does not reset) -}; - -///@brief Synchronization event. -event ApiSwr::SwrSyncEvent -{ - uint32_t drawId; -}; - -///@brief Invalidate hot tiles (i.e. tile cache) -event ApiSwr::SwrInvalidateTilesEvent -{ - uint32_t drawId; -}; - -///@brief Invalidate and discard hot tiles within pixel region -event ApiSwr::SwrDiscardRectEvent -{ - uint32_t drawId; -}; - -///@brief Flush tiles out to memory that is typically owned by driver (e.g. Flush RT cache) -event ApiSwr::SwrStoreTilesEvent -{ - uint32_t drawId; -}; - -event PipelineStats::DrawInfoEvent -{ - uint32_t drawId; - AR_DRAW_TYPE type; // type of draw (indexed, instanced, etc) - uint32_t topology; // topology of draw - uint32_t numVertices; // number of vertices for draw - uint32_t numIndices; // number of indices for draw - int32_t indexOffset; // offset into index buffer - int32_t baseVertex; // which vertex to start with - uint32_t numInstances; // number of instances to draw - uint32_t startInstance; // which instance to start fetching - uint32_t tsEnable; // tesselation enabled - uint32_t gsEnable; // geometry shader enabled - uint32_t soEnable; // stream-out enabled - uint32_t soTopology; // topology of stream-out - uint32_t splitId; // split draw count or id -}; - -event PipelineStats::DispatchEvent -{ - uint32_t drawId; - uint32_t threadGroupCountX; // num thread groups in X dimension - uint32_t threadGroupCountY; // num thread groups in Y dimension - uint32_t threadGroupCountZ; // num thread groups in Z dimension -}; - -event 
PipelineStats::FrontendStatsEvent -{ - uint32_t drawId; - uint64_t IaVertices; - uint64_t IaPrimitives; - uint64_t VsInvocations; - uint64_t HsInvocations; - uint64_t DsInvocations; - uint64_t GsInvocations; - uint64_t GsPrimitives; - uint64_t CInvocations; - uint64_t CPrimitives; - uint64_t SoPrimStorageNeeded0; - uint64_t SoPrimStorageNeeded1; - uint64_t SoPrimStorageNeeded2; - uint64_t SoPrimStorageNeeded3; - uint64_t SoNumPrimsWritten0; - uint64_t SoNumPrimsWritten1; - uint64_t SoNumPrimsWritten2; - uint64_t SoNumPrimsWritten3; -}; - -event PipelineStats::BackendStatsEvent -{ - uint32_t drawId; - uint64_t DepthPassCount; - uint64_t PsInvocations; - uint64_t CsInvocations; - -}; - -event PipelineStats::EarlyZSingleSample -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::LateZSingleSample -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::EarlyStencilSingleSample -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::LateStencilSingleSample -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::EarlyZSampleRate -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::LateZSampleRate -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::EarlyStencilSampleRate -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::LateStencilSampleRate -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -// Total Early-Z counts, SingleSample and SampleRate -event PipelineStats::EarlyZ -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -// Total LateZ counts, SingleSample and SampleRate -event PipelineStats::LateZ -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -// Total EarlyStencil counts, SingleSample and SampleRate -event 
PipelineStats::EarlyStencil -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -// Total LateStencil counts, SingleSample and SampleRate -event PipelineStats::LateStencil -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::EarlyZNullPS -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::EarlyStencilNullPS -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::EarlyZPixelRate -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::LateZPixelRate -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - - -event PipelineStats::EarlyOmZ -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::EarlyOmStencil -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::LateOmZ -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::LateOmStencil -{ - uint32_t drawId; - uint64_t passCount; - uint64_t failCount; -}; - -event PipelineStats::GSInputPrims -{ - uint32_t drawId; - uint64_t inputPrimCount; -}; - -event PipelineStats::GSPrimsGen -{ - uint32_t drawId; - uint64_t primGeneratedCount; -}; - -event PipelineStats::GSVertsInput -{ - uint32_t drawId; - uint64_t vertsInput; -}; - -event PipelineStats::TessPrims -{ - uint32_t drawId; - uint64_t primCount; -}; - -event PipelineStats::RasterTiles -{ - uint32_t drawId; - uint32_t rastTileCount; -}; - -event PipelineStats::ClipperEvent -{ - uint32_t drawId; - uint32_t trivialRejectCount; - uint32_t trivialAcceptCount; - uint32_t mustClipCount; -}; - -event PipelineStats::CullEvent -{ - uint32_t drawId; - uint64_t backfacePrimCount; - uint64_t degeneratePrimCount; -}; - -event PipelineStats::AlphaEvent -{ - uint32_t drawId; - uint32_t alphaTestCount; - uint32_t alphaBlendCount; -}; - -event ShaderStats::VSInfo -{ - uint32_t 
drawId; - uint32_t numInstExecuted; - uint32_t numSampleExecuted; - uint32_t numSampleLExecuted; - uint32_t numSampleBExecuted; - uint32_t numSampleCExecuted; - uint32_t numSampleCLZExecuted; - uint32_t numSampleCDExecuted; - uint32_t numGather4Executed; - uint32_t numGather4CExecuted; - uint32_t numGather4CPOExecuted; - uint32_t numGather4CPOCExecuted; - uint32_t numLodExecuted; -}; - -event ShaderStats::HSInfo -{ - uint32_t drawId; - uint32_t numInstExecuted; - uint32_t numSampleExecuted; - uint32_t numSampleLExecuted; - uint32_t numSampleBExecuted; - uint32_t numSampleCExecuted; - uint32_t numSampleCLZExecuted; - uint32_t numSampleCDExecuted; - uint32_t numGather4Executed; - uint32_t numGather4CExecuted; - uint32_t numGather4CPOExecuted; - uint32_t numGather4CPOCExecuted; - uint32_t numLodExecuted; -}; - -event ShaderStats::DSInfo -{ - uint32_t drawId; - uint32_t numInstExecuted; - uint32_t numSampleExecuted; - uint32_t numSampleLExecuted; - uint32_t numSampleBExecuted; - uint32_t numSampleCExecuted; - uint32_t numSampleCLZExecuted; - uint32_t numSampleCDExecuted; - uint32_t numGather4Executed; - uint32_t numGather4CExecuted; - uint32_t numGather4CPOExecuted; - uint32_t numGather4CPOCExecuted; - uint32_t numLodExecuted; -}; - -event ShaderStats::GSInfo -{ - uint32_t drawId; - uint32_t numInstExecuted; - uint32_t numSampleExecuted; - uint32_t numSampleLExecuted; - uint32_t numSampleBExecuted; - uint32_t numSampleCExecuted; - uint32_t numSampleCLZExecuted; - uint32_t numSampleCDExecuted; - uint32_t numGather4Executed; - uint32_t numGather4CExecuted; - uint32_t numGather4CPOExecuted; - uint32_t numGather4CPOCExecuted; - uint32_t numLodExecuted; - -}; - -event ShaderStats::PSInfo -{ - uint32_t drawId; - uint32_t numInstExecuted; - uint32_t numSampleExecuted; - uint32_t numSampleLExecuted; - uint32_t numSampleBExecuted; - uint32_t numSampleCExecuted; - uint32_t numSampleCLZExecuted; - uint32_t numSampleCDExecuted; - uint32_t numGather4Executed; - uint32_t 
numGather4CExecuted; - uint32_t numGather4CPOExecuted; - uint32_t numGather4CPOCExecuted; - uint32_t numLodExecuted; -}; - -event ShaderStats::CSInfo -{ - uint32_t drawId; - uint32_t numInstExecuted; - uint32_t numSampleExecuted; - uint32_t numSampleLExecuted; - uint32_t numSampleBExecuted; - uint32_t numSampleCExecuted; - uint32_t numSampleCLZExecuted; - uint32_t numSampleCDExecuted; - uint32_t numGather4Executed; - uint32_t numGather4CExecuted; - uint32_t numGather4CPOExecuted; - uint32_t numGather4CPOCExecuted; - uint32_t numLodExecuted; -}; - diff --git a/src/gallium/drivers/swr/rasterizer/archrast/events_private.proto b/src/gallium/drivers/swr/rasterizer/archrast/events_private.proto deleted file mode 100644 index b57d5c4284f..00000000000 --- a/src/gallium/drivers/swr/rasterizer/archrast/events_private.proto +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (C) 2018 Intel Corporation. All Rights Reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice (including the next -# paragraph) shall be included in all copies or substantial portions of the -# Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. -# -# Provides definitions for private internal events that are only used internally -# to rasty for communicating information between Rasty and Archrast. One goal for -# ArchRast is to not pollute the Rasty code with lots of calculations, etc. that -# are needed to compute per draw statistics, etc. - -event PipelineStats::EarlyDepthStencilInfoSingleSample -{ - uint64_t depthPassMask; - uint64_t stencilPassMask; - uint64_t coverageMask; -}; - -event PipelineStats::EarlyDepthStencilInfoSampleRate -{ - uint64_t depthPassMask; - uint64_t stencilPassMask; - uint64_t coverageMask; -}; - -event PipelineStats::EarlyDepthStencilInfoNullPS -{ - uint64_t depthPassMask; - uint64_t stencilPassMask; - uint64_t coverageMask; -}; - -event PipelineStats::LateDepthStencilInfoSingleSample -{ - uint64_t depthPassMask; - uint64_t stencilPassMask; - uint64_t coverageMask; -}; - -event PipelineStats::LateDepthStencilInfoSampleRate -{ - uint64_t depthPassMask; - uint64_t stencilPassMask; - uint64_t coverageMask; -}; - -event PipelineStats::LateDepthStencilInfoNullPS -{ - uint64_t depthPassMask; - uint64_t stencilPassMask; - uint64_t coverageMask; -}; - -event PipelineStats::EarlyDepthInfoPixelRate -{ - uint64_t depthPassCount; - uint64_t activeLanes; -}; - - -event PipelineStats::LateDepthInfoPixelRate -{ - uint64_t depthPassCount; - uint64_t activeLanes; -}; - - -event PipelineStats::BackendDrawEndEvent -{ - uint32_t drawId; -}; - -event PipelineStats::FrontendDrawEndEvent -{ - uint32_t drawId; -}; - -event Memory::MemoryAccessEvent -{ - uint32_t drawId; - uint64_t tsc; - uint64_t ptr; - uint32_t size; - uint8_t isRead; - uint8_t client; -}; - -event Memory::MemoryStatsEndEvent -{ - uint32_t drawId; -}; - -event 
PipelineStats::TessPrimCount -{ - uint64_t primCount; -}; - -event PipelineStats::RasterTileCount -{ - uint32_t drawId; - uint64_t rasterTiles; -}; - -event PipelineStats::GSPrimInfo -{ - uint64_t inputPrimCount; - uint64_t primGeneratedCount; - uint64_t vertsInput; -}; - -// validMask is primitives that still need to be clipped. They weren't rejected due to trivial reject or nan. -// clipMask is primitives that need to be clipped. So trivial accepts will be 0 while validMask for that is 1. -// Trivial reject is numInvocations - pop_cnt32(validMask) -// Trivial accept is validMask & ~clipMask -// Must clip count is pop_cnt32(clipMask) -event PipelineStats::ClipInfoEvent -{ - uint32_t numInvocations; - uint32_t validMask; - uint32_t clipMask; -}; - -event PipelineStats::CullInfoEvent -{ - uint32_t drawId; - uint64_t degeneratePrimMask; - uint64_t backfacePrimMask; - uint32_t validMask; -}; - -event PipelineStats::AlphaInfoEvent -{ - uint32_t drawId; - uint32_t alphaTestEnable; - uint32_t alphaBlendEnable; -}; - -event PipelineStats::DrawInstancedEvent -{ - uint32_t drawId; - uint32_t topology; - uint32_t numVertices; - int32_t startVertex; - uint32_t numInstances; - uint32_t startInstance; - uint32_t tsEnable; - uint32_t gsEnable; - uint32_t soEnable; - uint32_t soTopology; - uint32_t splitId; // Split draw count or id. -}; - -event PipelineStats::DrawIndexedInstancedEvent -{ - uint32_t drawId; - uint32_t topology; - uint32_t numIndices; - int32_t indexOffset; - int32_t baseVertex; - uint32_t numInstances; - uint32_t startInstance; - uint32_t tsEnable; - uint32_t gsEnable; - uint32_t soEnable; - uint32_t soTopology; - uint32_t splitId; // Split draw count or id. 
-}; - -event ShaderStats::VSStats -{ - HANDLE hStats; // SWR_SHADER_STATS -}; - -event ShaderStats::HSStats -{ - HANDLE hStats; // SWR_SHADER_STATS -}; - -event ShaderStats::DSStats -{ - HANDLE hStats; // SWR_SHADER_STATS -}; - -event ShaderStats::GSStats -{ - HANDLE hStats; // SWR_SHADER_STATS -}; - -event ShaderStats::PSStats -{ - HANDLE hStats; // SWR_SHADER_STATS -}; - -event ShaderStats::CSStats -{ - HANDLE hStats; // SWR_SHADER_STATS -};
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py deleted file mode 100644 index a4be675a34c..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py +++ /dev/null @@ -1,327 +0,0 @@ -# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice (including the next -# paragraph) shall be included in all copies or substantial portions of the -# Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. - -# Python source -import os -import sys -import re -from gen_common import * - -def parse_event_fields(lines, idx, event_dict): - """ - Parses lines from a proto file that contain an event definition and stores it in event_dict - """ - fields = [] - end_of_event = False - - # record all fields in event definition. - # note: we don't check if there's a leading brace. 
- while not end_of_event and idx < len(lines): - line = lines[idx].rstrip() - idx += 1 - - # ex 1: uint32_t numSampleCLZExecuted; // number of sample_cl_z instructions executed - # ex 2: char reason[256]; // size of reason - match = re.match(r'^(\s*)([\w\*]+)(\s+)([\w]+)(\[\d+\])*;\s*(\/\/.*)*$', line) - # group 1 - - # group 2 type - # group 3 - - # group 4 name - # group 5 [array size] - # group 6 //comment - - if match: - field = { - "type": match.group(2), - "name": match.group(4), - "size": int(match.group(5)[1:-1]) if match.group(5) else 1, - "desc": match.group(6)[2:].strip() if match.group(6) else "", - } - fields.append(field) - - end_of_event = re.match(r'(\s*)};', line) - - event_dict['fields'] = fields - event_dict['num_fields'] = len(fields) - - return idx - -def parse_enums(lines, idx, event_dict): - """ - Parses lines from a proto file that contain an enum definition and stores it in event_dict - """ - enum_names = [] - end_of_enum = False - - # record all enum values in enumeration - # note: we don't check if there's a leading brace. - while not end_of_enum and idx < len(lines): - line = lines[idx].rstrip() - idx += 1 - - preprocessor = re.search(r'#if|#endif', line) - - if not preprocessor: - enum = re.match(r'(\s*)(\w+)(\s*)', line) - - if enum: - enum_names.append(line) - - end_of_enum = re.match(r'(\s*)};', line) - - event_dict['names'] = enum_names - return idx - -def parse_protos(files, verbose=False): - """ - Parses a proto file and returns a dictionary of event definitions - """ - - # Protos structure: - # - # { - # "events": { - # "defs": { // dict of event definitions where keys are 'group_name::event_name" - # ..., - # "ApiStat::DrawInfoEvent": { - # "id": 3, - # "group": "ApiStat", - # "name": "DrawInfoEvent", // name of event without 'group_name::' prefix - # "desc": "", - # "fields": [ - # { - # "type": "uint32_t", - # "name": "drawId", - # "size": 1, - # "desc": "", - # }, - # ... - # ] - # }, - # ... 
- # }, - # "groups": { // dict of groups with lists of event keys - # "ApiStat": [ - # "ApiStat::DispatchEvent", - # "ApiStat::DrawInfoEvent", - # ... - # ], - # "Framework": [ - # "Framework::ThreadStartApiEvent", - # "Framework::ThreadStartWorkerEvent", - # ... - # ], - # ... - # }, - # "map": { // map of event ids to match archrast output to event key - # "1": "Framework::ThreadStartApiEvent", - # "2": "Framework::ThreadStartWorkerEvent", - # "3": "ApiStat::DrawInfoEvent", - # ... - # } - # }, - # "enums": { ... } // enums follow similar defs, map (groups?) structure - # } - - protos = { - 'events': { - 'defs': {}, # event dictionary containing events with their fields - 'map': {}, # dictionary to map event ids to event names - 'groups': {} # event keys stored by groups - }, - 'enums': { - 'defs': {}, - 'map': {} - } - } - - event_id = 0 - enum_id = 0 - - if type(files) is not list: - files = [files] - - for filename in files: - if verbose: - print("Parsing proto file: %s" % os.path.normpath(filename)) - - with open(filename, 'r') as f: - lines = f.readlines() - in_brief = False - brief = [] - idx = 0 - while idx < len(lines): - line = lines[idx].strip() - idx += 1 - - # If currently processing a brief, keep processing or change state - if in_brief: - match = re.match(r'^\s*\/\/\/\s*(.*)$', line) # i.e. "/// more event desc..." - if match: - brief.append(match.group(1).strip()) - continue - else: - in_brief = False - - # Match event/enum brief - match = re.match(r'^\s*\/\/\/\s*@(brief|breif)\s*(.*)$', line) # i.e. "///@brief My event desc..." - if match: - in_brief = True - brief.append(match.group(2).strip()) - continue - - # Match event definition - match = re.match(r'event(\s*)(((\w*)::){0,1}(\w+))', line) # i.e. "event SWTag::CounterEvent" - if match: - event_id += 1 - - # Parse event attributes - event_key = match.group(2) # i.e. SWTag::CounterEvent - event_group = match.group(4) if match.group(4) else "" # i.e. SWTag - event_name = match.group(5) # i.e. 
CounterEvent - - # Define event attributes - event = { - 'id': event_id, - 'group': event_group, - 'name': event_name, - 'desc': ' '.join(brief) - } - # Add period at end of event desc if necessary - if event["desc"] and event["desc"][-1] != '.': - event["desc"] += '.' - - # Reset brief - brief = [] - - # Now add event fields - idx = parse_event_fields(lines, idx, event) - - # Register event and mapping - protos['events']['defs'][event_key] = event - protos['events']['map'][event_id] = event_key - - continue - - # Match enum definition - match = re.match(r'enum(\s*)(\w+)', line) - if match: - enum_id += 1 - - # Parse enum attributes - enum_name = match.group(2) - - # Define enum attr - enum = { - 'name': enum_name, - 'desc': ' '.join(brief) - } - # Add period at end of event desc if necessary - if enum["desc"] and enum["desc"][-1] != '.': - enum["desc"] += '.' - - # Reset brief - brief = [] - - # Now add enum fields - idx = parse_enums(lines, idx, enum) - - # Register enum and mapping - protos['enums']['defs'][enum_name] = enum - protos['enums']['map'][enum_id] = enum_name - - continue - - # Sort and group events - event_groups = protos['events']['groups'] - for key in sorted(protos['events']['defs']): - group = protos['events']['defs'][key]['group'] - if group not in event_groups: - event_groups[group] = [] - event_groups[group].append(key) - - return protos - - -def main(): - - # Parse args... - parser = ArgumentParser() - parser.add_argument("--proto", "-p", dest="protos", nargs='+', help="Path to all proto file(s) to process. Accepts one or more paths (i.e. events.proto and events_private.proto)", required=True) - parser.add_argument("--output-dir", help="Output dir (defaults to ./codegen). 
Will create folder if it does not exist.", required=False, default="codegen") - parser.add_argument("--verbose", "-v", help="Verbose", action="store_true") - args = parser.parse_args() - - if not os.path.exists(args.output_dir): - MakeDir(args.output_dir) - - for f in args.protos: - if not os.path.exists(f): - print('Error: Could not find proto file %s' % f, file=sys.stderr) - return 1 - - # Parse each proto file and add to protos container - protos = parse_protos(args.protos, args.verbose) - - files = [ - ["gen_ar_event.hpp", ""], - ["gen_ar_event.cpp", ""], - ["gen_ar_eventhandler.hpp", "gen_ar_event.hpp"], - ["gen_ar_eventhandlerfile.hpp", "gen_ar_eventhandler.hpp"] - ] - - rval = 0 - - try: - # Delete existing files - for f in files: - filename = f[0] - output_fullpath = os.path.join(args.output_dir, filename) - if os.path.exists(output_fullpath): - if args.verbose: - print("Deleting existing file: %s" % output_fullpath) - os.remove(output_fullpath) - - # Generate files from templates - print("Generating c++ from proto files...") - for f in files: - filename = f[0] - event_header = f[1] - curdir = os.path.dirname(os.path.abspath(__file__)) - template_file = os.path.join(curdir, 'templates', filename) - output_fullpath = os.path.join(args.output_dir, filename) - - if args.verbose: - print("Generating: %s" % output_fullpath) - MakoTemplateWriter.to_file(template_file, output_fullpath, - cmdline=sys.argv, - filename=filename, - protos=protos, - event_header=event_header) - - except Exception as e: - print(e) - rval = 1 - - return rval - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py deleted file mode 100644 index eb51a3a8a13..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (C) 2017-2018 Intel Corporation. All Rights Reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the 'Software'), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice (including the next -# paragraph) shall be included in all copies or substantial portions of the -# Software. -# -# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. 
- -# Python source - -import itertools -import os -import sys -from gen_common import * - - -def main(args=sys.argv[1:]): - thisDir = os.path.dirname(os.path.realpath(__file__)) - parser = ArgumentParser('Generate files and initialization functions for all permutations of BackendPixelRate.') - parser.add_argument('--dim', help='gBackendPixelRateTable array dimensions', nargs='+', type=int, required=True) - parser.add_argument('--outdir', help='output directory', nargs='?', type=str, default=thisDir) - parser.add_argument('--split', help='how many lines of initialization per file [0=no split]', nargs='?', type=int, default='512') - parser.add_argument('--numfiles', help='how many output files to generate', nargs='?', type=int, default='0') - parser.add_argument('--cpp', help='Generate cpp file(s)', action='store_true', default=False) - parser.add_argument('--hpp', help='Generate hpp file', action='store_true', default=False) - parser.add_argument('--cmake', help='Generate cmake file', action='store_true', default=False) - parser.add_argument('--rast', help='Generate rasterizer functions instead of normal backend', action='store_true', default=False) - - args = parser.parse_args(args) - - - class backendStrs : - def __init__(self) : - self.outFileName = 'gen_BackendPixelRate%s.cpp' - self.outHeaderName = 'gen_BackendPixelRate.hpp' - self.functionTableName = 'gBackendPixelRateTable' - self.funcInstanceHeader = ' = BackendPixelRate<SwrBackendTraits<' - self.template = 'gen_backend.cpp' - self.hpp_template = 'gen_header_init.hpp' - self.cmakeFileName = 'gen_backends.cmake' - self.cmakeSrcVar = 'GEN_BACKEND_SOURCES' - self.tableName = 'BackendPixelRate' - - if args.rast: - self.outFileName = 'gen_rasterizer%s.cpp' - self.outHeaderName = 'gen_rasterizer.hpp' - self.functionTableName = 'gRasterizerFuncs' - self.funcInstanceHeader = ' = RasterizeTriangle<RasterizerTraits<' - self.template = 'gen_rasterizer.cpp' - self.cmakeFileName = 'gen_rasterizer.cmake' - 
self.cmakeSrcVar = 'GEN_RASTERIZER_SOURCES' - self.tableName = 'RasterizerFuncs' - - - backend = backendStrs() - - output_list = [] - for x in args.dim: - output_list.append(list(range(x))) - - # generate all permutations possible for template parameter inputs - output_combinations = list(itertools.product(*output_list)) - output_list = [] - - # for each permutation - for x in range(len(output_combinations)): - # separate each template peram into its own list member - new_list = [output_combinations[x][i] for i in range(len(output_combinations[x]))] - tempStr = backend.functionTableName - #print each list member as an index in the multidimensional array - for i in new_list: - tempStr += '[' + str(i) + ']' - #map each entry in the permutation as its own string member, store as the template instantiation string - tempStr += backend.funcInstanceHeader + ','.join(map(str, output_combinations[x])) + '>>;' - #append the line of c++ code in the list of output lines - output_list.append(tempStr) - - # how many files should we split the global template initialization into? 
- if (args.split == 0): - numFiles = 1 - else: - numFiles = (len(output_list) + args.split - 1) // args.split - if (args.numfiles != 0): - numFiles = args.numfiles - linesPerFile = (len(output_list) + numFiles - 1) // numFiles - chunkedList = [output_list[x:x+linesPerFile] for x in range(0, len(output_list), linesPerFile)] - - tmp_output_dir = MakeTmpDir('_codegen') - - if not os.path.exists(args.outdir): - try: - os.makedirs(args.outdir) - except OSError as err: - if err.errno != errno.EEXIST: - print('ERROR: Could not create directory:', args.outdir, file=sys.stderr) - return 1 - - rval = 0 - - # generate .cpp files - try: - if args.cpp: - baseCppName = os.path.join(tmp_output_dir, backend.outFileName) - templateCpp = os.path.join(thisDir, 'templates', backend.template) - - for fileNum in range(numFiles): - filename = baseCppName % str(fileNum) - MakoTemplateWriter.to_file( - templateCpp, - baseCppName % str(fileNum), - cmdline=sys.argv, - fileNum=fileNum, - funcList=chunkedList[fileNum]) - - if args.hpp: - baseHppName = os.path.join(tmp_output_dir, backend.outHeaderName) - templateHpp = os.path.join(thisDir, 'templates', backend.hpp_template) - - MakoTemplateWriter.to_file( - templateHpp, - baseHppName, - cmdline=sys.argv, - numFiles=numFiles, - filename=backend.outHeaderName, - tableName=backend.tableName) - - # generate gen_backend.cmake file - if args.cmake: - templateCmake = os.path.join(thisDir, 'templates', 'gen_backend.cmake') - cmakeFile = os.path.join(tmp_output_dir, backend.cmakeFileName) - - MakoTemplateWriter.to_file( - templateCmake, - cmakeFile, - cmdline=sys.argv, - srcVar=backend.cmakeSrcVar, - numFiles=numFiles, - baseCppName='${RASTY_GEN_SRC_DIR}/backends/' + os.path.basename(baseCppName)) - - rval = CopyDirFilesIfDifferent(tmp_output_dir, args.outdir) - - except: - rval = 1 - - finally: - DeleteDirTree(tmp_output_dir) - - return rval - -if __name__ == '__main__': - sys.exit(main()) diff --git 
a/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py deleted file mode 100644 index c1d08fb83bc..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py +++ /dev/null @@ -1,291 +0,0 @@ -# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice (including the next -# paragraph) shall be included in all copies or substantial portions of the -# Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. 
- -# Python source -import os -import errno -import sys -import argparse -import tempfile -import filecmp -import shutil -from mako.template import Template -from mako.exceptions import RichTraceback - -#============================================================================== -def ConcatLists(list_of_lists): - output = [] - for l in list_of_lists: output += l - return output - -#============================================================================== -def MakeTmpDir(suffix=''): - ''' - Create temporary directory for use in codegen scripts. - ''' - return tempfile.mkdtemp(suffix) - -#============================================================================== -def MakeDir(dir_path): - ''' - Create a directory if it doesn't exist - - returns 0 on success, non-zero on failure - ''' - dir_path = os.path.abspath(dir_path) - - if not os.path.exists(dir_path): - try: - os.makedirs(dir_path) - except OSError as err: - if err.errno != errno.EEXIST: - return 1 - else: - if not os.path.isdir(dir_path): - return 1 - - return 0 - -#============================================================================== -def DeleteDirTree(dir_path): - ''' - Delete directory tree. - - returns 0 on success, non-zero on failure - ''' - rval = 0 - try: - shutil.rmtree(dir_path, False) - except: - rval = 1 - return rval - -#============================================================================== -def CopyFileIfDifferent(src, dst, verbose = False): - ''' - Copy <src> file to <dst> file if the <dst> - file either doesn't contain the file or the file - contents are different. 
- - returns 0 on success, non-zero on failure - ''' - - assert os.path.isfile(src) - assert (False == os.path.exists(dst) or os.path.isfile(dst)) - - need_copy = not os.path.exists(dst) - if not need_copy: - need_copy = not filecmp.cmp(src, dst) - - if need_copy: - try: - shutil.copy2(src, dst) - except: - print('ERROR: Could not copy %s to %s' % (src, dst), file=sys.stderr) - return 1 - - if verbose: - print(src, '-->', dst) - - return 0 - -#============================================================================== -def CopyDirFilesIfDifferent(src, dst, recurse = True, verbose = False, orig_dst = None): - ''' - Copy files <src> directory to <dst> directory if the <dst> - directory either doesn't contain the file or the file - contents are different. - - Optionally recurses into subdirectories - - returns 0 on success, non-zero on failure - ''' - - assert os.path.isdir(src) - assert os.path.isdir(dst) - - src = os.path.abspath(src) - dst = os.path.abspath(dst) - - if not orig_dst: - orig_dst = dst - - for f in os.listdir(src): - src_path = os.path.join(src, f) - dst_path = os.path.join(dst, f) - - # prevent recursion - if src_path == orig_dst: - continue - - if os.path.isdir(src_path): - if recurse: - if MakeDir(dst_path): - print('ERROR: Could not create directory:', dst_path, file=sys.stderr) - return 1 - - if verbose: - print('mkdir', dst_path) - rval = CopyDirFilesIfDifferent(src_path, dst_path, recurse, verbose, orig_dst) - else: - rval = CopyFileIfDifferent(src_path, dst_path, verbose) - - if rval: - return rval - - return 0 - -#============================================================================== -class MakoTemplateWriter: - ''' - MakoTemplateWriter - Class (namespace) for functions to generate strings - or files using the Mako template module. - - See http://docs.makotemplates.org/en/latest/ for - mako documentation. 
- ''' - - @staticmethod - def to_string(template_filename, **kwargs): - ''' - Write template data to a string object and return the string - ''' - from mako.template import Template - from mako.exceptions import RichTraceback - - try: - template = Template(filename=template_filename) - # Split + Join fixes line-endings for whatever platform you are using - return '\n'.join(template.render(**kwargs).splitlines()) - except: - traceback = RichTraceback() - for (filename, lineno, function, line) in traceback.traceback: - print('File %s, line %s, in %s' % (filename, lineno, function)) - print(line, '\n') - print('%s: %s' % (str(traceback.error.__class__.__name__), traceback.error)) - raise - - @staticmethod - def to_file(template_filename, output_filename, **kwargs): - ''' - Write template data to a file - ''' - if MakeDir(os.path.dirname(output_filename)): - return 1 - with open(output_filename, 'w') as outfile: - print(MakoTemplateWriter.to_string(template_filename, **kwargs), file=outfile) - return 0 - - -#============================================================================== -class ArgumentParser(argparse.ArgumentParser): - ''' - Subclass of argparse.ArgumentParser - - Allow parsing from command files that start with @ - Example: - >bt run @myargs.txt - - Contents of myargs.txt: - -m <machine> - --target cdv_win7 - - The below function allows multiple args to be placed on the same text-file line. - The default is one token per line, which is a little cumbersome. - - Also allow all characters after a '#' character to be ignored. 
- ''' - - #============================================================================== - class _HelpFormatter(argparse.RawTextHelpFormatter): - ''' Better help formatter for argument parser ''' - - def _split_lines(self, text, width): - ''' optimized split lines algorithm, indents split lines ''' - lines = text.splitlines() - out_lines = [] - if len(lines): - out_lines.append(lines[0]) - for line in lines[1:]: - out_lines.append(' ' + line) - return out_lines - - #============================================================================== - def __init__(self, *args, **kwargs): - ''' Constructor. Compatible with argparse.ArgumentParser(), - but with some modifications for better usage and help display. - ''' - super(ArgumentParser, self).__init__( - *args, - fromfile_prefix_chars='@', - formatter_class=ArgumentParser._HelpFormatter, - **kwargs) - - #========================================================================== - def convert_arg_line_to_args(self, arg_line): - ''' convert one line of parsed file to arguments ''' - arg_line = arg_line.split('#', 1)[0] - if sys.platform == 'win32': - arg_line = arg_line.replace('\\', '\\\\') - for arg in shlex.split(arg_line): - if not arg.strip(): - continue - yield arg - - #========================================================================== - def _read_args_from_files(self, arg_strings): - ''' read arguments from files ''' - # expand arguments referencing files - new_arg_strings = [] - for arg_string in arg_strings: - - # for regular arguments, just add them back into the list - if arg_string[0] not in self.fromfile_prefix_chars: - new_arg_strings.append(arg_string) - - # replace arguments referencing files with the file content - else: - filename = arg_string[1:] - - # Search in sys.path - if not os.path.exists(filename): - for path in sys.path: - filename = os.path.join(path, arg_string[1:]) - if os.path.exists(filename): - break - - try: - args_file = open(filename) - try: - arg_strings = [] - for 
arg_line in args_file.read().splitlines(): - for arg in self.convert_arg_line_to_args(arg_line): - arg_strings.append(arg) - arg_strings = self._read_args_from_files(arg_strings) - new_arg_strings.extend(arg_strings) - finally: - args_file.close() - except IOError: - err = sys.exc_info()[1] - self.error(str(err)) - - # return the modified argument list - return new_arg_strings diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py deleted file mode 100644 index bd39ef645f7..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice (including the next -# paragraph) shall be included in all copies or substantial portions of the -# Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. 
- -# Python source -import os -import sys -import knob_defs -from gen_common import * - -def main(args=sys.argv[1:]): - - # parse args - parser = ArgumentParser() - parser.add_argument("--output", "-o", help="Path to output file", required=True) - parser.add_argument("--gen_h", "-gen_h", help="Generate gen_knobs.h", action="store_true", default=False) - parser.add_argument("--gen_cpp", "-gen_cpp", help="Generate gen_knobs.cpp", action="store_true", required=False) - - args = parser.parse_args() - - cur_dir = os.path.dirname(os.path.abspath(__file__)) - template_cpp = os.path.join(cur_dir, 'templates', 'gen_knobs.cpp') - template_h = os.path.join(cur_dir, 'templates', 'gen_knobs.h') - - output_filename = os.path.basename(args.output) - output_dir = MakeTmpDir('_codegen') - - output_file = os.path.join(output_dir, output_filename) - - rval = 0 - - try: - if args.gen_h: - MakoTemplateWriter.to_file( - template_h, - output_file, - cmdline=sys.argv, - filename='gen_knobs', - knobs=knob_defs.KNOBS) - - if args.gen_cpp: - MakoTemplateWriter.to_file( - template_cpp, - output_file, - cmdline=sys.argv, - filename='gen_knobs', - knobs=knob_defs.KNOBS, - includes=['core/knobs_init.h', 'common/os.h', 'sstream', 'iomanip']) - - rval = CopyFileIfDifferent(output_file, args.output) - - except: - rval = 1 - - finally: - # ignore errors from delete of tmp directory - DeleteDirTree(output_dir) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) - diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py deleted file mode 100644 index f3ab7120a43..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py +++ /dev/null @@ -1,362 +0,0 @@ -# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice (including the next -# paragraph) shall be included in all copies or substantial portions of the -# Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. 
- -import os, sys, re -from gen_common import * -from argparse import FileType - -inst_aliases = { - 'SHUFFLE_VECTOR': 'VSHUFFLE', - 'INSERT_ELEMENT': 'VINSERT', - 'EXTRACT_ELEMENT': 'VEXTRACT', - 'MEM_SET': 'MEMSET', - 'MEM_CPY': 'MEMCOPY', - 'MEM_MOVE': 'MEMMOVE', - 'L_SHR': 'LSHR', - 'A_SHR': 'ASHR', - 'BIT_CAST': 'BITCAST', - 'U_DIV': 'UDIV', - 'S_DIV': 'SDIV', - 'U_REM': 'UREM', - 'S_REM': 'SREM', - 'BIN_OP': 'BINOP', -} - -intrinsics = [ - ['VGATHERPD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'], - ['VGATHERPS', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'], - ['VGATHERDD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'], - ['VSCATTERPS', ['pBase', 'mask', 'indices', 'src', 'scale'], 'src'], - ['VRCPPS', ['a'], 'a'], - ['VROUND', ['a', 'rounding'], 'a'], - ['BEXTR_32', ['src', 'control'], 'src'], - ['VPSHUFB', ['a', 'b'], 'a'], - ['VPERMD', ['a', 'idx'], 'a'], - ['VPERMPS', ['idx', 'a'], 'a'], - ['VCVTPD2PS', ['a'], 'getVectorType(mFP32Ty, VEC_GET_NUM_ELEMS)'], - ['VCVTPS2PH', ['a', 'round'], 'mSimdInt16Ty'], - ['VHSUBPS', ['a', 'b'], 'a'], - ['VPTESTC', ['a', 'b'], 'mInt32Ty'], - ['VPTESTZ', ['a', 'b'], 'mInt32Ty'], - ['VPHADDD', ['a', 'b'], 'a'], - ['PDEP32', ['a', 'b'], 'a'], - ['RDTSC', [], 'mInt64Ty'], -] - -llvm_intrinsics = [ - ['CTTZ', 'cttz', ['a', 'flag'], ['a']], - ['CTLZ', 'ctlz', ['a', 'flag'], ['a']], - ['VSQRTPS', 'sqrt', ['a'], ['a']], - ['STACKSAVE', 'stacksave', [], []], - ['STACKRESTORE', 'stackrestore', ['a'], []], - ['VMINPS', 'minnum', ['a', 'b'], ['a']], - ['VMAXPS', 'maxnum', ['a', 'b'], ['a']], - ['VFMADDPS', 'fmuladd', ['a', 'b', 'c'], ['a']], - ['DEBUGTRAP', 'debugtrap', [], []], - ['POPCNT', 'ctpop', ['a'], ['a']], - ['LOG2', 'log2', ['a'], ['a']], - ['FABS', 'fabs', ['a'], ['a']], - ['EXP2', 'exp2', ['a'], ['a']], - ['COS', 'cos', ['a'], ['a']], - ['SIN', 'sin', ['a'], ['a']], - ['FLOOR', 'floor', ['a'], ['a']], - ['POW', 'pow', ['a', 'b'], ['a']] -] - -this_dir = os.path.dirname(os.path.abspath(__file__)) 
-template = os.path.join(this_dir, 'templates', 'gen_builder.hpp') - -def convert_uppercamel(name): - s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).upper() - -''' - Given an input file (e.g. IRBuilder.h) generates function dictionary. -''' -def parse_ir_builder(input_file): - - functions = [] - - lines = input_file.readlines() - deprecated = None - - idx = 0 - while idx < len(lines) - 1: - line = lines[idx].rstrip() - idx += 1 - - if deprecated is None: - deprecated = re.search(r'LLVM_ATTRIBUTE_DEPRECATED', line) - - #match = re.search(r'\*Create', line) - match = re.search(r'[\*\s]Create(\w*)\(', line) - if match is not None: - #print('Line: %s' % match.group(1)) - - # Skip function if LLVM_ATTRIBUTE_DEPRECATED found before - if deprecated is not None: - deprecated = None - continue - - if re.search(r'^\s*Create', line) is not None: - func_sig = lines[idx-2].rstrip() + line - else: - func_sig = line - - end_of_args = False - while not end_of_args: - end_paren = re.search(r'\)', line) - if end_paren is not None: - end_of_args = True - else: - line = lines[idx].rstrip() - func_sig += line - idx += 1 - - delfunc = re.search(r'LLVM_DELETED_FUNCTION|= delete;', func_sig) - - if not delfunc: - func = re.search(r'(.*?)\*[\n\s]*(Create\w*)\((.*?)\)', func_sig) - if func is not None: - - return_type = func.group(1).strip() + '*' - func_name = func.group(2) - arguments = func.group(3) - - func_args = [] - arg_names = [] - args = arguments.split(',') - for arg in args: - arg = arg.strip() - if arg: - func_args.append(arg) - - split_args = arg.split('=') - arg_name = split_args[0].rsplit(None, 1)[-1] - - reg_arg = re.search(r'[\&\*]*(\w*)', arg_name) - if reg_arg: - arg_names += [reg_arg.group(1)] - - ignore = False - - # The following functions need to be ignored in openswr. 
- # API change in llvm-5.0 breaks baked autogen files - if ( - (func_name == 'CreateFence' or - func_name == 'CreateAtomicCmpXchg' or - func_name == 'CreateAtomicRMW')): - ignore = True - - # The following functions need to be ignored. - if (func_name == 'CreateInsertNUWNSWBinOp' or - func_name == 'CreateMaskedIntrinsic' or - func_name == 'CreateAlignmentAssumptionHelper' or - func_name == 'CreateGEP' or - func_name == 'CreateLoad' or - func_name == 'CreateMaskedLoad' or - func_name == 'CreateStore' or - func_name == 'CreateMaskedStore' or - func_name == 'CreateFCmpHelper' or - func_name == 'CreateElementUnorderedAtomicMemCpy'): - ignore = True - - # Convert CamelCase to CAMEL_CASE - func_mod = re.search(r'Create(\w*)', func_name) - if func_mod: - func_mod = func_mod.group(1) - func_mod = convert_uppercamel(func_mod) - if func_mod[0:2] == 'F_' or func_mod[0:2] == 'I_': - func_mod = func_mod[0] + func_mod[2:] - - # Substitute alias based on CAMEL_CASE name. - func_alias = inst_aliases.get(func_mod) - if not func_alias: - func_alias = func_mod - - if func_name == 'CreateCall' or func_name == 'CreateGEP': - arglist = re.search(r'ArrayRef', ', '.join(func_args)) - if arglist: - func_alias = func_alias + 'A' - - if not ignore: - functions.append({ - 'name' : func_name, - 'alias' : func_alias, - 'return' : return_type, - 'args' : ', '.join(func_args), - 'arg_names' : arg_names, - }) - - return functions - -''' - Auto-generates macros for LLVM IR -''' -def generate_gen_h(functions, output_dir): - filename = 'gen_builder.hpp' - output_filename = os.path.join(output_dir, filename) - - templfuncs = [] - for func in functions: - decl = '%s %s(%s)' % (func['return'], func['alias'], func['args']) - - templfuncs.append({ - 'decl' : decl, - 'intrin' : func['name'], - 'args' : func['arg_names'], - }) - - MakoTemplateWriter.to_file( - template, - output_filename, - cmdline=sys.argv, - comment='Builder IR Wrappers', - filename=filename, - functions=templfuncs, - isX86=False, 
isIntrin=False) - -''' - Auto-generates macros for LLVM IR -''' -def generate_meta_h(output_dir): - filename = 'gen_builder_meta.hpp' - output_filename = os.path.join(output_dir, filename) - - functions = [] - for inst in intrinsics: - name = inst[0] - args = inst[1] - ret = inst[2] - - #print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], len(inst[2]))) - if len(args) != 0: - declargs = 'Value* ' + ', Value* '.join(args) - decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (name, declargs) - else: - decl = 'Value* %s(const llvm::Twine& name = "")' % (name) - - # determine the return type of the intrinsic. It can either be: - # - type of one of the input arguments - # - snippet of code to set the return type - - if ret in args: - returnTy = ret + '->getType()' - else: - returnTy = ret - - functions.append({ - 'decl' : decl, - 'name' : name, - 'args' : args, - 'returnType': returnTy - }) - - MakoTemplateWriter.to_file( - template, - output_filename, - cmdline=sys.argv, - comment='meta intrinsics', - filename=filename, - functions=functions, - isX86=True, isIntrin=False) - -def generate_intrin_h(output_dir): - filename = 'gen_builder_intrin.hpp' - output_filename = os.path.join(output_dir, filename) - - functions = [] - for inst in llvm_intrinsics: - #print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], len(inst[2]))) - if len(inst[2]) != 0: - declargs = 'Value* ' + ', Value* '.join(inst[2]) - decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (inst[0], declargs) - else: - decl = 'Value* %s(const llvm::Twine& name = "")' % (inst[0]) - - functions.append({ - 'decl' : decl, - 'intrin' : inst[1], - 'args' : inst[2], - 'types' : inst[3], - }) - - MakoTemplateWriter.to_file( - template, - output_filename, - cmdline=sys.argv, - comment='llvm intrinsics', - filename=filename, - functions=functions, - isX86=False, isIntrin=True) -''' - Function which is invoked when this script is started from a command line. 
- Will present and consume a set of arguments which will tell this script how - to behave -''' -def main(): - - # Parse args... - parser = ArgumentParser() - parser.add_argument('--input', '-i', type=FileType('r'), help='Path to IRBuilder.h', required=False) - parser.add_argument('--output-dir', '-o', action='store', dest='output', help='Path to output directory', required=True) - parser.add_argument('--gen_h', help='Generate builder_gen.h', action='store_true', default=False) - parser.add_argument('--gen_meta_h', help='Generate meta intrinsics. No input is needed.', action='store_true', default=False) - parser.add_argument('--gen_intrin_h', help='Generate llvm intrinsics. No input is needed.', action='store_true', default=False) - args = parser.parse_args() - - if not os.path.exists(args.output): - os.makedirs(args.output) - - final_output_dir = args.output - args.output = MakeTmpDir('_codegen') - - rval = 0 - try: - if args.input: - functions = parse_ir_builder(args.input) - - if args.gen_h: - generate_gen_h(functions, args.output) - - elif args.gen_h: - print('Need to specify --input for --gen_h!') - - if args.gen_meta_h: - generate_meta_h(args.output) - - if args.gen_intrin_h: - generate_intrin_h(args.output) - - rval = CopyDirFilesIfDifferent(args.output, final_output_dir) - - except: - print('ERROR: Could not generate llvm_ir_macros', file=sys.stderr) - rval = 1 - - finally: - DeleteDirTree(args.output) - - return rval - -if __name__ == '__main__': - sys.exit(main()) -# END OF FILE diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py deleted file mode 100644 index 4739f2078d6..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py +++ /dev/null @@ -1,360 +0,0 @@ -# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice (including the next -# paragraph) shall be included in all copies or substantial portions of the -# Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. 
- -import os, sys, re -from gen_common import * -from argparse import FileType - -''' -''' -def gen_llvm_type(type, name, idx, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file): - - llvm_type = '' - - if is_llvm_struct: - if is_pointer or is_pointer_pointer: - llvm_type = 'Type::getInt32Ty(ctx)' - else: - llvm_type = 'ArrayType::get(Type::getInt8Ty(ctx), sizeof(%s))' % type - elif is_llvm_enum: - llvm_type = 'Type::getInt32Ty(ctx)' - elif is_llvm_pfn: - llvm_type = 'PointerType::get(Type::getInt8Ty(ctx), 0)' - else: - if type == 'BYTE' or type == 'char' or type == 'uint8_t' or type == 'int8_t' or type == 'bool': - llvm_type = 'Type::getInt8Ty(ctx)' - elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type == 'int64_t' or type == 'gfxptr_t': - llvm_type = 'Type::getInt64Ty(ctx)' - elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t': - llvm_type = 'Type::getInt16Ty(ctx)' - elif type == 'UINT' or type == 'INT' or type == 'int' or type == 'BOOL' or type == 'uint32_t' or type == 'int32_t': - llvm_type = 'Type::getInt32Ty(ctx)' - elif type == 'float' or type == 'FLOAT': - llvm_type = 'Type::getFloatTy(ctx)' - elif type == 'double' or type == 'DOUBLE': - llvm_type = 'Type::getDoubleTy(ctx)' - elif type == 'void' or type == 'VOID': - llvm_type = 'Type::getInt32Ty(ctx)' - elif type == 'HANDLE': - llvm_type = 'PointerType::get(Type::getInt32Ty(ctx), 0)' - elif type == 'simdscalar': - llvm_type = 'getVectorType(Type::getFloatTy(ctx), pJitMgr->mVWidth)' - elif type == 'simdscalari': - llvm_type = 'getVectorType(Type::getInt32Ty(ctx), pJitMgr->mVWidth)' - elif type == 'simd16scalar': - llvm_type = 'getVectorType(Type::getFloatTy(ctx), 16)' - elif type == 'simd16scalari': - llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 16)' - elif type == '__m128i': - llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 4)' - elif type == 'SIMD256::Float': - llvm_type = 
'getVectorType(Type::getFloatTy(ctx), 8)' - elif type == 'SIMD256::Integer': - llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 8)' - elif type == 'SIMD512::Float': - llvm_type = 'getVectorType(Type::getFloatTy(ctx), 16)' - elif type == 'SIMD512::Integer': - llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 16)' - elif type == 'simdvector': - llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 8), 4)' - elif type == 'simd16vector': - llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 16), 4)' - elif type == 'SIMD256::Vec4': - llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 8), 4)' - elif type == 'SIMD512::Vec4': - llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 16), 4)' - else: - llvm_type = 'Gen_%s(pJitMgr)' % type - - if is_pointer: - llvm_type = 'PointerType::get(%s, 0)' % llvm_type - - if is_pointer_pointer: - llvm_type = 'PointerType::get(%s, 0)' % llvm_type - - if is_array_array: - llvm_type = 'ArrayType::get(ArrayType::get(%s, %s), %s)' % (llvm_type, array_count1, array_count) - elif is_array: - llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count) - - return { - 'name' : name, - 'lineNum' : idx, - 'type' : llvm_type, - } - -''' -''' -def gen_llvm_types(input_file, output_file): - - lines = input_file.readlines() - - types = [] - - for idx in range(len(lines)): - line = lines[idx].rstrip() - - if 'gen_llvm_types FINI' in line: - break - - match = re.match(r'(\s*)struct(\s*)(\w+)', line) - if match: - llvm_args = [] - - # Detect start of structure - is_fwd_decl = re.search(r';', line) - - if not is_fwd_decl: - - # Extract the command name - struct_name = match.group(3).strip() - - type_entry = { - 'name' : struct_name, - 'lineNum' : idx+1, - 'members' : [], - } - - end_of_struct = False - - while not end_of_struct and idx < len(lines)-1: - idx += 1 - line = lines[idx].rstrip() - - is_llvm_typedef = re.search(r'@llvm_typedef', line) - if is_llvm_typedef is not None: - is_llvm_typedef = 
True - continue - else: - is_llvm_typedef = False - - ########################################### - # Is field a llvm struct? Tells script to treat type as array of bytes that is size of structure. - is_llvm_struct = re.search(r'@llvm_struct', line) - - if is_llvm_struct is not None: - is_llvm_struct = True - else: - is_llvm_struct = False - - ########################################### - # Is field the start of a function? Tells script to ignore it - is_llvm_func_start = re.search(r'@llvm_func_start', line) - - if is_llvm_func_start is not None: - while not end_of_struct and idx < len(lines)-1: - idx += 1 - line = lines[idx].rstrip() - is_llvm_func_end = re.search(r'@llvm_func_end', line) - if is_llvm_func_end is not None: - break; - continue - - ########################################### - # Is field a function? Tells script to ignore it - is_llvm_func = re.search(r'@llvm_func', line) - - if is_llvm_func is not None: - continue - - ########################################### - # Is field a llvm enum? Tells script to treat type as an enum and replaced with uint32 type. - is_llvm_enum = re.search(r'@llvm_enum', line) - - if is_llvm_enum is not None: - is_llvm_enum = True - else: - is_llvm_enum = False - - ########################################### - # Is field a llvm function pointer? Tells script to treat type as an enum and replaced with uint32 type. - is_llvm_pfn = re.search(r'@llvm_pfn', line) - - if is_llvm_pfn is not None: - is_llvm_pfn = True - else: - is_llvm_pfn = False - - ########################################### - # Is field const? - is_const = re.search(r'\s+const\s+', line) - - if is_const is not None: - is_const = True - else: - is_const = False - - ########################################### - # Is field a pointer? - is_pointer_pointer = re.search('\*\*', line) - - if is_pointer_pointer is not None: - is_pointer_pointer = True - else: - is_pointer_pointer = False - - ########################################### - # Is field a pointer? 
- is_pointer = re.search('\*', line) - - if is_pointer is not None: - is_pointer = True - else: - is_pointer = False - - ########################################### - # Is field an array of arrays? - # TODO: Can add this to a list. - is_array_array = re.search('\[(\w*)\]\[(\w*)\]', line) - array_count = '0' - array_count1 = '0' - - if is_array_array is not None: - array_count = is_array_array.group(1) - array_count1 = is_array_array.group(2) - is_array_array = True - else: - is_array_array = False - - ########################################### - # Is field an array? - is_array = re.search('\[(\w*)\]', line) - - if is_array is not None: - array_count = is_array.group(1) - is_array = True - else: - is_array = False - - is_scoped = re.search('::', line) - - if is_scoped is not None: - is_scoped = True - else: - is_scoped = False - - type = None - name = None - if is_const and is_pointer: - - if is_scoped: - field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+::)(\w+)(\s*\**\s*)(\w+)', line) - - type = '%s%s' % (field_match.group(4), field_match.group(5)) - name = field_match.group(7) - else: - field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*\**\s*)(\w+)', line) - - type = field_match.group(4) - name = field_match.group(6) - - elif is_pointer: - field_match = re.match(r'(\s*)(\s+)(\w+\<*\w*\>*)(\s*\**\s*)(\w+)', line) - - if field_match: - type = field_match.group(3) - name = field_match.group(5) - elif is_const: - field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*)(\w+)', line) - - if field_match: - type = field_match.group(4) - name = field_match.group(6) - else: - if is_scoped: - field_match = re.match(r'\s*(\w+\<*\w*\>*)\s*::\s*(\w+\<*\w*\>*)\s+(\w+)', line) - - if field_match: - type = field_match.group(1) + '::' + field_match.group(2) - name = field_match.group(3) - else: - field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)', line) - - if field_match: - type = field_match.group(2) - name = field_match.group(4) - - if is_llvm_typedef is 
False: - if type is not None: - type_entry['members'].append( - gen_llvm_type( - type, name, idx+1, is_pointer, is_pointer_pointer, is_array, is_array_array, - array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file)) - - # Detect end of structure - end_of_struct = re.match(r'(\s*)};', line) - - if end_of_struct: - types.append(type_entry) - - cur_dir = os.path.dirname(os.path.abspath(__file__)) - template = os.path.join(cur_dir, 'templates', 'gen_llvm.hpp') - - MakoTemplateWriter.to_file( - template, - output_file, - cmdline=sys.argv, - filename=os.path.basename(output_file), - types=types, - input_dir=os.path.dirname(input_file.name), - input_file=os.path.basename(input_file.name)) - -''' - Function which is invoked when this script is started from a command line. - Will present and consume a set of arguments which will tell this script how - to behave -''' -def main(): - - # Parse args... - parser = ArgumentParser() - parser.add_argument('--input', '-i', type=FileType('r'), - help='Path to input file containing structs', required=True) - parser.add_argument('--output', '-o', action='store', - help='Path to output file', required=True) - args = parser.parse_args() - - final_output_dir = os.path.dirname(args.output) - if MakeDir(final_output_dir): - return 1 - - final_output_file = args.output - - tmp_dir = MakeTmpDir('_codegen') - args.output = os.path.join(tmp_dir, os.path.basename(args.output)) - - rval = 0 - try: - gen_llvm_types(args.input, args.output) - - rval = CopyFileIfDifferent(args.output, final_output_file) - except: - print('ERROR: Could not generate llvm types', file=sys.stderr) - rval = 1 - - finally: - DeleteDirTree(tmp_dir) - - return rval - -if __name__ == '__main__': - sys.exit(main()) -# END OF FILE diff --git a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py deleted file mode 100644 index 75eae353ae1..00000000000 --- 
a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py +++ /dev/null @@ -1,383 +0,0 @@ -# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice (including the next -# paragraph) shall be included in all copies or substantial portions of the -# Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. 
-import sys - -# Python source -KNOBS = [ - - ['ENABLE_ASSERT_DIALOGS', { - 'type' : 'bool', - 'default' : 'true', - 'desc' : ['Use dialogs when asserts fire.', - 'Asserts are only enabled in debug builds'], - 'category' : 'debug', - }], - - ['SINGLE_THREADED', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['If enabled will perform all rendering on the API thread.', - 'This is useful mainly for debugging purposes.'], - 'category' : 'debug', - }], - - ['DUMP_SHADER_IR', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'], - 'category' : 'debug', - }], - - ['USE_GENERIC_STORETILE', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Always use generic function for performing StoreTile.', - 'Will be slightly slower than using optimized (jitted) path'], - 'category' : 'debug_adv', - }], - - ['FAST_CLEAR', { - 'type' : 'bool', - 'default' : 'true', - 'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and', - 'defer clear execution to first backend op on hottile, or hottile store'], - 'category' : 'perf_adv', - }], - - ['MAX_NUMA_NODES', { - 'type' : 'uint32_t', - 'default' : '1' if sys.platform == 'win32' else '0', - 'desc' : ['Maximum # of NUMA-nodes per system used for worker threads', - ' 0 == ALL NUMA-nodes in the system', - ' N == Use at most N NUMA-nodes for rendering'], - 'category' : 'perf', - }], - - ['MAX_CORES_PER_NUMA_NODE', { - 'type' : 'uint32_t', - 'default' : '0', - 'desc' : ['Maximum # of cores per NUMA-node used for worker threads.', - ' 0 == ALL non-API thread cores per NUMA-node', - ' N == Use at most N cores per NUMA-node'], - 'category' : 'perf', - }], - - ['MAX_THREADS_PER_CORE', { - 'type' : 'uint32_t', - 'default' : '1', - 'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.', - ' 0 == ALL hyper-threads per core', - ' N == Use at most N hyper-threads per physical core'], - 'category' : 'perf', - }], - - 
['MAX_WORKER_THREADS', { - 'type' : 'uint32_t', - 'default' : '0', - 'desc' : ['Maximum worker threads to spawn.', - '', - 'IMPORTANT: If this is non-zero, no worker threads will be bound to', - 'specific HW threads. They will all be "floating" SW threads.', - 'In this case, the above 3 KNOBS will be ignored.'], - 'category' : 'perf', - }], - - ['BASE_NUMA_NODE', { - 'type' : 'uint32_t', - 'default' : '0', - 'desc' : ['Starting NUMA node index to use when allocating compute resources.', - 'Setting this to a non-zero value will reduce the maximum # of NUMA nodes used.'], - 'category' : 'perf', - }], - - ['BASE_CORE', { - 'type' : 'uint32_t', - 'default' : '0', - 'desc' : ['Starting core index to use when allocating compute resources.', - 'Setting this to a non-zero value will reduce the maximum # of cores used.'], - 'category' : 'perf', - }], - - ['BASE_THREAD', { - 'type' : 'uint32_t', - 'default' : '0', - 'desc' : ['Starting thread index to use when allocating compute resources.', - 'Setting this to a non-zero value will reduce the maximum # of threads used.'], - 'category' : 'perf', - }], - - ['BUCKETS_START_FRAME', { - 'type' : 'uint32_t', - 'default' : '1200', - 'desc' : ['Frame from when to start saving buckets data.', - '', - 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', - 'for this to have an effect.'], - 'category' : 'perf_adv', - }], - - ['BUCKETS_END_FRAME', { - 'type' : 'uint32_t', - 'default' : '1400', - 'desc' : ['Frame at which to stop saving buckets data.', - '', - 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', - 'for this to have an effect.'], - 'category' : 'perf_adv', - }], - - ['WORKER_SPIN_LOOP_COUNT', { - 'type' : 'uint32_t', - 'default' : '5000', - 'desc' : ['Number of spin-loop iterations worker threads will perform', - 'before going to sleep when waiting for work'], - 'category' : 'perf_adv', - }], - - ['MAX_DRAWS_IN_FLIGHT', { - 'type' : 'uint32_t', - 'default' : '256', - 'desc' : ['Maximum number of draws 
outstanding before API thread blocks.', - 'This value MUST be evenly divisible into 2^32'], - 'category' : 'perf_adv', - }], - - ['MAX_PRIMS_PER_DRAW', { - 'type' : 'uint32_t', - 'default' : '49152', - 'desc' : ['Maximum primitives in a single Draw().', - 'Larger primitives are split into smaller Draw calls.', - 'Should be a multiple of (3 * vectorWidth).'], - 'category' : 'perf_adv', - }], - - ['MAX_TESS_PRIMS_PER_DRAW', { - 'type' : 'uint32_t', - 'default' : '16', - 'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.', - 'Larger primitives are split into smaller Draw calls.', - 'Should be a multiple of (vectorWidth).'], - 'category' : 'perf_adv', - }], - - - ['DEBUG_OUTPUT_DIR', { - 'type' : 'std::string', - 'default' : r'%TEMP%\Rast\DebugOutput' if sys.platform == 'win32' else '/tmp/Rast/DebugOutput', - 'desc' : ['Output directory for debug data.'], - 'category' : 'debug', - }], - - ['JIT_ENABLE_CACHE', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Enables caching of compiled shaders'], - 'category' : 'debug_adv', - }], - - ['JIT_OPTIMIZATION_LEVEL', { - 'type' : 'int', - 'default' : '-1', - 'desc' : ['JIT compile optimization level:',], - 'category' : 'debug', - 'control' : 'dropdown', - 'choices' : [ - { - 'name' : 'Automatic', - 'desc' : 'Automatic based on other KNOB and build settings', - 'value' : -1, - }, - { - 'name' : 'Debug', - 'desc' : 'No optimization: -O0', - 'value' : 0, - }, - { - 'name' : 'Less', - 'desc' : 'Some optimization: -O1', - 'value' : 1, - }, - { - 'name' : 'Optimize', - 'desc' : 'Default Clang / LLVM optimizations: -O2', - 'value' : 2, - }, - { - 'name' : 'Aggressive', - 'desc' : 'Maximum optimization: -O3', - 'value' : 3, - }, - ], - }], - - ['JIT_CACHE_DIR', { - 'type' : 'std::string', - 'default' : r'%TEMP%\SWR\JitCache' if sys.platform == 'win32' else '${HOME}/.swr/jitcache', - 'desc' : ['Cache directory for compiled shaders.'], - 'category' : 'debug', - }], - - ['TOSS_DRAW', { - 'type' : 'bool', 
- 'default' : 'false', - 'desc' : ['Disable per-draw/dispatch execution'], - 'category' : 'perf', - }], - - ['TOSS_QUEUE_FE', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Stop per-draw execution at worker FE', - '', - 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], - 'category' : 'perf_adv', - }], - - ['TOSS_FETCH', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Stop per-draw execution at vertex fetch', - '', - 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], - 'category' : 'perf_adv', - }], - - ['TOSS_IA', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Stop per-draw execution at input assembler', - '', - 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], - 'category' : 'perf_adv', - }], - - ['TOSS_VS', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Stop per-draw execution at vertex shader', - '', - 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], - 'category' : 'perf_adv', - }], - - ['TOSS_SETUP_TRIS', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Stop per-draw execution at primitive setup', - '', - 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], - 'category' : 'perf_adv', - }], - - ['TOSS_BIN_TRIS', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Stop per-draw execution at primitive binning', - '', - 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], - 'category' : 'perf_adv', - }], - - ['TOSS_RS', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Stop per-draw execution at rasterizer', - '', - 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], - 'category' : 'perf_adv', - }], - - ['DISABLE_SPLIT_DRAW', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Don\'t split large draws into smaller draws.,', - 'MAX_PRIMS_PER_DRAW and MAX_TESS_PRIMS_PER_DRAW can be used to control split size.', - '', - 'Useful to disable split draws for 
gathering archrast stats.'], - 'category' : 'perf_adv', - }], - - ['AR_ENABLE_PIPELINE_STATS', { - 'type' : 'bool', - 'default' : 'true', - 'desc' : ['Enable pipeline stats when using Archrast'], - 'category' : 'archrast', - }], - - ['AR_ENABLE_SHADER_STATS', { - 'type' : 'bool', - 'default' : 'true', - 'desc' : ['Enable shader stats when using Archrast'], - 'category' : 'archrast', - }], - - ['AR_ENABLE_SWTAG_DATA', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Enable SWTag data when using Archrast'], - 'category' : 'archrast', - }], - - ['AR_ENABLE_SWR_EVENTS', { - 'type' : 'bool', - 'default' : 'true', - 'desc' : ['Enable internal SWR events when using Archrast'], - 'category' : 'archrast', - }], - - ['AR_ENABLE_PIPELINE_EVENTS', { - 'type' : 'bool', - 'default' : 'true', - 'desc' : ['Enable pipeline events when using Archrast'], - 'category' : 'archrast', - }], - - ['AR_ENABLE_SHADER_EVENTS', { - 'type' : 'bool', - 'default' : 'true', - 'desc' : ['Enable shader events when using Archrast'], - 'category' : 'archrast', - }], - - ['AR_ENABLE_SWTAG_EVENTS', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Enable SWTag events when using Archrast'], - 'category' : 'archrast', - }], - - ['AR_ENABLE_MEMORY_EVENTS', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Enable memory events when using Archrast'], - 'category' : 'archrast', - }], - - ['AR_MEM_SET_BYTE_GRANULARITY', { - 'type' : 'uint32_t', - 'default' : '64', - 'desc' : ['Granularity and alignment of tracking of memory accesses', - 'ONLY ACTIVE UNDER ArchRast.'], - 'category' : 'archrast', - }], - - - ] diff --git a/src/gallium/drivers/swr/rasterizer/codegen/meson.build b/src/gallium/drivers/swr/rasterizer/codegen/meson.build deleted file mode 100644 index daf79ed4c26..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/meson.build +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright © 2017-2018 Intel Corporation - -# Permission is hereby granted, free of charge, to any person 
obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -gen_knobs_cpp = custom_target( - 'gen_knobs.cpp', - input : ['gen_knobs.py'], - output : 'gen_knobs.cpp', - command : [prog_python, '@INPUT0@', '--output', '@OUTPUT@', '--gen_cpp'], - depend_files : files( - 'knob_defs.py', 'gen_common.py', - 'templates/gen_knobs.cpp', - ), -) - -gen_knobs_h = custom_target( - 'gen_knobs.h', - input : ['gen_knobs.py'], - output : 'gen_knobs.h', - command : [prog_python, '@INPUT0@', '--output', '@OUTPUT@', '--gen_h'], - depend_files : files( - 'knob_defs.py', 'gen_common.py', - 'templates/gen_knobs.h', - ), -) - - -# The generators above this are needed individually, while the below generators -# are all inputs to the same lib, so they don't need unique names. 
-files_swr_common += [ - gen_builder_hpp, gen_builder_meta_hpp, gen_knobs_h, gen_knobs_cpp -] - -foreach x : [[swr_context_files, 'gen_swr_context_llvm.h'], - [swr_state_files, 'gen_state_llvm.h'], - [swr_surf_state_files, 'gen_surf_state_llvm.h']] - files_swr_common += custom_target( - x[1], - input : ['gen_llvm_types.py', x[0]], - output : x[1], - command : [prog_python, '@INPUT0@', '--input', '@INPUT1@', '--output', '@OUTPUT@'], - depend_files : files( - 'templates/gen_llvm.hpp', - 'gen_common.py', - ), - ) -endforeach - -ar_output_filenames = ['gen_ar_event.hpp', 'gen_ar_event.cpp', 'gen_ar_eventhandler.hpp', 'gen_ar_eventhandlerfile.hpp'] -ar_template_filenames = [] -foreach fname : ar_output_filenames - ar_template_filenames += join_paths('templates', fname) -endforeach - -files_swr_common += custom_target( - 'gen_archrast', - input : ['gen_archrast.py', swr_event_proto_files, swr_event_pproto_files], - output : ar_output_filenames, - command : [prog_python, '@INPUT0@', '--proto', '@INPUT1@', '@INPUT2@', '--output-dir', meson.current_build_dir()], - depend_files : files('gen_common.py', ar_template_filenames) -) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp deleted file mode 100644 index e73a8110ee1..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file ${filename} - * - * @brief Implementation for events. 
auto-generated file - * - * DO NOT EDIT - * - * Generation Command Line: - * ${'\n * '.join(cmdline)} - * - ******************************************************************************/ -// clang-format off -#include "common/os.h" -#include "gen_ar_event.hpp" -#include "gen_ar_eventhandler.hpp" - -using namespace ArchRast; - -<% sorted_groups = sorted(protos['events']['groups']) %> -% for group in sorted_groups: -% for event_key in protos['events']['groups'][group]: -<% - event = protos['events']['defs'][event_key] -%> -void ${event['name']}::Accept(EventHandler* pHandler) const -{ - pHandler->Handle(*this); -} -% endfor -% endfor - - -// clan-format on - diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp deleted file mode 100644 index 3ef99da2249..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp +++ /dev/null @@ -1,168 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file ${filename} - * - * @brief Definitions for events. auto-generated file - * - * DO NOT EDIT - * - * Generation Command Line: - * ${'\n * '.join(cmdline)} - * - ******************************************************************************/ -// clang-format off -#pragma once - -#include "common/os.h" -#include "core/state.h" - -<% - always_enabled_knob_groups = ['Framework', 'SWTagFramework', 'ApiSwr'] - group_knob_remap_table = { - "ShaderStats": "KNOB_AR_ENABLE_SHADER_STATS", - "PipelineStats" : "KNOB_AR_ENABLE_PIPELINE_STATS", - "SWTagData" : "KNOB_AR_ENABLE_SWTAG_DATA", - } -%> -namespace ArchRast -{ -<% sorted_enums = sorted(protos['enums']['defs']) %> -% for name in sorted_enums: - enum ${name} - {<% names = protos['enums']['defs'][name]['names'] %> - % for i in range(len(names)): - ${names[i].lstrip()} - % endfor - }; -% endfor - - // Forward decl - class EventHandler; - - ////////////////////////////////////////////////////////////////////////// - /// Event - interface for handling events. 
- ////////////////////////////////////////////////////////////////////////// - struct Event - { - const uint32_t eventId = {0xFFFFFFFF}; - Event() {} - virtual ~Event() {} - - virtual bool IsEnabled() const { return true; }; - virtual const uint32_t GetEventId() const = 0; - virtual void Accept(EventHandler* pHandler) const = 0; - }; - -<% sorted_groups = sorted(protos['events']['groups']) %> -% for group in sorted_groups: - % for event_key in protos['events']['groups'][group]: -<% - event = protos['events']['defs'][event_key] -%> - ////////////////////////////////////////////////////////////////////////// - /// ${event_key}Data - ////////////////////////////////////////////////////////////////////////// -#pragma pack(push, 1) - struct ${event['name']}Data - {<% - fields = event['fields'] %> - // Fields - % for i in range(len(fields)): - % if fields[i]['size'] > 1: - ${fields[i]['type']} ${fields[i]['name']}[${fields[i]['size']}]; - % else: - ${fields[i]['type']} ${fields[i]['name']}; - % endif - % endfor - }; -#pragma pack(pop) - - ////////////////////////////////////////////////////////////////////////// - /// ${event_key} - ////////////////////////////////////////////////////////////////////////// - struct ${event['name']} : Event - {<% - fields = event['fields'] %> - const uint32_t eventId = {${ event['id'] }}; - ${event['name']}Data data; - - // Constructor - ${event['name']}( - % for i in range(len(fields)): - % if i < len(fields)-1: - % if fields[i]['size'] > 1: - ${fields[i]['type']}* ${fields[i]['name']}, - uint32_t ${fields[i]['name']}_size, - % else: - ${fields[i]['type']} ${fields[i]['name']}, - % endif - % endif - % if i == len(fields)-1: - % if fields[i]['size'] > 1: - ${fields[i]['type']}* ${fields[i]['name']}, - uint32_t ${fields[i]['name']}_size - % else: - ${fields[i]['type']} ${fields[i]['name']} - % endif - % endif - % endfor - ) - { - % for i in range(len(fields)): - % if fields[i]['size'] > 1: - % if fields[i]['type'] == 'char': - // Copy size 
of string (null-terminated) followed by string into entire buffer - SWR_ASSERT(${fields[i]['name']}_size + 1 < ${fields[i]['size']} - sizeof(uint32_t), "String length must be less than size of char buffer - size(uint32_t)!"); - memcpy(data.${fields[i]['name']}, &${fields[i]['name']}_size, sizeof(uint32_t)); - strcpy_s(data.${fields[i]['name']} + sizeof(uint32_t), ${fields[i]['name']}_size + 1, ${fields[i]['name']}); - % else: - memcpy(data.${fields[i]['name']}, ${fields[i]['name']}, ${fields[i]['name']}_size); - % endif - % else: - data.${fields[i]['name']} = ${fields[i]['name']}; - % endif - % endfor - } - - virtual void Accept(EventHandler* pHandler) const; - inline const uint32_t GetEventId() const { return eventId; } - % if group not in always_enabled_knob_groups: - <% - if group in group_knob_remap_table: - group_knob_define = group_knob_remap_table[group] - else: - group_knob_define = 'KNOB_AR_ENABLE_' + group.upper() + '_EVENTS' - %> - bool IsEnabled() const - { - static const bool IsEventEnabled = true; // TODO: Replace with knob for each event - return ${group_knob_define} && IsEventEnabled; - } - % endif - }; - - % endfor - -% endfor -} // namespace ArchRast -// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp deleted file mode 100644 index d3e82e8a4ee..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file ${filename} - * - * @brief Event handler interface. auto-generated file - * - * DO NOT EDIT - * - * Generation Command Line: - * ${'\n * '.join(cmdline)} - * - ******************************************************************************/ -// clang-format on -#pragma once - -#include "${event_header}" - -namespace ArchRast -{ - ////////////////////////////////////////////////////////////////////////// - /// EventHandler - interface for handling events. 
- ////////////////////////////////////////////////////////////////////////// - class EventHandler - { - public: - EventHandler() {} - virtual ~EventHandler() {} - - virtual void FlushDraw(uint32_t drawId) {} - -<% sorted_groups = sorted(protos['events']['groups']) %> -% for group in sorted_groups: -% for event_key in protos['events']['groups'][group]: -<% - event = protos['events']['defs'][event_key] -%> virtual void Handle(const ${event['name']}& event) {} -% endfor -% endfor - }; -} // namespace ArchRast -// clan-format off diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp deleted file mode 100644 index ba5a51700f3..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp +++ /dev/null @@ -1,174 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file ${filename} - * - * @brief Event handler interface. auto-generated file - * - * DO NOT EDIT - * - * Generation Command Line: - * ${'\n * '.join(cmdline)} - * - ******************************************************************************/ -// clang-format off -#pragma once - -#include "common/os.h" -#include "${event_header}" -#include <fstream> -#include <sstream> -#include <iostream> -#include <thread> - -namespace ArchRast -{ - ////////////////////////////////////////////////////////////////////////// - /// EventHandlerFile - interface for handling events. - ////////////////////////////////////////////////////////////////////////// - class EventHandlerFile : public EventHandler - { - public: - EventHandlerFile(uint32_t id) : mBufOffset(0) - { -#if defined(_WIN32) - DWORD pid = GetCurrentProcessId(); - TCHAR procname[MAX_PATH]; - GetModuleFileName(NULL, procname, MAX_PATH); - const char* pBaseName = strrchr(procname, '\\'); - std::stringstream outDir; - outDir << KNOB_DEBUG_OUTPUT_DIR << pBaseName << "_" << pid << std::ends; - mOutputDir = outDir.str(); - if (CreateDirectory(mOutputDir.c_str(), NULL)) - { - std::cout << std::endl - << "ArchRast Dir: " << mOutputDir << std::endl - << std::endl - << std::flush; - } - - // There could be multiple threads creating thread pools. We - // want to make sure they are uniquely identified by adding in - // the creator's thread id into the filename. - std::stringstream fstr; - fstr << outDir.str().c_str() << "\\ar_event" << std::this_thread::get_id(); - fstr << "_" << id << ".bin" << std::ends; - mFilename = fstr.str(); -#else - // There could be multiple threads creating thread pools. 
We - // want to make sure they are uniquely identified by adding in - // the creator's thread id into the filename. - std::stringstream fstr; - fstr << "/tmp/ar_event" << std::this_thread::get_id(); - fstr << "_" << id << ".bin" << std::ends; - mFilename = fstr.str(); -#endif - } - - virtual ~EventHandlerFile() { FlushBuffer(); } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Flush buffer to file. - bool FlushBuffer() - { - if (mBufOffset > 0) - { - if (mBufOffset == mHeaderBufOffset) - { - // Nothing to flush. Only header has been generated. - return false; - } - - std::ofstream file; - file.open(mFilename, std::ios::out | std::ios::app | std::ios::binary); - - if (!file.is_open()) - { - SWR_INVALID("ArchRast: Could not open event file!"); - return false; - } - - file.write((char*)mBuffer, mBufOffset); - file.close(); - - mBufOffset = 0; - mHeaderBufOffset = 0; // Reset header offset so its no longer considered. - } - return true; - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Write event and its payload to the memory buffer. - void Write(uint32_t eventId, const char* pBlock, uint32_t size) - { - if ((mBufOffset + size + sizeof(eventId)) > mBufferSize) - { - if (!FlushBuffer()) - { - // Don't corrupt what's already in the buffer? - /// @todo Maybe add corrupt marker to buffer here in case we can open file in - /// future? 
- return; - } - } - - memcpy(&mBuffer[mBufOffset], (char*)&eventId, sizeof(eventId)); - mBufOffset += sizeof(eventId); - memcpy(&mBuffer[mBufOffset], pBlock, size); - mBufOffset += size; - } -<% sorted_groups = sorted(protos['events']['groups']) %> -% for group in sorted_groups: -% for event_key in protos['events']['groups'][group]: -<% - event = protos['events']['defs'][event_key] -%> - ////////////////////////////////////////////////////////////////////////// - /// @brief Handle ${event_key} event - virtual void Handle(const ${event['name']}& event) - { -% if event['num_fields'] == 0: - Write(event.eventId, (char*)&event.data, 0); -% else: - Write(event.eventId, (char*)&event.data, sizeof(event.data)); -% endif - } -% endfor -% endfor - - ////////////////////////////////////////////////////////////////////////// - /// @brief Everything written to buffer this point is the header. - virtual void MarkHeader() - { - mHeaderBufOffset = mBufOffset; - } - - std::string mFilename; - std::string mOutputDir; - - static const uint32_t mBufferSize = 1024; - uint8_t mBuffer[mBufferSize]; - uint32_t mBufOffset{0}; - uint32_t mHeaderBufOffset{0}; - }; -} // namespace ArchRast -// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp deleted file mode 100644 index b8da5298f3d..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp +++ /dev/null @@ -1,42 +0,0 @@ -//============================================================================ -// Copyright (C) 2017 Intel Corporation. All Rights Reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a -// copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice (including the next -// paragraph) shall be included in all copies or substantial portions of the -// Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -// IN THE SOFTWARE. -// -// @file BackendPixelRate${fileNum}.cpp -// -// @brief auto-generated file -// -// DO NOT EDIT -// -// Generation Command Line: -// ${'\n// '.join(cmdline)} -// -//============================================================================ - -#include "core/backend.h" -#include "core/backend_impl.h" - -void InitBackendPixelRate${fileNum}() -{ - %for func in funcList: - ${func} - %endfor -} diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp deleted file mode 100644 index da1ca87620a..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp +++ /dev/null @@ -1,84 +0,0 @@ -//============================================================================ -// Copyright (C) 2014-2020 Intel Corporation. All Rights Reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a -// copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice (including the next -// paragraph) shall be included in all copies or substantial portions of the -// Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -// IN THE SOFTWARE. 
-// -// @file ${filename} -// -// @brief auto-generated file -// -// DO NOT EDIT -// -// Generation Command Line: -// ${'\n// '.join(cmdline)} -// -//============================================================================ -// clang-format off -#pragma once - -//============================================================================ -// Auto-generated ${comment} -//============================================================================ -%for func in functions: -<%argList = ', '.join(func['args'])%>\ -${func['decl']} -{ -%if isX86: - %if len(func['args']) != 0: - SmallVector<Type*, ${len(func['args'])}> argTypes; - %for arg in func['args']: - argTypes.push_back(${arg}->getType()); - %endfor -#if LLVM_VERSION_MAJOR >= 12 - #define VEC_GET_NUM_ELEMS cast<FixedVectorType>(a->getType())->getNumElements() -#elif LLVM_VERSION_MAJOR >= 11 - #define VEC_GET_NUM_ELEMS cast<VectorType>(a->getType())->getNumElements() -#else - #define VEC_GET_NUM_ELEMS a->getType()->getVectorNumElements() -#endif - FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, argTypes, false); - %else: - FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, {}, false); - %endif: -#if LLVM_VERSION_MAJOR >= 9 - Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("meta.intrinsic.${func['name']}", pFuncTy).getCallee()); -#else - Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("meta.intrinsic.${func['name']}", pFuncTy)); -#endif - return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name); -%elif isIntrin: - %if len(func['types']) != 0: - SmallVector<Type*, ${len(func['types'])}> args; - %for arg in func['types']: - args.push_back(${arg}->getType()); - %endfor - Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}, args); - return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name); - %else: - Function* pFunc = 
Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}); - return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name); - %endif -%else: - return IRB()->${func['intrin']}(${argList}); -%endif -} - -% endfor - // clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp deleted file mode 100644 index d0682c55f03..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp +++ /dev/null @@ -1,46 +0,0 @@ -//============================================================================ -// Copyright (C) 2017 Intel Corporation. All Rights Reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a -// copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice (including the next -// paragraph) shall be included in all copies or substantial portions of the -// Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -// IN THE SOFTWARE. 
-// -// @file ${filename} -// -// @brief auto-generated file -// -// DO NOT EDIT -// -// Generation Command Line: -// ${'\n// '.join(cmdline)} -// -//============================================================================ - -// clang-format off - -%for num in range(numFiles): -void Init${tableName}${num}(); -%endfor - -static INLINE void Init${tableName}() -{ - %for num in range(numFiles): - Init${tableName}${num}(); - %endfor -} -// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp deleted file mode 100644 index 194499aa1e0..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp +++ /dev/null @@ -1,143 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file ${filename}.cpp - * - * @brief Dynamic Knobs for Core. - * - * ======================= AUTO GENERATED: DO NOT EDIT !!! ==================== - * - * Generation Command Line: - * ${'\n * '.join(cmdline)} - * - ******************************************************************************/ -// clang-format off -<% calc_max_knob_len(knobs) %> -% for inc in includes: -#include <${inc}> -% endfor -#include <regex> -#include <core/utils.h> - -//======================================================== -// Implementation -//======================================================== -void KnobBase::autoExpandEnvironmentVariables(std::string& text) -{ - size_t start; - while ((start = text.find("${'${'}")) != std::string::npos) - { - size_t end = text.find("}"); - if (end == std::string::npos) - break; - const std::string var = GetEnv(text.substr(start + 2, end - start - 2)); - text.replace(start, end - start + 1, var); - } - // win32 style variable replacement - while ((start = text.find("%")) != std::string::npos) - { - size_t end = text.find("%", start + 1); - if (end == std::string::npos) - break; - const std::string var = GetEnv(text.substr(start + 1, end - start - 1)); - text.replace(start, end - start + 1, var); - } -} - -//======================================================== -// Static Data Members -//======================================================== -% for knob in knobs: -% if knob[1]['type'] == 'std::string': -${knob[1]['type']} GlobalKnobs::Knob_${knob[0]}::m_default = "${repr(knob[1]['default'])[1:-1]}"; -% else: -${knob[1]['type']} GlobalKnobs::Knob_${knob[0]}::m_default = ${knob[1]['default']}; -% endif -% endfor -GlobalKnobs g_GlobalKnobs; - 
-//======================================================== -// Knob Initialization -//======================================================== -GlobalKnobs::GlobalKnobs() -{ - % for knob in knobs : - InitKnob(${ knob[0] }); - % endfor -} - -//======================================================== -// Knob Display (Convert to String) -//======================================================== -std::string GlobalKnobs::ToString(const char* optPerLinePrefix) -{ - std::basic_stringstream<char> str; - str << std::showbase << std::setprecision(1) << std::fixed; - - if (optPerLinePrefix == nullptr) - { - optPerLinePrefix = ""; - } - - % for knob in knobs: - str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}"; - % if knob[1]['type'] == 'bool': - str << (KNOB_${knob[0]} ? "+\n" : "-\n"); - % elif knob[1]['type'] != 'float' and knob[1]['type'] != 'std::string': - str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]}; - str << std::dec << KNOB_${knob[0]} << "\n"; - % else: - str << KNOB_${knob[0]} << "\n"; - % endif - % endfor - str << std::ends; - - return str.str(); -} -<%! 
- # Globally available python - max_len = 0 - def calc_max_knob_len(knobs): - global max_len - max_len = 0 - for knob in knobs: - if len(knob[0]) > max_len: max_len = len(knob[0]) - max_len += len('KNOB_ ') - if max_len % 4: max_len += 4 - (max_len % 4) - - def space_knob(knob): - knob_len = len('KNOB_' + knob) - return ' '*(max_len - knob_len) - - def calc_max_name_len(choices_array): - _max_len = 0 - for choice in choices_array: - if len(choice['name']) > _max_len: _max_len = len(choice['name']) - - if _max_len % 4: _max_len += 4 - (_max_len % 4) - return _max_len - - def space_name(name, max_len): - name_len = len(name) - return ' '*(max_len - name_len) -%> -// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h deleted file mode 100644 index 8b88a11706c..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h +++ /dev/null @@ -1,154 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file ${filename}.h - * - * @brief Dynamic Knobs for Core. - * - * ======================= AUTO GENERATED: DO NOT EDIT !!! ==================== - * - * Generation Command Line: - * ${'\n * '.join(cmdline)} - * - ******************************************************************************/ -// clang-format off -<% calc_max_knob_len(knobs) %> -#pragma once -#include <string> - -struct KnobBase -{ -private: - // Update the input string. - static void autoExpandEnvironmentVariables(std::string& text); - -protected: - // Leave input alone and return new string. - static std::string expandEnvironmentVariables(std::string const& input) - { - std::string text = input; - autoExpandEnvironmentVariables(text); - return text; - } - - template <typename T> - static T expandEnvironmentVariables(T const& input) - { - return input; - } -}; - -template <typename T> -struct Knob : KnobBase -{ -public: - const T& Value() const { return m_Value; } - const T& Value(T const& newValue) - { - m_Value = expandEnvironmentVariables(newValue); - return Value(); - } - -private: - T m_Value; -}; - -#define DEFINE_KNOB(_name, _type) \\ - - struct Knob_##_name : Knob<_type> \\ - - { \\ - - static const char* Name() { return "KNOB_" #_name; } \\ - - static _type DefaultValue() { return (m_default); } \\ - - private: \\ - - static _type m_default; \\ - - } _name; - -#define GET_KNOB(_name) g_GlobalKnobs._name.Value() -#define SET_KNOB(_name, _newValue) g_GlobalKnobs._name.Value(_newValue) - -struct GlobalKnobs -{ - % for knob in knobs: - //----------------------------------------------------------- - // KNOB_${knob[0]} - // - % for line in knob[1]['desc']: - // ${line} - % endfor - % if knob[1].get('choices'): - <% - 
choices = knob[1].get('choices') - _max_len = calc_max_name_len(choices) %>// - % for i in range(len(choices)): - // ${choices[i]['name']}${space_name(choices[i]['name'], _max_len)} = ${format(choices[i]['value'], '#010x')} - % endfor - % endif - // - DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}); - - % endfor - - std::string ToString(const char* optPerLinePrefix=""); - GlobalKnobs(); -}; -extern GlobalKnobs g_GlobalKnobs; - -#undef DEFINE_KNOB - -% for knob in knobs: -#define KNOB_${knob[0]}${space_knob(knob[0])} GET_KNOB(${knob[0]}) -% endfor - -<%! - # Globally available python - max_len = 0 - def calc_max_knob_len(knobs): - global max_len - max_len = 0 - for knob in knobs: - if len(knob[0]) > max_len: max_len = len(knob[0]) - max_len += len('KNOB_ ') - if max_len % 4: max_len += 4 - (max_len % 4) - - def space_knob(knob): - knob_len = len('KNOB_' + knob) - return ' '*(max_len - knob_len) - - def calc_max_name_len(choices_array): - _max_len = 0 - for choice in choices_array: - if len(choice['name']) > _max_len: _max_len = len(choice['name']) - - if _max_len % 4: _max_len += 4 - (_max_len % 4) - return _max_len - - def space_name(name, max_len): - name_len = len(name) - return ' '*(max_len - name_len) -%> -// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp deleted file mode 100644 index 99a3f300bba..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp +++ /dev/null @@ -1,109 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file ${filename} - * - * @brief auto-generated file - * - * DO NOT EDIT - * - * Generation Command Line: - * ${'\n * '.join(cmdline)} - * - ******************************************************************************/ -// clang-format off - -#include <llvm/IR/DerivedTypes.h> - -#pragma once - -namespace SwrJit -{ - using namespace llvm; - -%for type in types: - INLINE static StructType* Gen_${type['name']}(JitManager* pJitMgr) - { - %if needs_ctx(type): - LLVMContext& ctx = pJitMgr->mContext; - - %endif -#if LLVM_VERSION_MAJOR >= 12 - StructType* pRetType = StructType::getTypeByName(pJitMgr->mContext, "${type['name']}"); -#else - StructType* pRetType = pJitMgr->mpCurrentModule->getTypeByName("${type['name']}"); -#endif - if (pRetType == nullptr) - { - std::vector<Type*> members =<% (max_type_len, max_name_len) = calc_max_len(type['members']) %> - { - %for member in type['members']: - /* ${member['name']} ${pad(len(member['name']), max_name_len)}*/ ${member['type']}, - %endfor - }; - - pRetType = StructType::create(members, "${type['name']}", false); - - // Compute debug metadata - llvm::DIBuilder builder(*pJitMgr->mpCurrentModule); - llvm::DIFile* pFile = builder.createFile("${input_file}", "${os.path.normpath(input_dir).replace('\\', '/')}"); - - std::vector<std::pair<std::string, uint32_t>> dbgMembers = - { - %for member in type['members']: - std::make_pair("${member['name']}", ${pad(len(member['name']), max_name_len)}${member['lineNum']}), - %endfor - }; - pJitMgr->CreateDebugStructType(pRetType, "${type['name']}", pFile, ${type['lineNum']}, dbgMembers); - } - - return pRetType; - } - - %for member in type['members']: - static const uint32_t ${type['name']}_${member['name']} ${pad(len(member['name']), max_name_len)}= ${loop.index}; - %endfor - -%endfor -} // namespace SwrJit - -<%! 
# Global function definitions - import os - def needs_ctx(struct_type): - for m in struct_type.get('members', []): - if '(ctx)' in m.get('type', ''): - return True - return False - - def calc_max_len(fields): - max_type_len = 0 - max_name_len = 0 - for f in fields: - if len(f['type']) > max_type_len: max_type_len = len(f['type']) - if len(f['name']) > max_name_len: max_name_len = len(f['name']) - return (max_type_len, max_name_len) - - def pad(cur_len, max_len): - pad_amt = max_len - cur_len - return ' '*pad_amt -%> -// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp deleted file mode 100644 index 92e0f406235..00000000000 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp +++ /dev/null @@ -1,44 +0,0 @@ -//============================================================================ -// Copyright (C) 2017 Intel Corporation. All Rights Reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a -// copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice (including the next -// paragraph) shall be included in all copies or substantial portions of the -// Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -// IN THE SOFTWARE. -// -// @file gen_rasterizer${fileNum}.cpp -// -// @brief auto-generated file -// -// DO NOT EDIT -// -// Generation Command Line: -// ${'\n// '.join(cmdline)} -// -//============================================================================ -// clang-format off - -#include "core/rasterizer.h" -#include "core/rasterizer_impl.h" - -void InitRasterizerFuncs${fileNum}() -{ - %for func in funcList: - ${func} - %endfor -} -// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp b/src/gallium/drivers/swr/rasterizer/common/formats.cpp deleted file mode 100644 index e0800f5e88e..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/formats.cpp +++ /dev/null @@ -1,9298 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file formats.cpp - * - * @brief auto-generated file - * - * DO NOT EDIT - * - ******************************************************************************/ - -#include "formats.h" - -// lookup table for unorm8 srgb -> float conversion -const uint32_t srgb8Table[256] = { - 0x00000000, 0x399f22b4, 0x3a1f22b4, 0x3a6eb40f, 0x3a9f22b4, 0x3ac6eb61, 0x3aeeb40f, 0x3b0b3e5e, - 0x3b1f22b4, 0x3b33070b, 0x3b46eb61, 0x3b5b518d, 0x3b70f18d, 0x3b83e1c6, 0x3b8fe616, 0x3b9c87fd, - 0x3ba9c9b5, 0x3bb7ad6f, 0x3bc63549, 0x3bd5635f, 0x3be539c1, 0x3bf5ba70, 0x3c0373b5, 0x3c0c6152, - 0x3c15a703, 0x3c1f45be, 0x3c293e6b, 0x3c3391f7, 0x3c3e4149, 0x3c494d43, 0x3c54b6c7, 0x3c607eb1, - 0x3c6ca5dc, 0x3c792d22, 0x3c830aa8, 0x3c89af9f, 0x3c9085db, 0x3c978dc5, 0x3c9ec7c0, 0x3ca63431, - 0x3cadd37d, 0x3cb5a601, 0x3cbdac20, 0x3cc5e639, 0x3cce54ab, 0x3cd6f7d3, 0x3cdfd00e, 0x3ce8ddb9, - 0x3cf22131, 0x3cfb9ac6, 0x3d02a56c, 0x3d0798df, 0x3d0ca7e7, 0x3d11d2b0, 0x3d171965, 0x3d1c7c31, - 0x3d21fb3c, 0x3d2796b2, 0x3d2d4ebe, 0x3d332384, 0x3d39152e, 0x3d3f23e6, 0x3d454fd4, 0x3d4b991f, - 0x3d51ffef, 0x3d58846a, 0x3d5f26b7, 0x3d65e6fe, 0x3d6cc564, 0x3d73c20f, 0x3d7add25, 0x3d810b66, - 0x3d84b795, 0x3d887330, 0x3d8c3e4a, 0x3d9018f6, 0x3d940345, 0x3d97fd4a, 0x3d9c0716, 0x3da020bb, - 0x3da44a4b, 0x3da883d7, 0x3daccd70, 0x3db12728, 0x3db59110, 0x3dba0b38, 0x3dbe95b5, 0x3dc33092, - 0x3dc7dbe2, 0x3dcc97b6, 0x3dd1641f, 0x3dd6412c, 0x3ddb2eef, 0x3de02d77, 0x3de53cd5, 0x3dea5d19, - 0x3def8e55, 0x3df4d093, 0x3dfa23e8, 0x3dff8861, 0x3e027f07, 0x3e054282, 0x3e080ea5, 0x3e0ae379, - 0x3e0dc107, 0x3e10a755, 0x3e13966c, 0x3e168e53, 0x3e198f11, 0x3e1c98ae, 0x3e1fab32, 0x3e22c6a3, - 0x3e25eb09, 0x3e29186c, 0x3e2c4ed2, 0x3e2f8e45, 0x3e32d6c8, 
0x3e362865, 0x3e398322, 0x3e3ce706, - 0x3e405419, 0x3e43ca62, 0x3e4749e8, 0x3e4ad2b1, 0x3e4e64c6, 0x3e52002b, 0x3e55a4e9, 0x3e595307, - 0x3e5d0a8b, 0x3e60cb7c, 0x3e6495e0, 0x3e6869bf, 0x3e6c4720, 0x3e702e08, 0x3e741e7f, 0x3e78188c, - 0x3e7c1c38, 0x3e8014c2, 0x3e82203c, 0x3e84308d, 0x3e8645ba, 0x3e885fc5, 0x3e8a7eb2, 0x3e8ca283, - 0x3e8ecb3d, 0x3e90f8e1, 0x3e932b74, 0x3e9562f8, 0x3e979f71, 0x3e99e0e2, 0x3e9c274e, 0x3e9e72b7, - 0x3ea0c322, 0x3ea31892, 0x3ea57308, 0x3ea7d289, 0x3eaa3718, 0x3eaca0b7, 0x3eaf0f69, 0x3eb18333, - 0x3eb3fc16, 0x3eb67a15, 0x3eb8fd34, 0x3ebb8576, 0x3ebe12e1, 0x3ec0a571, 0x3ec33d2d, 0x3ec5da17, - 0x3ec87c33, 0x3ecb2383, 0x3ecdd00b, 0x3ed081cd, 0x3ed338cc, 0x3ed5f50b, 0x3ed8b68d, 0x3edb7d54, - 0x3ede4965, 0x3ee11ac1, 0x3ee3f16b, 0x3ee6cd67, 0x3ee9aeb6, 0x3eec955d, 0x3eef815d, 0x3ef272ba, - 0x3ef56976, 0x3ef86594, 0x3efb6717, 0x3efe6e02, 0x3f00bd2b, 0x3f02460c, 0x3f03d1a5, 0x3f055ff8, - 0x3f06f106, 0x3f0884cf, 0x3f0a1b57, 0x3f0bb49d, 0x3f0d50a2, 0x3f0eef69, 0x3f1090f2, 0x3f123540, - 0x3f13dc53, 0x3f15862d, 0x3f1732cf, 0x3f18e23b, 0x3f1a9471, 0x3f1c4973, 0x3f1e0143, 0x3f1fbbe1, - 0x3f217950, 0x3f23398f, 0x3f24fca2, 0x3f26c288, 0x3f288b43, 0x3f2a56d5, 0x3f2c253f, 0x3f2df681, - 0x3f2fca9e, 0x3f31a197, 0x3f337b6c, 0x3f355820, 0x3f3737b3, 0x3f391a26, 0x3f3aff7e, 0x3f3ce7b7, - 0x3f3ed2d4, 0x3f40c0d6, 0x3f42b1c0, 0x3f44a592, 0x3f469c4d, 0x3f4895f3, 0x3f4a9284, 0x3f4c9203, - 0x3f4e9470, 0x3f5099cd, 0x3f52a21a, 0x3f54ad59, 0x3f56bb8c, 0x3f58ccb3, 0x3f5ae0cf, 0x3f5cf7e2, - 0x3f5f11ee, 0x3f612ef2, 0x3f634eef, 0x3f6571ec, 0x3f6797e1, 0x3f69c0d8, 0x3f6beccb, 0x3f6e1bc2, - 0x3f704db6, 0x3f7282b1, 0x3f74baae, 0x3f76f5b3, 0x3f7933b9, 0x3f7b74cb, 0x3f7db8e0, 0x3f800000, -}; - -// order must match SWR_FORMAT -const SWR_FORMAT_INFO gFormatInfo[] = { - - // R32G32B32A32_FLOAT (0x0) - { - "R32G32B32A32_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - 
{32, 32, 32, 32}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32G32B32A32_SINT (0x1) - { - "R32G32B32A32_SINT", - {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {32, 32, 32, 32}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32G32B32A32_UINT (0x2) - { - "R32G32B32A32_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {32, 32, 32, 32}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x3) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x4) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R64G64_FLOAT (0x5) - { - "R64G64_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {64, 64, 0, 0}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32G32B32X32_FLOAT (0x6) - { - "R32G32B32X32_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {32, 32, 32, 32}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32G32B32A32_SSCALED (0x7) - { - "R32G32B32A32_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {32, 32, 32, 32}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32G32B32A32_USCALED (0x8) - { - "R32G32B32A32_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {32, 32, 32, 32}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x9) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xA) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xB) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xC) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xD) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xE) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xF) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - 
{0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x10) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x11) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x12) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x13) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x14) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x15) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x16) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding 
(0x17) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x18) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x19) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1A) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1D) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1E) - {nullptr, - {SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R32G32B32A32_SFIXED (0x20) - { - "R32G32B32A32_SFIXED", - {SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {32, 32, 32, 32}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x21) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x22) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x23) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x24) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x25) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x26) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x27) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x28) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x29) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x2A) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x2B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 
{0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x2C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x2D) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x2E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x2F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x30) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x31) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x32) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - 
false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x33) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x34) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x35) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x36) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x37) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x38) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x39) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - 
{0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x3A) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x3B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x3C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x3D) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x3E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x3F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R32G32B32_FLOAT (0x40) - { - "R32G32B32_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {32, 32, 32, 0}, // Bits per component - 96, // Bits per element - 12, // Bytes 
per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32G32B32_SINT (0x41) - { - "R32G32B32_SINT", - {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {32, 32, 32, 0}, // Bits per component - 96, // Bits per element - 12, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32G32B32_UINT (0x42) - { - "R32G32B32_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {32, 32, 32, 0}, // Bits per component - 96, // Bits per element - 12, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x43) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x44) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R32G32B32_SSCALED (0x45) - { - "R32G32B32_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {32, 32, 32, 0}, // Bits per component - 96, // Bits per element - 12, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32G32B32_USCALED (0x46) - { - "R32G32B32_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {32, 32, 32, 0}, // Bits per component - 96, // Bits per element - 12, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x47) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x48) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x49) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x4A) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x4B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x4C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x4D) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - 
{0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x4E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x4F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R32G32B32_SFIXED (0x50) - { - "R32G32B32_SFIXED", - {SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {32, 32, 32, 0}, // Bits per component - 96, // Bits per element - 12, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x51) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x52) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x53) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x54) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x55) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x56) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x57) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - 
{0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x58) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x59) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x5A) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x5B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x5C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x5D) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x5E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding 
(0x5F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x60) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x61) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x62) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x63) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x64) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x65) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x66) - {nullptr, - {SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x67) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x68) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x69) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x6A) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x6B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x6C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x6D) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x6E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x6F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x70) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x71) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x72) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x73) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x74) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 
{0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x75) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x76) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x77) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x78) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x79) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x7A) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x7B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - 
false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x7C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x7D) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x7E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x7F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R16G16B16A16_UNORM (0x80) - { - "R16G16B16A16_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {16, 16, 16, 16}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? 
- {1.0f / 65535.0f, - 1.0f / 65535.0f, - 1.0f / 65535.0f, - 1.0f / 65535.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16B16A16_SNORM (0x81) - { - "R16G16B16A16_SNORM", - {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {16, 16, 16, 16}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? - {1.0f / 32767.0f, - 1.0f / 32767.0f, - 1.0f / 32767.0f, - 1.0f / 32767.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16B16A16_SINT (0x82) - { - "R16G16B16A16_SINT", - {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {16, 16, 16, 16}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16B16A16_UINT (0x83) - { - "R16G16B16A16_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {16, 16, 16, 16}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16B16A16_FLOAT (0x84) - { - "R16G16B16A16_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {16, 16, 16, 16}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32G32_FLOAT (0x85) - { - "R32G32_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {32, 32, 0, 0}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32G32_SINT (0x86) - { - "R32G32_SINT", - {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {32, 32, 0, 0}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32G32_UINT (0x87) - { - "R32G32_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {32, 32, 0, 0}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32_FLOAT_X8X24_TYPELESS (0x88) - { - "R32_FLOAT_X8X24_TYPELESS", - {SWR_TYPE_FLOAT, SWR_TYPE_UNUSED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {32, 32, 0, 0}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // X32_TYPELESS_G8X24_UINT (0x89) - { - "X32_TYPELESS_G8X24_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UNUSED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {32, 32, 0, 0}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // L32A32_FLOAT (0x8A) - { - "L32A32_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 3, 0, 0}, // Swizzle - {32, 32, 0, 0}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x8B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x8C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R64_FLOAT (0x8D) - { - "R64_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {64, 0, 0, 0}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16B16X16_UNORM (0x8E) - { - "R16G16B16X16_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {16, 16, 16, 16}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? - {1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16B16X16_FLOAT (0x8F) - { - "R16G16B16X16_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {16, 16, 16, 16}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x90) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // L32X32_FLOAT (0x91) - { - "L32X32_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 3, 0, 0}, // Swizzle - {32, 32, 0, 0}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // I32X32_FLOAT (0x92) - { - "I32X32_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 3, 0, 0}, // Swizzle - {32, 32, 0, 0}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16B16A16_SSCALED (0x93) - { - "R16G16B16A16_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {16, 16, 16, 16}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16B16A16_USCALED (0x94) - { - "R16G16B16A16_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {16, 16, 16, 16}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32G32_SSCALED (0x95) - { - "R32G32_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {32, 32, 0, 0}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32G32_USCALED (0x96) - { - "R32G32_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {32, 32, 0, 0}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x97) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x98) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x99) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x9A) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x9B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x9C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x9D) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - 
{0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x9E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x9F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R32G32_SFIXED (0xA0) - { - "R32G32_SFIXED", - {SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {32, 32, 0, 0}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0xA1) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xA2) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xA3) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xA4) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xA5) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xA6) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xA7) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - 
{0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xA8) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xA9) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xAA) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xAB) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xAC) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xAD) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xAE) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding 
(0xAF) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xB0) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xB1) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xB2) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xB3) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xB4) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xB5) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xB6) - {nullptr, - {SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xB7) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xB8) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xB9) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xBA) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xBB) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xBC) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xBD) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xBE) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xBF) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // B8G8R8A8_UNORM (0xC0) - { - "B8G8R8A8_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? - {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B8G8R8A8_UNORM_SRGB (0xC1) - { - "B8G8R8A8_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? 
- {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R10G10B10A2_UNORM (0xC2) - { - "R10G10B10A2_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? - {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R10G10B10A2_UNORM_SRGB (0xC3) - { - "R10G10B10A2_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? - {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R10G10B10A2_UINT (0xC4) - { - "R10G10B10A2_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0xC5) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xC6) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R8G8B8A8_UNORM (0xC7) - { - "R8G8B8A8_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? - {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8B8A8_UNORM_SRGB (0xC8) - { - "R8G8B8A8_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? 
- {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8B8A8_SNORM (0xC9) - { - "R8G8B8A8_SNORM", - {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? - {1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8B8A8_SINT (0xCA) - { - "R8G8B8A8_SINT", - {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8B8A8_UINT (0xCB) - { - "R8G8B8A8_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16_UNORM (0xCC) - { - "R16G16_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {16, 16, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, false, false}, // Is normalized? - {1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16_SNORM (0xCD) - { - "R16G16_SNORM", - {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {16, 16, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, false, false}, // Is normalized? - {1.0f / 32767.0f, 1.0f / 32767.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16_SINT (0xCE) - { - "R16G16_SINT", - {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {16, 16, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16_UINT (0xCF) - { - "R16G16_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {16, 16, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16_FLOAT (0xD0) - { - "R16G16_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {16, 16, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B10G10R10A2_UNORM (0xD1) - { - "B10G10R10A2_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? 
- {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B10G10R10A2_UNORM_SRGB (0xD2) - { - "B10G10R10A2_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? - {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R11G11B10_FLOAT (0xD3) - { - "R11G11B10_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {11, 11, 10, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0xD4) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - - // R10G10B10_FLOAT_A2_UNORM (0xD5) - { - "R10G10B10_FLOAT_A2_UNORM", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f / 3.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32_SINT (0xD6) - { - "R32_SINT", - {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {32, 0, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32_UINT (0xD7) - { - "R32_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {32, 0, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32_FLOAT (0xD8) - { - "R32_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {32, 0, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R24_UNORM_X8_TYPELESS (0xD9) - { - "R24_UNORM_X8_TYPELESS", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {24, 0, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 16777215.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // X24_TYPELESS_G8_UINT (0xDA) - { - "X24_TYPELESS_G8_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {1, 0, 0, 0}, // Swizzle - {32, 0, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0xDB) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xDC) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // L32_UNORM (0xDD) - { - "L32_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {32, 0, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 4294967295.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0xDE) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // L16A16_UNORM (0xDF) - { - "L16A16_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 3, 0, 0}, // Swizzle - {16, 16, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {true, true, false, false}, // Is normalized? 
- {1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // I24X8_UNORM (0xE0) - { - "I24X8_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 3, 0, 0}, // Swizzle - {24, 8, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {true, true, false, false}, // Is normalized? - {1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // L24X8_UNORM (0xE1) - { - "L24X8_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 3, 0, 0}, // Swizzle - {24, 8, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {true, true, false, false}, // Is normalized? - {1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0xE2) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // I32_FLOAT (0xE3) - { - "I32_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {32, 0, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // L32_FLOAT (0xE4) - { - "L32_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {32, 0, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // A32_FLOAT (0xE5) - { - "A32_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {3, 0, 0, 0}, // Swizzle - {32, 0, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0xE6) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xE7) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xE8) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // B8G8R8X8_UNORM (0xE9) - { - "B8G8R8X8_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? - {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B8G8R8X8_UNORM_SRGB (0xEA) - { - "B8G8R8X8_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? 
- {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8B8X8_UNORM (0xEB) - { - "R8G8B8X8_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? - {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8B8X8_UNORM_SRGB (0xEC) - { - "R8G8B8X8_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? - {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R9G9B9E5_SHAREDEXP (0xED) - { - "R9G9B9E5_SHAREDEXP", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {9, 9, 9, 5}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B10G10R10X2_UNORM (0xEE) - { - "B10G10R10X2_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? - {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0xEF) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // L16A16_FLOAT (0xF0) - { - "L16A16_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 3, 0, 0}, // Swizzle - {16, 16, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0xF1) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xF2) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R10G10B10X2_USCALED (0xF3) - { - "R10G10B10X2_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNUSED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8B8A8_SSCALED (0xF4) - { - "R8G8B8A8_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8B8A8_USCALED (0xF5) - { - "R8G8B8A8_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16_SSCALED (0xF6) - { - "R16G16_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {16, 16, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16_USCALED (0xF7) - { - "R16G16_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {16, 16, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32_SSCALED (0xF8) - { - "R32_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {32, 0, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32_USCALED (0xF9) - { - "R32_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {32, 0, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0xFA) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xFB) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xFC) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xFD) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xFE) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0xFF) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // B5G6R5_UNORM (0x100) - { - "B5G6R5_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 0}, // Swizzle - {5, 6, 5, 0}, // Bits per component - 
16, // Bits per element - 2, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? - {1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B5G6R5_UNORM_SRGB (0x101) - { - "B5G6R5_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 0}, // Swizzle - {5, 6, 5, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 3, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? - {1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B5G5R5A1_UNORM (0x102) - { - "B5G5R5A1_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {5, 5, 5, 1}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? - {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B5G5R5A1_UNORM_SRGB (0x103) - { - "B5G5R5A1_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {5, 5, 5, 1}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? 
- {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B4G4R4A4_UNORM (0x104) - { - "B4G4R4A4_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {4, 4, 4, 4}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? - {1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B4G4R4A4_UNORM_SRGB (0x105) - { - "B4G4R4A4_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {4, 4, 4, 4}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? - {1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8_UNORM (0x106) - { - "R8G8_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {8, 8, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, false, false}, // Is normalized? 
- {1.0f / 255.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8_SNORM (0x107) - { - "R8G8_SNORM", - {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {8, 8, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, false, false}, // Is normalized? - {1.0f / 127.0f, 1.0f / 127.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8_SINT (0x108) - { - "R8G8_SINT", - {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {8, 8, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8_UINT (0x109) - { - "R8G8_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {8, 8, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16_UNORM (0x10A) - { - "R16_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {16, 0, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 65535.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16_SNORM (0x10B) - { - "R16_SNORM", - {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {16, 0, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 32767.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16_SINT (0x10C) - { - "R16_SINT", - {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {16, 0, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16_UINT (0x10D) - { - "R16_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {16, 0, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16_FLOAT (0x10E) - { - "R16_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {16, 0, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x10F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x110) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // I16_UNORM (0x111) - { - "I16_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {16, 0, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 65535.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // L16_UNORM (0x112) - { - "L16_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {16, 0, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {true, false, false, false}, // Is normalized? 
- {1.0f / 65535.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // A16_UNORM (0x113) - { - "A16_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {3, 0, 0, 0}, // Swizzle - {16, 0, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 65535.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // L8A8_UNORM (0x114) - { - "L8A8_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 3, 0, 0}, // Swizzle - {8, 8, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {true, true, false, false}, // Is normalized? - {1.0f / 255.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // I16_FLOAT (0x115) - { - "I16_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {16, 0, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // L16_FLOAT (0x116) - { - "L16_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {16, 0, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // A16_FLOAT (0x117) - { - "A16_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {3, 0, 0, 0}, // Swizzle - {16, 0, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // L8A8_UNORM_SRGB (0x118) - { - "L8A8_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 3, 0, 0}, // Swizzle - {8, 8, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {true, true, false, false}, // Is normalized? 
- {1.0f / 255.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x119) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // B5G5R5X1_UNORM (0x11A) - { - "B5G5R5X1_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {5, 5, 5, 1}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? - {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B5G5R5X1_UNORM_SRGB (0x11B) - { - "B5G5R5X1_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {5, 5, 5, 1}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? - {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8_SSCALED (0x11C) - { - "R8G8_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {8, 8, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8_USCALED (0x11D) - { - "R8G8_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 0, 0}, // Swizzle - {8, 8, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16_SSCALED (0x11E) - { - "R16_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {16, 0, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16_USCALED (0x11F) - { - "R16_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {16, 0, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x120) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x121) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x122) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x123) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // A1B5G5R5_UNORM (0x124) - { - "A1B5G5R5_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {3, 2, 1, 0}, // Swizzle - {1, 5, 5, 5}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? 
- {1.0f / 1.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // A4B4G4R4_UNORM (0x125) - { - "A4B4G4R4_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {3, 2, 1, 0}, // Swizzle - {4, 4, 4, 4}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? - {1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // L8A8_UINT (0x126) - { - "L8A8_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 3, 0, 0}, // Swizzle - {8, 8, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // L8A8_SINT (0x127) - { - "L8A8_SINT", - {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 3, 0, 0}, // Swizzle - {8, 8, 0, 0}, // Bits per component - 16, // Bits per element - 2, // Bytes per element - 2, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x128) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x129) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x12A) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x12B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x12C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x12D) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x12E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, 
false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x12F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x130) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x131) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x132) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x133) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x134) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x135) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, 
- // padding (0x136) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x137) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x138) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x139) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x13A) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x13B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x13C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x13D) - {nullptr, - 
{SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x13E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x13F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R8_UNORM (0x140) - { - "R8_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8_SNORM (0x141) - { - "R8_SNORM", - {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? 
- {1.0f / 127.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8_SINT (0x142) - { - "R8_SINT", - {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8_UINT (0x143) - { - "R8_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // A8_UNORM (0x144) - { - "A8_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {3, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? 
- {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // I8_UNORM (0x145) - { - "I8_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // L8_UNORM (0x146) - { - "L8_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {true, false, false, false}, // Is normalized? 
- {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x147) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x148) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R8_SSCALED (0x149) - { - "R8_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8_USCALED (0x14A) - { - "R8_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x14B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // L8_UNORM_SRGB (0x14C) - { - "L8_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x14D) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x14E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x14F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x150) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 
0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x151) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // L8_UINT (0x152) - { - "L8_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // L8_SINT (0x153) - { - "L8_SINT", - {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // I8_UINT (0x154) - { - "I8_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // I8_SINT (0x155) - { - "I8_SINT", - {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - true, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x156) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x157) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x158) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x159) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x15A) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, 
- // padding (0x15B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x15C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x15D) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x15E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x15F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x160) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x161) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x162) - {nullptr, - 
{SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x163) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x164) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x165) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x166) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x167) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x168) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x169) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x16A) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x16B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x16C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x16D) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x16E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x16F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x170) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 
0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x171) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x172) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x173) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x174) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x175) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x176) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x177) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 
0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x178) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x179) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x17A) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x17B) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x17C) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x17D) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x17E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - 
{false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x17F) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // DXT1_RGB_SRGB (0x180) - { - "DXT1_RGB_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // padding (0x181) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x182) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // YCRCB_SWAPUVY (0x183) - { - "YCRCB_SWAPUVY", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - true, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 2, // bcWidth - 1, // bcHeight - }, - - // padding (0x184) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x185) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // BC1_UNORM (0x186) - { - "BC1_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // BC2_UNORM (0x187) - { - "BC2_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? 
- {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // BC3_UNORM (0x188) - { - "BC3_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // BC4_UNORM (0x189) - { - "BC4_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // BC5_UNORM (0x18A) - { - "BC5_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? 
- {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // BC1_UNORM_SRGB (0x18B) - { - "BC1_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - true, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // BC2_UNORM_SRGB (0x18C) - { - "BC2_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - true, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // BC3_UNORM_SRGB (0x18D) - { - "BC3_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - true, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? 
- {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // padding (0x18E) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // YCRCB_SWAPUV (0x18F) - { - "YCRCB_SWAPUV", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - true, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 2, // bcWidth - 1, // bcHeight - }, - - // padding (0x190) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // DXT1_RGB (0x191) - { - "DXT1_RGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? 
- {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // padding (0x192) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R8G8B8_UNORM (0x193) - { - "R8G8B8_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {8, 8, 8, 0}, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? - {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8B8_SNORM (0x194) - { - "R8G8B8_SNORM", - {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {8, 8, 8, 0}, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? - {1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8B8_SSCALED (0x195) - { - "R8G8B8_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {8, 8, 8, 0}, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8B8_USCALED (0x196) - { - "R8G8B8_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {8, 8, 8, 0}, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R64G64B64A64_FLOAT (0x197) - { - "R64G64B64A64_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {64, 64, 64, 64}, // Bits per component - 256, // Bits per element - 32, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R64G64B64_FLOAT (0x198) - { - "R64G64B64_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {64, 64, 64, 0}, // Bits per component - 192, // Bits per element - 24, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // BC4_SNORM (0x199) - { - "BC4_SNORM", - {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 64, // Bits per element - 8, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 127.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // BC5_SNORM (0x19A) - { - "BC5_SNORM", - {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 127.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // R16G16B16_FLOAT (0x19B) - { - "R16G16B16_FLOAT", - {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {16, 16, 16, 0}, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16B16_UNORM (0x19C) - { - "R16G16B16_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {16, 16, 16, 0}, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? - {1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16B16_SNORM (0x19D) - { - "R16G16B16_SNORM", - {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {16, 16, 16, 0}, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? - {1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16B16_SSCALED (0x19E) - { - "R16G16B16_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {16, 16, 16, 0}, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16B16_USCALED (0x19F) - { - "R16G16B16_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {16, 16, 16, 0}, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x1A0) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // BC6H_SF16 (0x1A1) - { - "BC6H_SF16", - {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 127.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // BC7_UNORM (0x1A2) - { - "BC7_UNORM", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? 
- {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // BC7_UNORM_SRGB (0x1A3) - { - "BC7_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - true, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? - {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // BC6H_UF16 (0x1A4) - { - "BC6H_UF16", - {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 8, 8, 8}, // Bits per component - 128, // Bits per element - 16, // Bytes per element - 1, // Num components - false, // isSRGB - true, // isBC - false, // isSubsampled - false, // isLuminance - {true, false, false, false}, // Is normalized? 
- {1.0f / 255.0f, 0, 0, 0}, // To float scale factor - 4, // bcWidth - 4, // bcHeight - }, - - // padding (0x1A5) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1A6) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1A7) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R8G8B8_UNORM_SRGB (0x1A8) - { - "R8G8B8_UNORM_SRGB", - {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {8, 8, 8, 0}, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - true, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, false}, // Is normalized? 
- {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x1A9) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1AA) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1AB) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1AC) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1AD) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1AE) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1AF) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, 
- {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R16G16B16_UINT (0x1B0) - { - "R16G16B16_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {16, 16, 16, 0}, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R16G16B16_SINT (0x1B1) - { - "R16G16B16_SINT", - {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {16, 16, 16, 0}, // Bits per component - 48, // Bits per element - 6, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R32_SFIXED (0x1B2) - { - "R32_SFIXED", - {SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 0, 0, 0}, // Swizzle - {32, 0, 0, 0}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R10G10B10A2_SNORM (0x1B3) - { - "R10G10B10A2_SNORM", - {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? - {1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R10G10B10A2_USCALED (0x1B4) - { - "R10G10B10A2_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R10G10B10A2_SSCALED (0x1B5) - { - "R10G10B10A2_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R10G10B10A2_SINT (0x1B6) - { - "R10G10B10A2_SINT", - {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B10G10R10A2_SNORM (0x1B7) - { - "B10G10R10A2_SNORM", - {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {true, true, true, true}, // Is normalized? - {1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B10G10R10A2_USCALED (0x1B8) - { - "B10G10R10A2_USCALED", - {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B10G10R10A2_SSCALED (0x1B9) - { - "B10G10R10A2_SSCALED", - {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED}, - {0, 0, 0, 0x3f800000}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B10G10R10A2_UINT (0x1BA) - { - "B10G10R10A2_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT}, - {0, 0, 0, 0x1}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // B10G10R10A2_SINT (0x1BB) - { - "B10G10R10A2_SINT", - {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT}, - {0, 0, 0, 0x1}, // Defaults for missing components - {2, 1, 0, 3}, // Swizzle - {10, 10, 10, 2}, // Bits per component - 32, // Bits per element - 4, // Bytes per element - 4, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 1.0f}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x1BC) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1BD) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1BE) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1BF) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1C0) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1C1) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1C2) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, 
false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1C3) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1C4) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1C5) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1C6) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1C7) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // R8G8B8_UINT (0x1C8) - { - "R8G8B8_UINT", - {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {8, 8, 8, 0}, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // R8G8B8_SINT (0x1C9) - { - "R8G8B8_SINT", - {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 0}, // Swizzle - {8, 8, 8, 0}, // Bits per component - 24, // Bits per element - 3, // Bytes per element - 3, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? - {1.0f, 1.0f, 1.0f, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, - - // padding (0x1CA) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1CB) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1CC) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1CD) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1CE) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 
0.0f}, - 1, - 1}, - // padding (0x1CF) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1D0) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1D1) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1D2) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1D3) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1D4) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1D5) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1D6) - 
{nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1D7) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1D8) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1D9) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1DA) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1DB) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1DC) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1DD) - {nullptr, - {SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1DE) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1DF) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1E0) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1E1) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1E2) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1E3) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1E4) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1E5) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1E6) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1E7) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1E8) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1E9) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1EA) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1EB) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 
0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1EC) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1ED) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1EE) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1EF) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1F0) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1F1) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1F2) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - 
false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1F3) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1F4) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1F5) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1F6) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1F7) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1F8) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1F9) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, 
false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1FA) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1FB) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1FC) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1FD) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // padding (0x1FE) - {nullptr, - {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 0, - 0, - 0, - false, - false, - false, - false, - {false, false, false, false}, - {0.0f, 0.0f, 0.0f, 0.0f}, - 1, - 1}, - // RAW (0x1FF) - { - "RAW", - {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN}, - {0, 0, 0, 0x1}, // Defaults for missing components - {0, 1, 2, 3}, // Swizzle - {8, 0, 0, 0}, // Bits per component - 8, // Bits per element - 1, // Bytes per element - 1, // Num components - false, // isSRGB - false, // isBC - false, // isSubsampled - false, // isLuminance - {false, false, false, false}, // Is normalized? 
- {1.0f, 0, 0, 0}, // To float scale factor - 1, // bcWidth - 1, // bcHeight - }, -}; diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.h b/src/gallium/drivers/swr/rasterizer/common/formats.h deleted file mode 100644 index b7a3e533d15..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/formats.h +++ /dev/null @@ -1,268 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file formats.h - * - * @brief auto-generated file - * - * DO NOT EDIT - * - ******************************************************************************/

#pragma once

#include "common/os.h"

//////////////////////////////////////////////////////////////////////////
/// SWR_TYPE - Format component type
//////////////////////////////////////////////////////////////////////////
enum SWR_TYPE
{
    SWR_TYPE_UNKNOWN,
    SWR_TYPE_UNUSED,
    SWR_TYPE_UNORM,
    SWR_TYPE_SNORM,
    SWR_TYPE_UINT,
    SWR_TYPE_SINT,
    SWR_TYPE_FLOAT,
    SWR_TYPE_SSCALED,
    SWR_TYPE_USCALED,
    SWR_TYPE_SFIXED,
};

//////////////////////////////////////////////////////////////////////////
/// SWR_FORMAT
///
/// Enumerator values are explicit and sparse (gaps exist between them).
/// NUM_SWR_FORMATS (0x200) is one past the largest value (RAW = 0x1FF) and
/// is used as the extent of gFormatInfo, which is indexed directly by the
/// enumerator value.
//////////////////////////////////////////////////////////////////////////
enum SWR_FORMAT
{
    R32G32B32A32_FLOAT = 0x0,
    R32G32B32A32_SINT = 0x1,
    R32G32B32A32_UINT = 0x2,
    R64G64_FLOAT = 0x5,
    R32G32B32X32_FLOAT = 0x6,
    R32G32B32A32_SSCALED = 0x7,
    R32G32B32A32_USCALED = 0x8,
    R32G32B32A32_SFIXED = 0x20,
    R32G32B32_FLOAT = 0x40,
    R32G32B32_SINT = 0x41,
    R32G32B32_UINT = 0x42,
    R32G32B32_SSCALED = 0x45,
    R32G32B32_USCALED = 0x46,
    R32G32B32_SFIXED = 0x50,
    R16G16B16A16_UNORM = 0x80,
    R16G16B16A16_SNORM = 0x81,
    R16G16B16A16_SINT = 0x82,
    R16G16B16A16_UINT = 0x83,
    R16G16B16A16_FLOAT = 0x84,
    R32G32_FLOAT = 0x85,
    R32G32_SINT = 0x86,
    R32G32_UINT = 0x87,
    R32_FLOAT_X8X24_TYPELESS = 0x88,
    X32_TYPELESS_G8X24_UINT = 0x89,
    L32A32_FLOAT = 0x8A,
    R64_FLOAT = 0x8D,
    R16G16B16X16_UNORM = 0x8E,
    R16G16B16X16_FLOAT = 0x8F,
    L32X32_FLOAT = 0x91,
    I32X32_FLOAT = 0x92,
    R16G16B16A16_SSCALED = 0x93,
    R16G16B16A16_USCALED = 0x94,
    R32G32_SSCALED = 0x95,
    R32G32_USCALED = 0x96,
    R32G32_SFIXED = 0xA0,
    B8G8R8A8_UNORM = 0xC0,
    B8G8R8A8_UNORM_SRGB = 0xC1,
    R10G10B10A2_UNORM = 0xC2,
    R10G10B10A2_UNORM_SRGB = 0xC3,
    R10G10B10A2_UINT = 0xC4,
    R8G8B8A8_UNORM = 0xC7,
    R8G8B8A8_UNORM_SRGB = 0xC8,
    R8G8B8A8_SNORM = 0xC9,
    R8G8B8A8_SINT = 0xCA,
    R8G8B8A8_UINT = 0xCB,
    R16G16_UNORM = 0xCC,
    R16G16_SNORM = 0xCD,
    R16G16_SINT = 0xCE,
    R16G16_UINT = 0xCF,
    R16G16_FLOAT = 0xD0,
    B10G10R10A2_UNORM = 0xD1,
    B10G10R10A2_UNORM_SRGB = 0xD2,
    R11G11B10_FLOAT = 0xD3,
    R10G10B10_FLOAT_A2_UNORM = 0xD5,
    R32_SINT = 0xD6,
    R32_UINT = 0xD7,
    R32_FLOAT = 0xD8,
    R24_UNORM_X8_TYPELESS = 0xD9,
    X24_TYPELESS_G8_UINT = 0xDA,
    L32_UNORM = 0xDD,
    L16A16_UNORM = 0xDF,
    I24X8_UNORM = 0xE0,
    L24X8_UNORM = 0xE1,
    I32_FLOAT = 0xE3,
    L32_FLOAT = 0xE4,
    A32_FLOAT = 0xE5,
    B8G8R8X8_UNORM = 0xE9,
    B8G8R8X8_UNORM_SRGB = 0xEA,
    R8G8B8X8_UNORM = 0xEB,
    R8G8B8X8_UNORM_SRGB = 0xEC,
    R9G9B9E5_SHAREDEXP = 0xED,
    B10G10R10X2_UNORM = 0xEE,
    L16A16_FLOAT = 0xF0,
    R10G10B10X2_USCALED = 0xF3,
    R8G8B8A8_SSCALED = 0xF4,
    R8G8B8A8_USCALED = 0xF5,
    R16G16_SSCALED = 0xF6,
    R16G16_USCALED = 0xF7,
    R32_SSCALED = 0xF8,
    R32_USCALED = 0xF9,
    B5G6R5_UNORM = 0x100,
    B5G6R5_UNORM_SRGB = 0x101,
    B5G5R5A1_UNORM = 0x102,
    B5G5R5A1_UNORM_SRGB = 0x103,
    B4G4R4A4_UNORM = 0x104,
    B4G4R4A4_UNORM_SRGB = 0x105,
    R8G8_UNORM = 0x106,
    R8G8_SNORM = 0x107,
    R8G8_SINT = 0x108,
    R8G8_UINT = 0x109,
    R16_UNORM = 0x10A,
    R16_SNORM = 0x10B,
    R16_SINT = 0x10C,
    R16_UINT = 0x10D,
    R16_FLOAT = 0x10E,
    I16_UNORM = 0x111,
    L16_UNORM = 0x112,
    A16_UNORM = 0x113,
    L8A8_UNORM = 0x114,
    I16_FLOAT = 0x115,
    L16_FLOAT = 0x116,
    A16_FLOAT = 0x117,
    L8A8_UNORM_SRGB = 0x118,
    B5G5R5X1_UNORM = 0x11A,
    B5G5R5X1_UNORM_SRGB = 0x11B,
    R8G8_SSCALED = 0x11C,
    R8G8_USCALED = 0x11D,
    R16_SSCALED = 0x11E,
    R16_USCALED = 0x11F,
    A1B5G5R5_UNORM = 0x124,
    A4B4G4R4_UNORM = 0x125,
    L8A8_UINT = 0x126,
    L8A8_SINT = 0x127,
    R8_UNORM = 0x140,
    R8_SNORM = 0x141,
    R8_SINT = 0x142,
    R8_UINT = 0x143,
    A8_UNORM = 0x144,
    I8_UNORM = 0x145,
    L8_UNORM = 0x146,
    R8_SSCALED = 0x149,
    R8_USCALED = 0x14A,
    L8_UNORM_SRGB = 0x14C,
    L8_UINT = 0x152,
    L8_SINT = 0x153,
    I8_UINT = 0x154,
    I8_SINT = 0x155,
    DXT1_RGB_SRGB = 0x180,
    YCRCB_SWAPUVY = 0x183,
    BC1_UNORM = 0x186,
    BC2_UNORM = 0x187,
    BC3_UNORM = 0x188,
    BC4_UNORM = 0x189,
    BC5_UNORM = 0x18A,
    BC1_UNORM_SRGB = 0x18B,
    BC2_UNORM_SRGB = 0x18C,
    BC3_UNORM_SRGB = 0x18D,
    YCRCB_SWAPUV = 0x18F,
    DXT1_RGB = 0x191,
    R8G8B8_UNORM = 0x193,
    R8G8B8_SNORM = 0x194,
    R8G8B8_SSCALED = 0x195,
    R8G8B8_USCALED = 0x196,
    R64G64B64A64_FLOAT = 0x197,
    R64G64B64_FLOAT = 0x198,
    BC4_SNORM = 0x199,
    BC5_SNORM = 0x19A,
    R16G16B16_FLOAT = 0x19B,
    R16G16B16_UNORM = 0x19C,
    R16G16B16_SNORM = 0x19D,
    R16G16B16_SSCALED = 0x19E,
    R16G16B16_USCALED = 0x19F,
    BC6H_SF16 = 0x1A1,
    BC7_UNORM = 0x1A2,
    BC7_UNORM_SRGB = 0x1A3,
    BC6H_UF16 = 0x1A4,
    R8G8B8_UNORM_SRGB = 0x1A8,
    R16G16B16_UINT = 0x1B0,
    R16G16B16_SINT = 0x1B1,
    R32_SFIXED = 0x1B2,
    R10G10B10A2_SNORM = 0x1B3,
    R10G10B10A2_USCALED = 0x1B4,
    R10G10B10A2_SSCALED = 0x1B5,
    R10G10B10A2_SINT = 0x1B6,
    B10G10R10A2_SNORM = 0x1B7,
    B10G10R10A2_USCALED = 0x1B8,
    B10G10R10A2_SSCALED = 0x1B9,
    B10G10R10A2_UINT = 0x1BA,
    B10G10R10A2_SINT = 0x1BB,
    R8G8B8_UINT = 0x1C8,
    R8G8B8_SINT = 0x1C9,
    RAW = 0x1FF,
    NUM_SWR_FORMATS = 0x200,
};

//////////////////////////////////////////////////////////////////////////
/// SWR_FORMAT_INFO - Format information
///
/// One record per SWR_FORMAT value. Member order matters: the gFormatInfo
/// table initializes these records positionally.
//////////////////////////////////////////////////////////////////////////
struct SWR_FORMAT_INFO
{
    const char* name;        // nullptr marks a padding slot (no format has this value)
    SWR_TYPE    type[4];     // component type, one entry per component
    uint32_t    defaults[4]; // defaults for missing components
    uint32_t    swizzle[4];  ///< swizzle per component
    uint32_t    bpc[4];      ///< bits per component
    uint32_t    bpp;         ///< bits per pixel
    uint32_t    Bpp;         ///< bytes per pixel
    uint32_t    numComps;    ///< number of components
    bool        isSRGB;
    bool        isBC;
    bool        isSubsampled;
    bool        isLuminance;
    bool        isNormalized[4];
    float       toFloat[4];  // to-float scale factor per component
    uint32_t    bcWidth;
    uint32_t    bcHeight;
};

// Table of format records, indexed by SWR_FORMAT value (defined in another file).
extern const SWR_FORMAT_INFO gFormatInfo[NUM_SWR_FORMATS];

//////////////////////////////////////////////////////////////////////////
/// @brief Retrieves format info struct for given format.
///        Asserts that the value is below NUM_SWR_FORMATS and that the
///        selected record has a non-null name, i.e. is not a padding slot.
///        NOTE(review): both checks are SWR_ASSERTs; whether they are active
///        in release builds depends on SWR_ASSERT's definition, which is not
///        visible here.
/// @param format - SWR format
/// @return reference to the matching entry of gFormatInfo
INLINE const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format)
{
    SWR_ASSERT(format < NUM_SWR_FORMATS, "Invalid Surface Format: %d", format);
    SWR_ASSERT(gFormatInfo[format].name != nullptr, "Invalid Surface Format: %d", format);
    return gFormatInfo[format];
}

// lookup table for unorm8 srgb -> float conversion
extern const uint32_t srgb8Table[256];
diff --git a/src/gallium/drivers/swr/rasterizer/common/intrin.h b/src/gallium/drivers/swr/rasterizer/common/intrin.h deleted file mode 100644 index 95b462b1e36..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/intrin.h +++ /dev/null @@ -1,120 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE.
- ****************************************************************************/ - -#ifndef __SWR_INTRIN_H__ -#define __SWR_INTRIN_H__ - -#include "os.h" - -#if !defined(SIMD_ARCH) -#define SIMD_ARCH KNOB_ARCH -#endif - -#include "simdlib_types.hpp" - -typedef SIMDImpl::SIMD128Impl::Float simd4scalar; -typedef SIMDImpl::SIMD128Impl::Double simd4scalard; -typedef SIMDImpl::SIMD128Impl::Integer simd4scalari; -typedef SIMDImpl::SIMD128Impl::Vec4 simd4vector; -typedef SIMDImpl::SIMD128Impl::Mask simd4mask; - -typedef SIMDImpl::SIMD256Impl::Float simd8scalar; -typedef SIMDImpl::SIMD256Impl::Double simd8scalard; -typedef SIMDImpl::SIMD256Impl::Integer simd8scalari; -typedef SIMDImpl::SIMD256Impl::Vec4 simd8vector; -typedef SIMDImpl::SIMD256Impl::Mask simd8mask; - -typedef SIMDImpl::SIMD512Impl::Float simd16scalar; -typedef SIMDImpl::SIMD512Impl::Double simd16scalard; -typedef SIMDImpl::SIMD512Impl::Integer simd16scalari; -typedef SIMDImpl::SIMD512Impl::Vec4 simd16vector; -typedef SIMDImpl::SIMD512Impl::Mask simd16mask; - -#if KNOB_SIMD_WIDTH == 8 -typedef simd8scalar simdscalar; -typedef simd8scalard simdscalard; -typedef simd8scalari simdscalari; -typedef simd8vector simdvector; -typedef simd8mask simdmask; -#else -#error Unsupported vector width -#endif - -INLINE -UINT pdep_u32(UINT a, UINT mask) -{ -#if KNOB_ARCH >= KNOB_ARCH_AVX2 - return _pdep_u32(a, mask); -#else - UINT result = 0; - - // copied from http://wm.ite.pl/articles/pdep-soft-emu.html - // using bsf instead of funky loop - unsigned long maskIndex = 0; - while (_BitScanForward(&maskIndex, mask)) - { - // 1. isolate lowest set bit of mask - const UINT lowest = 1 << maskIndex; - - // 2. populate LSB from src - const UINT LSB = (UINT)((int)(a << 31) >> 31); - - // 3. copy bit from mask - result |= LSB & lowest; - - // 4. clear lowest bit - mask &= ~lowest; - - // 5. 
prepare for next iteration - a >>= 1; - } - - return result; -#endif -} - -INLINE -UINT pext_u32(UINT a, UINT mask) -{ -#if KNOB_ARCH >= KNOB_ARCH_AVX2 - return _pext_u32(a, mask); -#else - UINT result = 0; - unsigned long maskIndex; - uint32_t currentBit = 0; - while (_BitScanForward(&maskIndex, mask)) - { - // 1. isolate lowest set bit of mask - const UINT lowest = 1 << maskIndex; - - // 2. copy bit from mask - result |= ((a & lowest) > 0) << currentBit++; - - // 3. clear lowest bit - mask &= ~lowest; - } - return result; -#endif -} - -#endif //__SWR_INTRIN_H__ diff --git a/src/gallium/drivers/swr/rasterizer/common/isa.hpp b/src/gallium/drivers/swr/rasterizer/common/isa.hpp deleted file mode 100644 index 41af0055f1e..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/isa.hpp +++ /dev/null @@ -1,231 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ - -#pragma once - -#include <iostream> -#include <vector> -#include <bitset> -#include <array> -#include <string> -#include <algorithm> - -// Clang for Windows does supply an intrin.h with __cpuid intrinsics, however... -// It seems to not realize that a write to "b" (ebx) will kill the value in rbx. -// This attempts to use the "native" clang / gcc intrinsics instead of the windows -// compatible ones. -#if defined(_MSC_VER) && !defined(__clang__) -#include <intrin.h> -#else -#include <string.h> -#if !defined(__cpuid) -#include <cpuid.h> -#endif -#endif - -class InstructionSet -{ -public: - InstructionSet() : CPU_Rep(){}; - - // getters - std::string Vendor(void) { return CPU_Rep.vendor_; } - std::string Brand(void) { return CPU_Rep.brand_; } - - bool SSE3(void) { return CPU_Rep.f_1_ECX_[0]; } - bool PCLMULQDQ(void) { return CPU_Rep.f_1_ECX_[1]; } - bool MONITOR(void) { return CPU_Rep.f_1_ECX_[3]; } - bool SSSE3(void) { return CPU_Rep.f_1_ECX_[9]; } - bool FMA(void) { return CPU_Rep.f_1_ECX_[12]; } - bool CMPXCHG16B(void) { return CPU_Rep.f_1_ECX_[13]; } - bool SSE41(void) { return CPU_Rep.f_1_ECX_[19]; } - bool SSE42(void) { return CPU_Rep.f_1_ECX_[20]; } - bool MOVBE(void) { return CPU_Rep.f_1_ECX_[22]; } - bool POPCNT(void) { return CPU_Rep.f_1_ECX_[23]; } - bool AES(void) { return CPU_Rep.f_1_ECX_[25]; } - bool XSAVE(void) { return CPU_Rep.f_1_ECX_[26]; } - bool OSXSAVE(void) { return CPU_Rep.f_1_ECX_[27]; } - bool RDRAND(void) { return CPU_Rep.f_1_ECX_[30]; } - - bool MSR(void) { return CPU_Rep.f_1_EDX_[5]; } - bool CX8(void) { return CPU_Rep.f_1_EDX_[8]; } - bool SEP(void) { return CPU_Rep.f_1_EDX_[11]; } - 
bool CMOV(void) { return CPU_Rep.f_1_EDX_[15]; } - bool CLFSH(void) { return CPU_Rep.f_1_EDX_[19]; } - bool MMX(void) { return CPU_Rep.f_1_EDX_[23]; } - bool FXSR(void) { return CPU_Rep.f_1_EDX_[24]; } - bool SSE(void) { return CPU_Rep.f_1_EDX_[25]; } - bool SSE2(void) { return CPU_Rep.f_1_EDX_[26]; } - - bool FSGSBASE(void) { return CPU_Rep.f_7_EBX_[0]; } - bool BMI1(void) { return CPU_Rep.f_7_EBX_[3]; } - bool HLE(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[4]; } - bool BMI2(void) { return CPU_Rep.f_7_EBX_[8]; } - bool ERMS(void) { return CPU_Rep.f_7_EBX_[9]; } - bool INVPCID(void) { return CPU_Rep.f_7_EBX_[10]; } - bool RTM(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[11]; } - bool RDSEED(void) { return CPU_Rep.f_7_EBX_[18]; } - bool ADX(void) { return CPU_Rep.f_7_EBX_[19]; } - bool SHA(void) { return CPU_Rep.f_7_EBX_[29]; } - - bool PREFETCHWT1(void) { return CPU_Rep.f_7_ECX_[0]; } - - bool LAHF(void) { return CPU_Rep.f_81_ECX_[0]; } - bool LZCNT(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_ECX_[5]; } - bool ABM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[5]; } - bool SSE4a(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[6]; } - bool XOP(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[11]; } - bool TBM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[21]; } - - bool SYSCALL(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[11]; } - bool MMXEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[22]; } - bool RDTSCP(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[27]; } - bool _3DNOWEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[30]; } - bool _3DNOW(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[31]; } - - bool AVX(void) { return CPU_Rep.f_1_ECX_[28]; } - bool F16C(void) { return CPU_Rep.f_1_ECX_[29]; } - bool AVX2(void) { return CPU_Rep.f_7_EBX_[5]; } - bool AVX512F(void) { return CPU_Rep.f_7_EBX_[16]; } - bool AVX512PF(void) { return CPU_Rep.f_7_EBX_[26]; } - bool AVX512ER(void) { return CPU_Rep.f_7_EBX_[27]; 
} - bool AVX512CD(void) { return CPU_Rep.f_7_EBX_[28]; } - -private: - class InstructionSet_Internal - { - public: - InstructionSet_Internal() : - nIds_{0}, nExIds_{0}, isIntel_{false}, isAMD_{false}, f_1_ECX_{0}, f_1_EDX_{0}, - f_7_EBX_{0}, f_7_ECX_{0}, f_81_ECX_{0}, f_81_EDX_{0}, data_{}, extdata_{} - { - // int cpuInfo[4] = {-1}; - std::array<int, 4> cpui; - - // Calling __cpuid with 0x0 as the function_id argument - // gets the number of the highest valid function ID. -#if defined(_MSC_VER) && !defined(__clang__) - __cpuid(cpui.data(), 0); - nIds_ = cpui[0]; -#else - nIds_ = __get_cpuid_max(0, NULL); -#endif - - for (int i = 0; i <= nIds_; ++i) - { -#if defined(_MSC_VER) && !defined(__clang__) - __cpuidex(cpui.data(), i, 0); -#else - int* data = cpui.data(); - __cpuid_count(i, 0, data[0], data[1], data[2], data[3]); -#endif - data_.push_back(cpui); - } - - // Capture vendor string - char vendor[0x20]; - memset(vendor, 0, sizeof(vendor)); - *reinterpret_cast<int*>(vendor) = data_[0][1]; - *reinterpret_cast<int*>(vendor + 4) = data_[0][3]; - *reinterpret_cast<int*>(vendor + 8) = data_[0][2]; - vendor_ = vendor; - if (vendor_ == "GenuineIntel") - { - isIntel_ = true; - } - else if (vendor_ == "AuthenticAMD") - { - isAMD_ = true; - } - - // load bitset with flags for function 0x00000001 - if (nIds_ >= 1) - { - f_1_ECX_ = data_[1][2]; - f_1_EDX_ = data_[1][3]; - } - - // load bitset with flags for function 0x00000007 - if (nIds_ >= 7) - { - f_7_EBX_ = data_[7][1]; - f_7_ECX_ = data_[7][2]; - } - - // Calling __cpuid with 0x80000000 as the function_id argument - // gets the number of the highest valid extended ID. 
-#if defined(_MSC_VER) && !defined(__clang__) - __cpuid(cpui.data(), 0x80000000); - nExIds_ = cpui[0]; -#else - nExIds_ = __get_cpuid_max(0x80000000, NULL); -#endif - - char brand[0x40]; - memset(brand, 0, sizeof(brand)); - - for (unsigned i = 0x80000000; i <= nExIds_; ++i) - { -#if defined(_MSC_VER) && !defined(__clang__) - __cpuidex(cpui.data(), i, 0); -#else - int* data = cpui.data(); - __cpuid_count(i, 0, data[0], data[1], data[2], data[3]); -#endif - extdata_.push_back(cpui); - } - - // load bitset with flags for function 0x80000001 - if (nExIds_ >= 0x80000001) - { - f_81_ECX_ = extdata_[1][2]; - f_81_EDX_ = extdata_[1][3]; - } - - // Interpret CPU brand string if reported - if (nExIds_ >= 0x80000004) - { - memcpy(brand, extdata_[2].data(), sizeof(cpui)); - memcpy(brand + 16, extdata_[3].data(), sizeof(cpui)); - memcpy(brand + 32, extdata_[4].data(), sizeof(cpui)); - brand_ = brand; - } - }; - - int nIds_; - unsigned nExIds_; - std::string vendor_; - std::string brand_; - bool isIntel_; - bool isAMD_; - std::bitset<32> f_1_ECX_; - std::bitset<32> f_1_EDX_; - std::bitset<32> f_7_EBX_; - std::bitset<32> f_7_ECX_; - std::bitset<32> f_81_ECX_; - std::bitset<32> f_81_EDX_; - std::vector<std::array<int, 4>> data_; - std::vector<std::array<int, 4>> extdata_; - }; - const InstructionSet_Internal CPU_Rep; -}; diff --git a/src/gallium/drivers/swr/rasterizer/common/os.cpp b/src/gallium/drivers/swr/rasterizer/common/os.cpp deleted file mode 100644 index 75c7161b4e2..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/os.cpp +++ /dev/null @@ -1,314 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ - -#include "common/os.h" -#include <vector> -#include <array> -#include <sstream> - -#if defined(_WIN32) -#include <shlobj.h> -#endif // Windows - -#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) -#include <pthread.h> -#endif // Linux - -#if defined(_MSC_VER) -static const DWORD MS_VC_EXCEPTION = 0x406D1388; - -#pragma pack(push, 8) -typedef struct tagTHREADNAME_INFO -{ - DWORD dwType; // Must be 0x1000. - LPCSTR szName; // Pointer to name (in user addr space). - DWORD dwThreadID; // Thread ID (-1=caller thread). - DWORD dwFlags; // Reserved for future use, must be zero. 
-} THREADNAME_INFO; -#pragma pack(pop) - -void LegacySetThreadName(const char* pThreadName) -{ - THREADNAME_INFO info; - info.dwType = 0x1000; - info.szName = pThreadName; - info.dwThreadID = GetCurrentThreadId(); - info.dwFlags = 0; - - if (!IsDebuggerPresent()) - { - // No debugger attached to interpret exception, no need to actually do it - return; - } - -#pragma warning(push) -#pragma warning(disable : 6320 6322) - __try - { - RaiseException(MS_VC_EXCEPTION, 0, sizeof(info) / sizeof(ULONG_PTR), (ULONG_PTR*)&info); - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - } -#pragma warning(pop) -} -#endif // _WIN32 - -void SWR_API SetCurrentThreadName(const char* pThreadName) -{ -#if defined(_MSC_VER) - // The SetThreadDescription API was brought in version 1607 of Windows 10. - typedef HRESULT(WINAPI * PFNSetThreadDescription)(HANDLE hThread, PCWSTR lpThreadDescription); - // The SetThreadDescription API works even if no debugger is attached. - auto pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>( - GetProcAddress(GetModuleHandleA("Kernel32.dll"), "SetThreadDescription")); - - if (!pfnSetThreadDescription) - { - // try KernelBase.dll - pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>( - GetProcAddress(GetModuleHandleA("KernelBase.dll"), "SetThreadDescription")); - } - - if (pfnSetThreadDescription) - { - std::string utf8Name = pThreadName; - std::wstring wideName; - wideName.resize(utf8Name.size() + 1); - swprintf_s(&(wideName.front()), wideName.size(), L"%S", utf8Name.c_str()); - HRESULT hr = pfnSetThreadDescription(GetCurrentThread(), wideName.c_str()); - SWR_ASSERT(SUCCEEDED(hr), "Failed to set thread name to %s", pThreadName); - - // Fall through - it seems like some debuggers only recognize the exception - } - - // Fall back to exception based hack - LegacySetThreadName(pThreadName); -#endif // _WIN32 - -#if defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) - pthread_setname_np(pthread_self(), pThreadName); 
-#endif // Linux -} - -#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) -static void -SplitString(std::vector<std::string>& out_segments, const std::string& input, char splitToken) -{ - out_segments.clear(); - - std::istringstream f(input); - std::string s; - while (std::getline(f, s, splitToken)) - { - if (s.size()) - { - out_segments.push_back(s); - } - } -} -#endif // Unix - -void SWR_API CreateDirectoryPath(const std::string& path) -{ -#if defined(_WIN32) - SHCreateDirectoryExA(nullptr, path.c_str(), nullptr); -#endif // Windows - -#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) - std::vector<std::string> pathSegments; - SplitString(pathSegments, path, '/'); - - std::string tmpPath; - for (auto const& segment : pathSegments) - { - tmpPath.push_back('/'); - tmpPath += segment; - - int result = mkdir(tmpPath.c_str(), 0777); - if (result == -1 && errno != EEXIST) - { - break; - } - } -#endif // Unix -} - -/// Execute Command (block until finished) -/// @returns process exit value -int SWR_API ExecCmd(const std::string& cmd, ///< (In) Command line string - const char* pOptEnvStrings, ///< (Optional In) Environment block for new process - std::string* pOptStdOut, ///< (Optional Out) Standard Output text - std::string* pOptStdErr, ///< (Optional Out) Standard Error text - const std::string* pOptStdIn) ///< (Optional In) Standard Input text -{ - int rvalue = -1; - -#if defined(_WIN32) - struct WinPipe - { - HANDLE hRead; - HANDLE hWrite; - }; - std::array<WinPipe, 3> hPipes = {}; - - SECURITY_ATTRIBUTES saAttr = {sizeof(SECURITY_ATTRIBUTES)}; - saAttr.bInheritHandle = TRUE; // Pipe handles are inherited by child process. 
- saAttr.lpSecurityDescriptor = NULL; - - { - bool bFail = false; - for (WinPipe& p : hPipes) - { - if (!CreatePipe(&p.hRead, &p.hWrite, &saAttr, 0)) - { - bFail = true; - } - } - - if (bFail) - { - for (WinPipe& p : hPipes) - { - CloseHandle(p.hRead); - CloseHandle(p.hWrite); - } - return rvalue; - } - } - - STARTUPINFOA StartupInfo{}; - StartupInfo.cb = sizeof(STARTUPINFOA); - StartupInfo.dwFlags = STARTF_USESTDHANDLES; - StartupInfo.dwFlags |= STARTF_USESHOWWINDOW; - StartupInfo.wShowWindow = SW_HIDE; - if (pOptStdIn) - { - StartupInfo.hStdInput = hPipes[0].hRead; - } - StartupInfo.hStdOutput = hPipes[1].hWrite; - StartupInfo.hStdError = hPipes[2].hWrite; - PROCESS_INFORMATION procInfo{}; - - // CreateProcess can modify the string - std::string local_cmd = cmd; - - BOOL ProcessValue = CreateProcessA(NULL, - (LPSTR)local_cmd.c_str(), - NULL, - NULL, - TRUE, - 0, - (LPVOID)pOptEnvStrings, - NULL, - &StartupInfo, - &procInfo); - - if (ProcessValue && procInfo.hProcess) - { - auto ReadFromPipe = [](HANDLE hPipe, std::string* pOutStr) { - char buf[1024]; - DWORD dwRead = 0; - DWORD dwAvail = 0; - while (true) - { - if (!::PeekNamedPipe(hPipe, NULL, 0, NULL, &dwAvail, NULL)) - { - break; - } - - if (!dwAvail) // no data available, return - { - break; - } - - if (!::ReadFile(hPipe, - buf, - std::min<size_t>(sizeof(buf) - 1, size_t(dwAvail)), - &dwRead, - NULL) || - !dwRead) - { - // error, the child process might ended - break; - } - - buf[dwRead] = 0; - if (pOutStr) - { - (*pOutStr) += buf; - } - } - }; - bool bProcessEnded = false; - size_t bytesWritten = 0; - do - { - if (pOptStdIn && (pOptStdIn->size() > bytesWritten)) - { - DWORD bytesToWrite = static_cast<DWORD>(pOptStdIn->size()) - bytesWritten; - if (!::WriteFile(hPipes[0].hWrite, - pOptStdIn->data() + bytesWritten, - bytesToWrite, - &bytesToWrite, - nullptr)) - { - // Failed to write to pipe - break; - } - bytesWritten += bytesToWrite; - } - - // Give some timeslice (50ms), so we won't waste 100% cpu. 
- bProcessEnded = (WaitForSingleObject(procInfo.hProcess, 50) == WAIT_OBJECT_0); - - ReadFromPipe(hPipes[1].hRead, pOptStdOut); - ReadFromPipe(hPipes[2].hRead, pOptStdErr); - } while (!bProcessEnded); - - DWORD exitVal = 0; - if (!GetExitCodeProcess(procInfo.hProcess, &exitVal)) - { - exitVal = 1; - } - - CloseHandle(procInfo.hProcess); - CloseHandle(procInfo.hThread); - - rvalue = exitVal; - } - - for (WinPipe& p : hPipes) - { - CloseHandle(p.hRead); - CloseHandle(p.hWrite); - } - -#else - - // Non-Windows implementation - -#endif - - return rvalue; -} diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h deleted file mode 100644 index ed42e1eb79e..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/os.h +++ /dev/null @@ -1,365 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ - -#ifndef __SWR_OS_H__ -#define __SWR_OS_H__ - -#include <cstddef> -#include "core/knobs.h" - -#if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX) - -#define SWR_API __cdecl -#define SWR_VISIBLE __declspec(dllexport) - -#ifndef NOMINMAX -#undef UNICODE -#define NOMINMAX -#include <windows.h> -#undef NOMINMAX -#define UNICODE -#else -#undef UNICODE -#include <windows.h> -#define UNICODE -#endif -#include <intrin.h> -#include <cstdint> - -#if defined(MemoryFence) -// Windows.h defines MemoryFence as _mm_mfence, but this conflicts with llvm::sys::MemoryFence -#undef MemoryFence -#endif - -#if defined(_MSC_VER) -#define OSALIGN(RWORD, WIDTH) __declspec(align(WIDTH)) RWORD -#elif defined(__GNUC__) -#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH))) -#endif - -#if defined(_DEBUG) -// We compile Debug builds with inline function expansion enabled. This allows -// functions compiled with __forceinline to be inlined even in Debug builds. -// The inline_depth(0) pragma below will disable inline function expansion for -// normal INLINE / inline functions, but not for __forceinline functions. -// Our SIMD function wrappers (see simdlib.hpp) use __forceinline even in -// Debug builds. -#define INLINE inline -#pragma inline_depth(0) -#else -// Use of __forceinline increases compile time dramatically in release builds -// and provides almost 0 measurable benefit. 
Disable until we have a compelling -// use-case -// #define INLINE __forceinline -#define INLINE inline -#endif -#ifndef FORCEINLINE -#define FORCEINLINE __forceinline -#endif - -#define DEBUGBREAK __debugbreak() - -#define PRAGMA_WARNING_PUSH_DISABLE(...) \ - __pragma(warning(push)); \ - __pragma(warning(disable : __VA_ARGS__)); - -#define PRAGMA_WARNING_POP() __pragma(warning(pop)) - -static inline void* AlignedMalloc(size_t _Size, size_t _Alignment) -{ - return _aligned_malloc(_Size, _Alignment); -} - -static inline void AlignedFree(void* p) -{ - return _aligned_free(p); -} - -#if defined(_WIN64) -#define BitScanReverseSizeT BitScanReverse64 -#define BitScanForwardSizeT BitScanForward64 -#define _mm_popcount_sizeT _mm_popcnt_u64 -#else -#define BitScanReverseSizeT BitScanReverse -#define BitScanForwardSizeT BitScanForward -#define _mm_popcount_sizeT _mm_popcnt_u32 -#endif - -#if !defined(_WIN64) -extern "C" { -inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask) -{ - if (Mask == 0) - return 0; -#ifdef __GNUC__ - *Index = __builtin_ctzll(Mask); -#else - *Index = 0; - for (int i = 0; i < 64; ++ i) - if ((1ULL << i) & Mask) - *Index = i; -#endif - return 1; -} - -inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask) -{ - if (Mask == 0) - return 0; -#ifdef __GNUC__ - *Index = 63 - __builtin_clzll(Mask); -#else - *Index = 0; - for (int i = 63; i >= 0; -- i) - if ((1ULL << i) & Mask) - *Index = i; -#endif - return 1; -} -} -#endif - -#elif defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) - -#define SWR_API -#define SWR_VISIBLE __attribute__((visibility("default"))) - -#include <stdlib.h> -#include <string.h> -#include <x86intrin.h> -#include <stdint.h> -#include <sys/types.h> -#include <unistd.h> -#include <sys/stat.h> -#include <stdio.h> -#include <limits.h> - -typedef void VOID; -typedef void* LPVOID; -typedef int INT; -typedef unsigned int UINT; -typedef void* HANDLE; -typedef 
int LONG; -typedef unsigned int DWORD; - -#undef FALSE -#define FALSE 0 - -#undef TRUE -#define TRUE 1 - -#define MAX_PATH PATH_MAX - -#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH))) -#ifndef INLINE -#define INLINE __inline -#endif -#ifndef FORCEINLINE -#define FORCEINLINE INLINE -#endif -#define DEBUGBREAK asm("int $3") - -#if !defined(__CYGWIN__) - -#ifndef __cdecl -#define __cdecl -#endif -#ifndef __stdcall -#define __stdcall -#endif - -#if defined(__GNUC__) && !defined(__INTEL_COMPILER) -#define __declspec(x) __declspec_##x -#define __declspec_align(y) __attribute__((aligned(y))) -#define __declspec_deprecated __attribute__((deprecated)) -#define __declspec_dllexport -#define __declspec_dllimport -#define __declspec_noinline __attribute__((__noinline__)) -#define __declspec_nothrow __attribute__((nothrow)) -#define __declspec_novtable -#define __declspec_thread __thread -#else -#define __declspec(X) -#endif - -#endif - -#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) - -#if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500) -inline uint64_t __rdtsc() -{ - long low, high; - asm volatile("rdtsc" : "=a"(low), "=d"(high)); - return (low | ((uint64_t)high << 32)); -} -#endif - -#if !defined(__clang__) && !defined(__INTEL_COMPILER) -// Intrinsic not defined in gcc < 10 -#if (__GNUC__) && (GCC_VERSION < 100000) -static INLINE void _mm256_storeu2_m128i(__m128i* hi, __m128i* lo, __m256i a) -{ - _mm_storeu_si128((__m128i*)lo, _mm256_castsi256_si128(a)); - _mm_storeu_si128((__m128i*)hi, _mm256_extractf128_si256(a, 0x1)); -} -#endif - -// gcc prior to 4.9 doesn't have _mm*_undefined_* -#if (__GNUC__) && (GCC_VERSION < 40900) -#define _mm_undefined_si128 _mm_setzero_si128 -#define _mm256_undefined_ps _mm256_setzero_ps -#endif -#endif - -inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask) -{ - if (Mask == 0) - return 0; - *Index = __builtin_ctzll(Mask); - return 1; -} - -inline unsigned 
char _BitScanForward(unsigned long* Index, uint32_t Mask) -{ - if (Mask == 0) - return 0; - *Index = __builtin_ctz(Mask); - return 1; -} - -inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask) -{ - if (Mask == 0) - return 0; - *Index = 63 - __builtin_clzll(Mask); - return 1; -} - -inline unsigned char _BitScanReverse(unsigned long* Index, uint32_t Mask) -{ - if (Mask == 0) - return 0; - *Index = 31 - __builtin_clz(Mask); - return 1; -} - -inline void* AlignedMalloc(size_t size, size_t alignment) -{ - void* ret; - if (posix_memalign(&ret, alignment, size)) - { - return NULL; - } - return ret; -} - -static inline void AlignedFree(void* p) -{ - free(p); -} - -#define _countof(a) (sizeof(a) / sizeof(*(a))) - -#define sprintf_s sprintf -#define strcpy_s(dst, size, src) strncpy(dst, src, size) -#define GetCurrentProcessId getpid - -#define InterlockedCompareExchange(Dest, Exchange, Comparand) \ - __sync_val_compare_and_swap(Dest, Comparand, Exchange) -#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value) -#define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1) -#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1) -#define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1) -#define InterlockedAdd(Addend, Value) __sync_add_and_fetch(Addend, Value) -#define InterlockedAdd64(Addend, Value) __sync_add_and_fetch(Addend, Value) -#define _ReadWriteBarrier() asm volatile("" ::: "memory") - -#define PRAGMA_WARNING_PUSH_DISABLE(...) -#define PRAGMA_WARNING_POP() - -#define ZeroMemory(dst, size) memset(dst, 0, size) -#else - -#error Unsupported OS/system. 
- -#endif - -#define THREAD thread_local - -// Universal types -typedef uint8_t KILOBYTE[1024]; -typedef KILOBYTE MEGABYTE[1024]; -typedef MEGABYTE GIGABYTE[1024]; - -#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64) -#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES) -#define OSALIGNSIMD16(RWORD) OSALIGN(RWORD, KNOB_SIMD16_BYTES) - -#include "common/swr_assert.h" - -#ifdef __GNUC__ -#define ATTR_UNUSED __attribute__((unused)) -#else -#define ATTR_UNUSED -#endif - -#define SWR_FUNC(_retType, _funcName, /* args */...) \ - typedef _retType(SWR_API* PFN##_funcName)(__VA_ARGS__); \ - _retType SWR_API _funcName(__VA_ARGS__); - -// Defined in os.cpp -void SWR_API SetCurrentThreadName(const char* pThreadName); -void SWR_API CreateDirectoryPath(const std::string& path); - -/// Execute Command (block until finished) -/// @returns process exit value -int SWR_API - ExecCmd(const std::string& cmd, ///< (In) Command line string - const char* pOptEnvStrings = nullptr, ///< (Optional In) Environment block for new process - std::string* pOptStdOut = nullptr, ///< (Optional Out) Standard Output text - std::string* pOptStdErr = nullptr, ///< (Optional Out) Standard Error text - const std::string* pOptStdIn = nullptr); ///< (Optional In) Standard Input text - - -/// Helper for setting up FP state -/// @returns old csr state -static INLINE uint32_t SetOptimalVectorCSR() -{ - uint32_t oldCSR = _mm_getcsr(); - - uint32_t newCSR = (oldCSR & ~(_MM_ROUND_MASK | _MM_DENORMALS_ZERO_MASK | _MM_FLUSH_ZERO_MASK)); - newCSR |= (_MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); - _mm_setcsr(newCSR); - - return oldCSR; -} - -/// Set Vector CSR state. 
-/// @param csrState - should be value returned from SetOptimalVectorCSR() -static INLINE void RestoreVectorCSR(uint32_t csrState) -{ - _mm_setcsr(csrState); -} - -#endif //__SWR_OS_H__ diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp deleted file mode 100644 index e2076e8fc44..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp +++ /dev/null @@ -1,192 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file rdtsc_buckets.cpp - * - * @brief implementation of rdtsc buckets. 
- * - * Notes: - * - ******************************************************************************/ -#include "rdtsc_buckets.h" -#include <inttypes.h> - -#if defined(_WIN32) -#define PATH_SEPARATOR "\\" -#elif defined(__unix__) || defined(__APPLE__) -#define PATH_SEPARATOR "/" -#else -#error "Unsupported platform" -#endif - -THREAD UINT tlsThreadId = 0; - -BucketManager::~BucketManager() -{ -} - -void BucketManager::RegisterThread(const std::string& name) -{ - - BUCKET_THREAD newThread; - newThread.name = name; - newThread.root.children.reserve(mBuckets.size()); - newThread.root.id = 0; - newThread.root.pParent = nullptr; - newThread.pCurrent = &newThread.root; - - mThreadMutex.lock(); - - // assign unique thread id for this thread - size_t id = mThreads.size(); - newThread.id = (UINT)id; - tlsThreadId = (UINT)id; - - // store new thread - mThreads.push_back(newThread); - - mThreadMutex.unlock(); -} - -UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc) -{ - mThreadMutex.lock(); - size_t id = mBuckets.size(); - mBuckets.push_back(desc); - mThreadMutex.unlock(); - return (UINT)id; -} - -void BucketManager::PrintBucket( - FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket) -{ - const char* arrows[] = { - "", - "|-> ", - " |-> ", - " |-> ", - " |-> ", - " |-> ", - " |-> ", - " |-> ", - " |-> ", - }; - - // compute percent of total cycles used by this bucket - float percentTotal = (float)((double)bucket.elapsed / (double)threadCycles * 100.0); - - // compute percent of parent cycles used by this bucket - float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0); - - // compute average cycle count per invocation - uint64_t CPE = bucket.elapsed / bucket.count; - - BUCKET_DESC& desc = mBuckets[bucket.id]; - - // construct hierarchy visualization - std::string str = arrows[level]; - str += desc.name; - char hier[80]; - strcpy_s(hier, sizeof(hier)-1, str.c_str()); - - // print out - fprintf(f, - "%6.2f 
%6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n", - percentTotal, - percentParent, - bucket.elapsed, - CPE, - bucket.count, - (unsigned long)0, - (uint32_t)0, - hier); - - // dump all children of this bucket - for (const BUCKET& child : bucket.children) - { - if (child.count) - { - PrintBucket(f, level + 1, threadCycles, bucket.elapsed, child); - } - } -} - -void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread) -{ - // print header - fprintf(f, "\nThread %u (%s)\n", thread.id, thread.name.c_str()); - fprintf(f, " %%Tot %%Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket\n"); - - // compute thread level total cycle counts across all buckets from root - const BUCKET& root = thread.root; - uint64_t totalCycles = 0; - for (const BUCKET& child : root.children) - { - totalCycles += child.elapsed; - } - - for (const BUCKET& child : root.children) - { - if (child.count) - { - PrintBucket(f, 0, totalCycles, totalCycles, child); - } - } -} - -void BucketManager::PrintReport(const std::string& filename) -{ - { - FILE* f = fopen(filename.c_str(), "w"); - assert(f); - - mThreadMutex.lock(); - for (const BUCKET_THREAD& thread : mThreads) - { - PrintThread(f, thread); - fprintf(f, "\n"); - } - - mThreadMutex.unlock(); - - fclose(f); - } -} - - -void BucketManager::StartCapture() -{ - - printf("Capture Starting\n"); - - mCapturing = true; -} - -void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id) -{ - pBucketMgr->StartBucket(id); -} - -void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id) -{ - pBucketMgr->StopBucket(id); -} diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h deleted file mode 100644 index b00cbf63eba..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h +++ /dev/null @@ -1,227 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel 
Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file rdtsc_buckets.h - * - * @brief declaration for rdtsc buckets. - * - * Notes: - * - ******************************************************************************/ -#pragma once - -#include "os.h" -#include <vector> -#include <mutex> -#include <sstream> - -#include "rdtsc_buckets_shared.h" - - -// unique thread id stored in thread local storage -extern THREAD UINT tlsThreadId; - -////////////////////////////////////////////////////////////////////////// -/// @brief BucketManager encapsulates a single instance of the buckets -/// functionality. There can be one or many bucket managers active -/// at any time. The manager owns all the threads and -/// bucket information that have been registered to it. 
-class BucketManager -{ -public: - - uint32_t mCurrentFrame; - std::vector<uint32_t> mBucketMap; - bool mBucketsInitialized; - std::string mBucketMgrName; - - - BucketManager(std::string name) : mCurrentFrame(0), mBucketsInitialized(false), mBucketMgrName(name) - { - mBucketMap.clear(); - } - ~BucketManager(); - - // removes all registered thread data - void ClearThreads() - { - mThreadMutex.lock(); - mThreads.clear(); - mThreadMutex.unlock(); - } - - // removes all registered buckets - void ClearBuckets() - { - mThreadMutex.lock(); - mBuckets.clear(); - mThreadMutex.unlock(); - } - - /// Registers a new thread with the manager. - /// @param name - name of thread, used for labels in reports and threadviz - void RegisterThread(const std::string& name); - - /// Registers a new bucket type with the manager. Returns a unique - /// id which should be used in subsequent calls to start/stop the bucket - /// @param desc - description of the bucket - /// @return unique id - UINT RegisterBucket(const BUCKET_DESC& desc); - - // print report - void PrintReport(const std::string& filename); - - - // start capturing - void StartCapture(); - - // stop capturing - INLINE void StopCapture() - { - mCapturing = false; - - // wait for all threads to pop back to root bucket - bool stillCapturing = true; - while (stillCapturing) - { - stillCapturing = false; - for (const BUCKET_THREAD& t : mThreads) - { - if (t.level > 0) - { - stillCapturing = true; - continue; - } - } - } - - mDoneCapturing = true; - printf("Capture Stopped\n"); - } - - // start a bucket - // @param id generated by RegisterBucket - INLINE void StartBucket(UINT id) - { - if (!mCapturing) - return; - - SWR_ASSERT(tlsThreadId < mThreads.size()); - - BUCKET_THREAD& bt = mThreads[tlsThreadId]; - - uint64_t tsc = __rdtsc(); - - { - if (bt.pCurrent->children.size() < mBuckets.size()) - { - bt.pCurrent->children.resize(mBuckets.size()); - } - BUCKET& child = bt.pCurrent->children[id]; - child.pParent = bt.pCurrent; - child.id 
= id; - child.start = tsc; - - // update thread's currently executing bucket - bt.pCurrent = &child; - } - - - bt.level++; - } - - // stop the currently executing bucket - INLINE void StopBucket(UINT id) - { - SWR_ASSERT(tlsThreadId < mThreads.size()); - BUCKET_THREAD& bt = mThreads[tlsThreadId]; - - if (bt.level == 0) - { - return; - } - - uint64_t tsc = __rdtsc(); - - { - if (bt.pCurrent->start == 0) - return; - SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected"); - - bt.pCurrent->elapsed += (tsc - bt.pCurrent->start); - bt.pCurrent->count++; - - // pop to parent - bt.pCurrent = bt.pCurrent->pParent; - } - - bt.level--; - } - - INLINE void AddEvent(uint32_t id, uint32_t count) - { - if (!mCapturing) - return; - - SWR_ASSERT(tlsThreadId < mThreads.size()); - - BUCKET_THREAD& bt = mThreads[tlsThreadId]; - - // don't record events for threadviz - { - if (bt.pCurrent->children.size() < mBuckets.size()) - { - bt.pCurrent->children.resize(mBuckets.size()); - } - BUCKET& child = bt.pCurrent->children[id]; - child.pParent = bt.pCurrent; - child.id = id; - child.count += count; - } - } - -private: - void PrintBucket( - FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket); - void PrintThread(FILE* f, const BUCKET_THREAD& thread); - - // list of active threads that have registered with this manager - std::vector<BUCKET_THREAD> mThreads; - - // list of buckets registered with this manager - std::vector<BUCKET_DESC> mBuckets; - - // is capturing currently enabled - volatile bool mCapturing{false}; - - // has capturing completed - volatile bool mDoneCapturing{false}; - - std::mutex mThreadMutex; - - std::string mThreadVizDir; - -}; - -// C helpers for jitter -void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id); -void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id); diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h 
b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h deleted file mode 100644 index fd3b1df746a..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h +++ /dev/null @@ -1,169 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file rdtsc_buckets.h - * - * @brief declaration for rdtsc buckets. 
- * - * Notes: - * - ******************************************************************************/ -#pragma once - -#include <vector> -#include <cassert> - -struct BUCKET -{ - uint32_t id{0}; - uint64_t start{0}; - uint64_t elapsed{0}; - uint32_t count{0}; - - BUCKET* pParent{nullptr}; - std::vector<BUCKET> children; -}; - -struct BUCKET_DESC -{ - // name of bucket, used in reports - std::string name; - - // description of bucket, used in threadviz - std::string description; - - // enable for threadviz dumping - bool enableThreadViz; - - // threadviz color of bucket, in RGBA8_UNORM format - uint32_t color; -}; - - -struct BUCKET_THREAD -{ - // name of thread, used in reports - std::string name; - - // id for this thread, assigned by the thread manager - uint32_t id{0}; - - // root of the bucket hierarchy for this thread - BUCKET root; - - // currently executing bucket somewhere in the hierarchy - BUCKET* pCurrent{nullptr}; - - // currently executing hierarchy level - uint32_t level{0}; - - // threadviz file object - FILE* vizFile{nullptr}; - - - BUCKET_THREAD() {} - BUCKET_THREAD(const BUCKET_THREAD& that) - { - name = that.name; - id = that.id; - root = that.root; - pCurrent = &root; - vizFile = that.vizFile; - } -}; - -enum VIZ_TYPE -{ - VIZ_START = 0, - VIZ_STOP = 1, - VIZ_DATA = 2 -}; - -struct VIZ_START_DATA -{ - uint8_t type; - uint32_t bucketId; - uint64_t timestamp; -}; - -struct VIZ_STOP_DATA -{ - uint8_t type; - uint64_t timestamp; -}; - -inline void Serialize(FILE* f, const VIZ_START_DATA& data) -{ - fwrite(&data, sizeof(VIZ_START_DATA), 1, f); -} - -inline void Deserialize(FILE* f, VIZ_START_DATA& data) -{ - fread(&data, sizeof(VIZ_START_DATA), 1, f); - assert(data.type == VIZ_START); -} - -inline void Serialize(FILE* f, const VIZ_STOP_DATA& data) -{ - fwrite(&data, sizeof(VIZ_STOP_DATA), 1, f); -} - -inline void Deserialize(FILE* f, VIZ_STOP_DATA& data) -{ - fread(&data, sizeof(VIZ_STOP_DATA), 1, f); - assert(data.type == VIZ_STOP); -} - -inline void 
Serialize(FILE* f, const std::string& string) -{ - assert(string.size() <= 256); - - uint8_t length = (uint8_t)string.size(); - fwrite(&length, sizeof(length), 1, f); - fwrite(string.c_str(), string.size(), 1, f); -} - -inline void Deserialize(FILE* f, std::string& string) -{ - char cstr[256]; - uint8_t length; - fread(&length, sizeof(length), 1, f); - fread(cstr, length, 1, f); - cstr[length] = 0; - string.assign(cstr); -} - -inline void Serialize(FILE* f, const BUCKET_DESC& desc) -{ - Serialize(f, desc.name); - Serialize(f, desc.description); - fwrite(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f); - fwrite(&desc.color, sizeof(desc.color), 1, f); -} - -inline void Deserialize(FILE* f, BUCKET_DESC& desc) -{ - Deserialize(f, desc.name); - Deserialize(f, desc.description); - fread(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f); - fread(&desc.color, sizeof(desc.color), 1, f); -} diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h deleted file mode 100644 index 5964edff4d3..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h +++ /dev/null @@ -1,168 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ - -#ifndef __SWR_SIMD16INTRIN_H__ -#define __SWR_SIMD16INTRIN_H__ - -#if KNOB_SIMD16_WIDTH == 16 -typedef SIMD512 SIMD16; -#else -#error Unsupported vector width -#endif // KNOB_SIMD16_WIDTH == 16 - -#define _simd16_setzero_ps SIMD16::setzero_ps -#define _simd16_setzero_si SIMD16::setzero_si -#define _simd16_set1_ps SIMD16::set1_ps -#define _simd16_set1_epi8 SIMD16::set1_epi8 -#define _simd16_set1_epi32 SIMD16::set1_epi32 -#define _simd16_set_ps SIMD16::set_ps -#define _simd16_set_epi32 SIMD16::set_epi32 -#define _simd16_load_ps SIMD16::load_ps -#define _simd16_loadu_ps SIMD16::loadu_ps -#if 1 -#define _simd16_load1_ps SIMD16::broadcast_ss -#endif -#define _simd16_load_si SIMD16::load_si -#define _simd16_loadu_si SIMD16::loadu_si -#define _simd16_broadcast_ss(m) SIMD16::broadcast_ss((float const*)m) -#define _simd16_store_ps SIMD16::store_ps -#define _simd16_store_si SIMD16::store_si -#define _simd16_extract_ps(a, imm8) SIMD16::extract_ps<imm8>(a) -#define _simd16_extract_si(a, imm8) SIMD16::extract_si<imm8>(a) -#define _simd16_insert_ps(a, b, imm8) SIMD16::insert_ps<imm8>(a, b) -#define _simd16_insert_si(a, b, imm8) SIMD16::insert_si<imm8>(a, b) -#define _simd16_maskstore_ps SIMD16::maskstore_ps -#define _simd16_blend_ps(a, b, mask) SIMD16::blend_ps<mask>(a, b) -#define _simd16_blendv_ps SIMD16::blendv_ps -#define _simd16_blendv_epi32 SIMD16::blendv_epi32 -#define _simd16_mul_ps SIMD16::mul_ps -#define 
_simd16_div_ps SIMD16::div_ps -#define _simd16_add_ps SIMD16::add_ps -#define _simd16_sub_ps SIMD16::sub_ps -#define _simd16_rsqrt_ps SIMD16::rsqrt_ps -#define _simd16_min_ps SIMD16::min_ps -#define _simd16_max_ps SIMD16::max_ps -#define _simd16_movemask_ps SIMD16::movemask_ps -#define _simd16_movemask_pd SIMD16::movemask_pd -#define _simd16_cvtps_epi32 SIMD16::cvtps_epi32 -#define _simd16_cvttps_epi32 SIMD16::cvttps_epi32 -#define _simd16_cvtepi32_ps SIMD16::cvtepi32_ps -#define _simd16_cmp_ps(a, b, comp) SIMD16::cmp_ps<SIMD16::CompareType(comp)>(a, b) -#define _simd16_cmplt_ps SIMD16::cmplt_ps -#define _simd16_cmpgt_ps SIMD16::cmpgt_ps -#define _simd16_cmpneq_ps SIMD16::cmpneq_ps -#define _simd16_cmpeq_ps SIMD16::cmpeq_ps -#define _simd16_cmpge_ps SIMD16::cmpge_ps -#define _simd16_cmple_ps SIMD16::cmple_ps -#define _simd16_castsi_ps SIMD16::castsi_ps -#define _simd16_castps_si SIMD16::castps_si -#define _simd16_castsi_pd SIMD16::castsi_pd -#define _simd16_castpd_si SIMD16::castpd_si -#define _simd16_castpd_ps SIMD16::castpd_ps -#define _simd16_castps_pd SIMD16::castps_pd -#define _simd16_and_ps SIMD16::and_ps -#define _simd16_andnot_ps SIMD16::andnot_ps -#define _simd16_or_ps SIMD16::or_ps -#define _simd16_xor_ps SIMD16::xor_ps -#define _simd16_round_ps(a, mode) SIMD16::round_ps<SIMD16::RoundMode(mode)>(a) -#define _simd16_mul_epi32 SIMD16::mul_epi32 -#define _simd16_mullo_epi32 SIMD16::mullo_epi32 -#define _simd16_sub_epi32 SIMD16::sub_epi32 -#define _simd16_sub_epi64 SIMD16::sub_epi64 -#define _simd16_min_epi32 SIMD16::min_epi32 -#define _simd16_max_epi32 SIMD16::max_epi32 -#define _simd16_min_epu32 SIMD16::min_epu32 -#define _simd16_max_epu32 SIMD16::max_epu32 -#define _simd16_add_epi32 SIMD16::add_epi32 -#define _simd16_and_si SIMD16::and_si -#define _simd16_andnot_si SIMD16::andnot_si -#define _simd16_or_si SIMD16::or_si -#define _simd16_xor_si SIMD16::xor_si -#define _simd16_cmpeq_epi32 SIMD16::cmpeq_epi32 -#define _simd16_cmpgt_epi32 SIMD16::cmpgt_epi32 
-#define _simd16_cmplt_epi32 SIMD16::cmplt_epi32 -#define _simd16_testz_ps SIMD16::testz_ps -#define _simd16_unpacklo_ps SIMD16::unpacklo_ps -#define _simd16_unpackhi_ps SIMD16::unpackhi_ps -#define _simd16_unpacklo_pd SIMD16::unpacklo_pd -#define _simd16_unpackhi_pd SIMD16::unpackhi_pd -#define _simd16_unpacklo_epi8 SIMD16::unpacklo_epi8 -#define _simd16_unpackhi_epi8 SIMD16::unpackhi_epi8 -#define _simd16_unpacklo_epi16 SIMD16::unpacklo_epi16 -#define _simd16_unpackhi_epi16 SIMD16::unpackhi_epi16 -#define _simd16_unpacklo_epi32 SIMD16::unpacklo_epi32 -#define _simd16_unpackhi_epi32 SIMD16::unpackhi_epi32 -#define _simd16_unpacklo_epi64 SIMD16::unpacklo_epi64 -#define _simd16_unpackhi_epi64 SIMD16::unpackhi_epi64 -#define _simd16_slli_epi32(a, i) SIMD16::slli_epi32<i>(a) -#define _simd16_srli_epi32(a, i) SIMD16::srli_epi32<i>(a) -#define _simd16_srai_epi32(a, i) SIMD16::srai_epi32<i>(a) -#define _simd16_fmadd_ps SIMD16::fmadd_ps -#define _simd16_fmsub_ps SIMD16::fmsub_ps -#define _simd16_adds_epu8 SIMD16::adds_epu8 -#define _simd16_subs_epu8 SIMD16::subs_epu8 -#define _simd16_add_epi8 SIMD16::add_epi8 -#define _simd16_shuffle_epi8 SIMD16::shuffle_epi8 - -#define _simd16_i32gather_ps(m, index, scale) \ - SIMD16::i32gather_ps<SIMD16::ScaleFactor(scale)>(m, index) -#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) \ - SIMD16::mask_i32gather_ps<SIMD16::ScaleFactor(scale)>(a, m, index, mask) - -#define _simd16_abs_epi32 SIMD16::abs_epi32 - -#define _simd16_cmpeq_epi64 SIMD16::cmpeq_epi64 -#define _simd16_cmpgt_epi64 SIMD16::cmpgt_epi64 -#define _simd16_cmpeq_epi16 SIMD16::cmpeq_epi16 -#define _simd16_cmpgt_epi16 SIMD16::cmpgt_epi16 -#define _simd16_cmpeq_epi8 SIMD16::cmpeq_epi8 -#define _simd16_cmpgt_epi8 SIMD16::cmpgt_epi8 - -#define _simd16_permute_ps_i(a, i) SIMD16::permute_ps<i>(a) -#define _simd16_permute_ps SIMD16::permute_ps -#define _simd16_permute_epi32 SIMD16::permute_epi32 -#define _simd16_sllv_epi32 SIMD16::sllv_epi32 -#define _simd16_srlv_epi32 
SIMD16::sllv_epi32 -#define _simd16_permute2f128_ps(a, b, i) SIMD16::permute2f128_ps<i>(a, b) -#define _simd16_permute2f128_pd(a, b, i) SIMD16::permute2f128_pd<i>(a, b) -#define _simd16_permute2f128_si(a, b, i) SIMD16::permute2f128_si<i>(a, b) -#define _simd16_shuffle_ps(a, b, i) SIMD16::shuffle_ps<i>(a, b) -#define _simd16_shuffle_pd(a, b, i) SIMD16::shuffle_pd<i>(a, b) -#define _simd16_shuffle_epi32(a, b, imm8) SIMD16::shuffle_epi32<imm8>(a, b) -#define _simd16_shuffle_epi64(a, b, imm8) SIMD16::shuffle_epi64<imm8>(a, b) -#define _simd16_cvtepu8_epi16 SIMD16::cvtepu8_epi16 -#define _simd16_cvtepu8_epi32 SIMD16::cvtepu8_epi32 -#define _simd16_cvtepu16_epi32 SIMD16::cvtepu16_epi32 -#define _simd16_cvtepu16_epi64 SIMD16::cvtepu16_epi64 -#define _simd16_cvtepu32_epi64 SIMD16::cvtepu32_epi64 -#define _simd16_packus_epi16 SIMD16::packus_epi16 -#define _simd16_packs_epi16 SIMD16::packs_epi16 -#define _simd16_packus_epi32 SIMD16::packus_epi32 -#define _simd16_packs_epi32 SIMD16::packs_epi32 -#define _simd16_cmplt_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ> -#define _simd16_cmpeq_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::EQ_OQ> -#define _simd16_int2mask(mask) simd16mask(mask) -#define _simd16_mask2int(mask) int(mask) -#define _simd16_vmask_ps SIMD16::vmask_ps - -#endif //__SWR_SIMD16INTRIN_H_ diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h deleted file mode 100644 index ebb4f4b7f11..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ /dev/null @@ -1,322 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ****************************************************************************/ - -#ifndef __SWR_SIMDINTRIN_H__ -#define __SWR_SIMDINTRIN_H__ - -#include "common/intrin.h" -#include "common/simdlib.hpp" - -#if KNOB_SIMD_WIDTH == 8 -typedef SIMD256 SIMD; -#else -#error Unsupported vector width -#endif // KNOB_SIMD16_WIDTH == 16 - -#define _simd128_maskstore_ps SIMD128::maskstore_ps -#define _simd128_fmadd_ps SIMD128::fmadd_ps - -#define _simd_load_ps SIMD::load_ps -#define _simd_load1_ps SIMD::broadcast_ss -#define _simd_loadu_ps SIMD::loadu_ps -#define _simd_setzero_ps SIMD::setzero_ps -#define _simd_set1_ps SIMD::set1_ps -#define _simd_blend_ps(a, b, i) SIMD::blend_ps<i>(a, b) -#define _simd_blend_epi32(a, b, i) SIMD::blend_epi32<i>(a, b) -#define _simd_blendv_ps SIMD::blendv_ps -#define _simd_store_ps SIMD::store_ps -#define _simd_mul_ps SIMD::mul_ps -#define _simd_add_ps SIMD::add_ps -#define _simd_sub_ps SIMD::sub_ps -#define _simd_rsqrt_ps SIMD::rsqrt_ps -#define _simd_min_ps SIMD::min_ps -#define _simd_max_ps SIMD::max_ps -#define _simd_movemask_ps SIMD::movemask_ps -#define _simd_cvtps_epi32 SIMD::cvtps_epi32 -#define _simd_cvttps_epi32 SIMD::cvttps_epi32 -#define _simd_cvtepi32_ps SIMD::cvtepi32_ps -#define _simd_cmplt_ps SIMD::cmplt_ps -#define _simd_cmpgt_ps SIMD::cmpgt_ps -#define _simd_cmpneq_ps SIMD::cmpneq_ps -#define _simd_cmpeq_ps SIMD::cmpeq_ps -#define _simd_cmpge_ps SIMD::cmpge_ps -#define _simd_cmple_ps SIMD::cmple_ps -#define _simd_cmp_ps(a, b, imm) SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b) -#define _simd_and_ps SIMD::and_ps -#define _simd_or_ps SIMD::or_ps -#define _simd_rcp_ps SIMD::rcp_ps -#define _simd_div_ps SIMD::div_ps -#define _simd_castsi_ps SIMD::castsi_ps -#define _simd_castps_pd SIMD::castps_pd -#define _simd_castpd_ps SIMD::castpd_ps -#define _simd_andnot_ps SIMD::andnot_ps -#define _simd_round_ps(a, i) SIMD::round_ps<SIMD::RoundMode(i)>(a) -#define _simd_castpd_ps SIMD::castpd_ps -#define _simd_broadcast_ps(a) 
SIMD::broadcast_ps((SIMD128::Float const*)(a)) -#define _simd_stream_ps SIMD::stream_ps - -#define _simd_movemask_pd SIMD::movemask_pd -#define _simd_castsi_pd SIMD::castsi_pd - -#define _simd_mul_epi32 SIMD::mul_epi32 -#define _simd_mullo_epi32 SIMD::mullo_epi32 -#define _simd_sub_epi32 SIMD::sub_epi32 -#define _simd_sub_epi64 SIMD::sub_epi64 -#define _simd_min_epi32 SIMD::min_epi32 -#define _simd_min_epu32 SIMD::min_epu32 -#define _simd_max_epi32 SIMD::max_epi32 -#define _simd_max_epu32 SIMD::max_epu32 -#define _simd_add_epi32 SIMD::add_epi32 -#define _simd_and_si SIMD::and_si -#define _simd_andnot_si SIMD::andnot_si -#define _simd_cmpeq_epi32 SIMD::cmpeq_epi32 -#define _simd_cmplt_epi32 SIMD::cmplt_epi32 -#define _simd_cmpgt_epi32 SIMD::cmpgt_epi32 -#define _simd_or_si SIMD::or_si -#define _simd_xor_si SIMD::xor_si -#define _simd_castps_si SIMD::castps_si -#define _simd_adds_epu8 SIMD::adds_epu8 -#define _simd_subs_epu8 SIMD::subs_epu8 -#define _simd_add_epi8 SIMD::add_epi8 -#define _simd_cmpeq_epi64 SIMD::cmpeq_epi64 -#define _simd_cmpgt_epi64 SIMD::cmpgt_epi64 -#define _simd_cmpgt_epi8 SIMD::cmpgt_epi8 -#define _simd_cmpeq_epi8 SIMD::cmpeq_epi8 -#define _simd_cmpgt_epi16 SIMD::cmpgt_epi16 -#define _simd_cmpeq_epi16 SIMD::cmpeq_epi16 -#define _simd_movemask_epi8 SIMD::movemask_epi8 -#define _simd_permute_ps_i(a, i) SIMD::permute_ps<i>(a) -#define _simd_permute_ps SIMD::permute_ps -#define _simd_permute_epi32 SIMD::permute_epi32 -#define _simd_srlv_epi32 SIMD::srlv_epi32 -#define _simd_sllv_epi32 SIMD::sllv_epi32 - -#define _simd_unpacklo_epi8 SIMD::unpacklo_epi8 -#define _simd_unpackhi_epi8 SIMD::unpackhi_epi8 -#define _simd_unpacklo_epi16 SIMD::unpacklo_epi16 -#define _simd_unpackhi_epi16 SIMD::unpackhi_epi16 -#define _simd_unpacklo_epi32 SIMD::unpacklo_epi32 -#define _simd_unpackhi_epi32 SIMD::unpackhi_epi32 -#define _simd_unpacklo_epi64 SIMD::unpacklo_epi64 -#define _simd_unpackhi_epi64 SIMD::unpackhi_epi64 - -#define _simd_slli_epi32(a, i) 
SIMD::slli_epi32<i>(a) -#define _simd_srai_epi32(a, i) SIMD::srai_epi32<i>(a) -#define _simd_srli_epi32(a, i) SIMD::srli_epi32<i>(a) -#define _simd_srlisi_ps(a, i) SIMD::srlisi_ps<i>(a) - -#define _simd_fmadd_ps SIMD::fmadd_ps -#define _simd_fmsub_ps SIMD::fmsub_ps -#define _simd_shuffle_epi8 SIMD::shuffle_epi8 - -#define _simd_i32gather_ps(p, o, s) SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o) -#define _simd_mask_i32gather_ps(r, p, o, m, s) \ - SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m) -#define _simd_abs_epi32 SIMD::abs_epi32 - -#define _simd_cvtepu8_epi16 SIMD::cvtepu8_epi16 -#define _simd_cvtepu8_epi32 SIMD::cvtepu8_epi32 -#define _simd_cvtepu16_epi32 SIMD::cvtepu16_epi32 -#define _simd_cvtepu16_epi64 SIMD::cvtepu16_epi64 -#define _simd_cvtepu32_epi64 SIMD::cvtepu32_epi64 - -#define _simd_packus_epi16 SIMD::packus_epi16 -#define _simd_packs_epi16 SIMD::packs_epi16 -#define _simd_packus_epi32 SIMD::packus_epi32 -#define _simd_packs_epi32 SIMD::packs_epi32 - -#define _simd_unpacklo_ps SIMD::unpacklo_ps -#define _simd_unpackhi_ps SIMD::unpackhi_ps -#define _simd_unpacklo_pd SIMD::unpacklo_pd -#define _simd_unpackhi_pd SIMD::unpackhi_pd -#define _simd_insertf128_ps SIMD::insertf128_ps -#define _simd_insertf128_pd SIMD::insertf128_pd -#define _simd_insertf128_si(a, b, i) SIMD::insertf128_si<i>(a, b) -#define _simd_extractf128_ps(a, i) SIMD::extractf128_ps<i>(a) -#define _simd_extractf128_pd(a, i) SIMD::extractf128_pd<i>(a) -#define _simd_extractf128_si(a, i) SIMD::extractf128_si<i>(a) -#define _simd_permute2f128_ps(a, b, i) SIMD::permute2f128_ps<i>(a, b) -#define _simd_permute2f128_pd(a, b, i) SIMD::permute2f128_pd<i>(a, b) -#define _simd_permute2f128_si(a, b, i) SIMD::permute2f128_si<i>(a, b) -#define _simd_shuffle_ps(a, b, i) SIMD::shuffle_ps<i>(a, b) -#define _simd_shuffle_pd(a, b, i) SIMD::shuffle_pd<i>(a, b) -#define _simd_shuffle_epi32(a, b, imm8) SIMD::shuffle_epi32<imm8>(a, b) -#define _simd_shuffle_epi64(a, b, imm8) 
SIMD::shuffle_epi64<imm8>(a, b) -#define _simd_set1_epi32 SIMD::set1_epi32 -#define _simd_set_epi32 SIMD::set_epi32 -#define _simd_set_ps SIMD::set_ps -#define _simd_set1_epi8 SIMD::set1_epi8 -#define _simd_setzero_si SIMD::setzero_si -#define _simd_cvttps_epi32 SIMD::cvttps_epi32 -#define _simd_store_si SIMD::store_si -#define _simd_broadcast_ss SIMD::broadcast_ss -#define _simd_maskstore_ps SIMD::maskstore_ps -#define _simd_load_si SIMD::load_si -#define _simd_loadu_si SIMD::loadu_si -#define _simd_sub_ps SIMD::sub_ps -#define _simd_testz_ps SIMD::testz_ps -#define _simd_testz_si SIMD::testz_si -#define _simd_xor_ps SIMD::xor_ps - -#define _simd_loadu2_si SIMD::loadu2_si -#define _simd_storeu2_si SIMD::storeu2_si - -#define _simd_blendv_epi32 SIMD::blendv_epi32 -#define _simd_vmask_ps SIMD::vmask_ps - -template <int mask> -SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const& a, SIMD128::Integer const& b) -{ - return SIMD128::castps_si( - SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b))); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Compute plane equation vA * vX + vB * vY + vC -SIMDINLINE simdscalar vplaneps(simdscalar const& vA, - simdscalar const& vB, - simdscalar const& vC, - simdscalar const& vX, - simdscalar const& vY) -{ - simdscalar vOut = _simd_fmadd_ps(vA, vX, vC); - vOut = _simd_fmadd_ps(vB, vY, vOut); - return vOut; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Compute plane equation vA * vX + vB * vY + vC -SIMDINLINE simd4scalar vplaneps(simd4scalar const& vA, - simd4scalar const& vB, - simd4scalar const& vC, - simd4scalar const& vX, - simd4scalar const& vY) -{ - simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC); - vOut = _simd128_fmadd_ps(vB, vY, vOut); - return vOut; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Interpolates a single component. 
-/// @param vI - barycentric I -/// @param vJ - barycentric J -/// @param pInterpBuffer - pointer to attribute barycentric coeffs -template <UINT Attrib, UINT Comp, UINT numComponents = 4> -static SIMDINLINE simdscalar InterpolateComponent(simdscalar const& vI, - simdscalar const& vJ, - const float* pInterpBuffer) -{ - const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; - const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp]; - const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp]; - - if ((pInterpA[0] == pInterpB[0]) && (pInterpA[0] == pInterpC[0])) - { - // Ensure constant attribs are constant. Required for proper - // 3D resource copies. - return _simd_broadcast_ss(pInterpA); - } - - simdscalar vA = _simd_broadcast_ss(pInterpA); - simdscalar vB = _simd_broadcast_ss(pInterpB); - simdscalar vC = _simd_broadcast_ss(pInterpC); - - simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ); - vC = _simd_mul_ps(vk, vC); - - return vplaneps(vA, vB, vC, vI, vJ); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Interpolates a single component (flat shade). -/// @param pInterpBuffer - pointer to attribute barycentric coeffs -template <UINT Attrib, UINT Comp, UINT numComponents = 4> -static SIMDINLINE simdscalar InterpolateComponentFlat(const float* pInterpBuffer) -{ - const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; - - simdscalar vA = _simd_broadcast_ss(pInterpA); - - return vA; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Interpolates a single component (flat shade). 
-/// @param pInterpBuffer - pointer to attribute barycentric coeffs -template <UINT Attrib, UINT Comp, UINT numComponents = 4> -static SIMDINLINE simdscalari InterpolateComponentFlatInt(const uint32_t* pInterpBuffer) -{ - const uint32_t interpA = pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; - - simdscalari vA = _simd_set1_epi32(interpA); - - return vA; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Interpolates a single component. -/// @param vI - barycentric I -/// @param vJ - barycentric J -/// @param pInterpBuffer - pointer to attribute barycentric coeffs -template <UINT Attrib, UINT Comp, UINT numComponents = 4> -static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const& vI, - simd4scalar const& vJ, - const float* pInterpBuffer) -{ - const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; - const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp]; - const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp]; - - if ((pInterpA[0] == pInterpB[0]) && (pInterpA[0] == pInterpC[0])) - { - // Ensure constant attribs are constant. Required for proper - // 3D resource copies. 
- return SIMD128::broadcast_ss(pInterpA); - } - - simd4scalar vA = SIMD128::broadcast_ss(pInterpA); - simd4scalar vB = SIMD128::broadcast_ss(pInterpB); - simd4scalar vC = SIMD128::broadcast_ss(pInterpC); - - simd4scalar vk = SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f), vI), vJ); - vC = SIMD128::mul_ps(vk, vC); - - return vplaneps(vA, vB, vC, vI, vJ); -} - -static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const& a) -{ - simd4scalari ai = SIMD128::castps_si(a); - return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff))); -} - -static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const& a) -{ - simdscalari ai = _simd_castps_si(a); - return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff))); -} - -#include "simd16intrin.h" - -#endif //__SWR_SIMDINTRIN_H__ diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp deleted file mode 100644 index 53793ba101c..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp +++ /dev/null @@ -1,234 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#pragma once - -#include "simdlib_types.hpp" - -// For documentation, please see the following include... -// #include "simdlib_interface.hpp" - -namespace SIMDImpl -{ - namespace SIMD128Impl - { -#if SIMD_ARCH >= SIMD_ARCH_AVX - struct AVXImpl - { -#define __SIMD_LIB_AVX_HPP__ -#include "simdlib_128_avx.inl" -#undef __SIMD_LIB_AVX_HPP__ - }; // struct AVXImpl -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX - -#if SIMD_ARCH >= SIMD_ARCH_AVX2 - struct AVX2Impl : AVXImpl - { -#define __SIMD_LIB_AVX2_HPP__ -#include "simdlib_128_avx2.inl" -#undef __SIMD_LIB_AVX2_HPP__ - }; // struct AVX2Impl -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2 - -#if SIMD_ARCH >= SIMD_ARCH_AVX512 - struct AVX512Impl : AVX2Impl - { -#if defined(SIMD_OPT_128_AVX512) -#define __SIMD_LIB_AVX512_HPP__ -#include "simdlib_128_avx512.inl" -#if defined(SIMD_ARCH_KNIGHTS) -#include "simdlib_128_avx512_knights.inl" -#else // optimize for core -#include "simdlib_128_avx512_core.inl" -#endif // defined(SIMD_ARCH_KNIGHTS) -#undef __SIMD_LIB_AVX512_HPP__ -#endif // SIMD_OPT_128_AVX512 - }; // struct AVX2Impl -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512 - - struct Traits : SIMDImpl::Traits - { -#if SIMD_ARCH == SIMD_ARCH_AVX - using IsaImpl = AVXImpl; -#elif SIMD_ARCH == SIMD_ARCH_AVX2 - using IsaImpl = AVX2Impl; -#elif SIMD_ARCH == SIMD_ARCH_AVX512 - using IsaImpl = AVX512Impl; -#else -#error Invalid value for SIMD_ARCH -#endif - - using Float = 
SIMD128Impl::Float; - using Double = SIMD128Impl::Double; - using Integer = SIMD128Impl::Integer; - using Vec4 = SIMD128Impl::Vec4; - using Mask = SIMD128Impl::Mask; - }; - } // namespace SIMD128Impl - - namespace SIMD256Impl - { -#if SIMD_ARCH >= SIMD_ARCH_AVX - struct AVXImpl - { -#define __SIMD_LIB_AVX_HPP__ -#include "simdlib_256_avx.inl" -#undef __SIMD_LIB_AVX_HPP__ - }; // struct AVXImpl -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX - -#if SIMD_ARCH >= SIMD_ARCH_AVX2 - struct AVX2Impl : AVXImpl - { -#define __SIMD_LIB_AVX2_HPP__ -#include "simdlib_256_avx2.inl" -#undef __SIMD_LIB_AVX2_HPP__ - }; // struct AVX2Impl -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2 - -#if SIMD_ARCH >= SIMD_ARCH_AVX512 - struct AVX512Impl : AVX2Impl - { -#if defined(SIMD_OPT_256_AVX512) -#define __SIMD_LIB_AVX512_HPP__ -#include "simdlib_256_avx512.inl" -#if defined(SIMD_ARCH_KNIGHTS) -#include "simdlib_256_avx512_knights.inl" -#else // optimize for core -#include "simdlib_256_avx512_core.inl" -#endif // defined(SIMD_ARCH_KNIGHTS) -#undef __SIMD_LIB_AVX512_HPP__ -#endif // SIMD_OPT_256_AVX512 - }; // struct AVX2Impl -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512 - - struct Traits : SIMDImpl::Traits - { -#if SIMD_ARCH == SIMD_ARCH_AVX - using IsaImpl = AVXImpl; -#elif SIMD_ARCH == SIMD_ARCH_AVX2 - using IsaImpl = AVX2Impl; -#elif SIMD_ARCH == SIMD_ARCH_AVX512 - using IsaImpl = AVX512Impl; -#else -#error Invalid value for SIMD_ARCH -#endif - - using Float = SIMD256Impl::Float; - using Double = SIMD256Impl::Double; - using Integer = SIMD256Impl::Integer; - using Vec4 = SIMD256Impl::Vec4; - using Mask = SIMD256Impl::Mask; - }; - } // namespace SIMD256Impl - - namespace SIMD512Impl - { -#if SIMD_ARCH >= SIMD_ARCH_AVX - template <typename SIMD256T> - struct AVXImplBase - { -#define __SIMD_LIB_AVX_HPP__ -#include "simdlib_512_emu.inl" -#include "simdlib_512_emu_masks.inl" -#undef __SIMD_LIB_AVX_HPP__ - }; // struct AVXImplBase - using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>; -#endif // #if 
SIMD_ARCH >= SIMD_ARCH_AVX - -#if SIMD_ARCH >= SIMD_ARCH_AVX2 - using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>; -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2 - -#if SIMD_ARCH >= SIMD_ARCH_AVX512 - struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl> - { -#define __SIMD_LIB_AVX512_HPP__ -#include "simdlib_512_avx512.inl" -#include "simdlib_512_avx512_masks.inl" -#if defined(SIMD_ARCH_KNIGHTS) -#include "simdlib_512_avx512_knights.inl" -#include "simdlib_512_avx512_masks_knights.inl" -#else // optimize for core -#include "simdlib_512_avx512_core.inl" -#include "simdlib_512_avx512_masks_core.inl" -#endif // defined(SIMD_ARCH_KNIGHTS) -#undef __SIMD_LIB_AVX512_HPP__ - }; // struct AVX512ImplBase -#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512 - - struct Traits : SIMDImpl::Traits - { -#if SIMD_ARCH == SIMD_ARCH_AVX - using IsaImpl = AVXImpl; -#elif SIMD_ARCH == SIMD_ARCH_AVX2 - using IsaImpl = AVX2Impl; -#elif SIMD_ARCH == SIMD_ARCH_AVX512 - using IsaImpl = AVX512Impl; -#else -#error Invalid value for SIMD_ARCH -#endif - - using Float = SIMD512Impl::Float; - using Double = SIMD512Impl::Double; - using Integer = SIMD512Impl::Integer; - using Vec4 = SIMD512Impl::Vec4; - using Mask = SIMD512Impl::Mask; - }; - } // namespace SIMD512Impl -} // namespace SIMDImpl - -template <typename Traits> -struct SIMDBase : Traits::IsaImpl -{ - using CompareType = typename Traits::CompareType; - using ScaleFactor = typename Traits::ScaleFactor; - using RoundMode = typename Traits::RoundMode; - using SIMD = typename Traits::IsaImpl; - using Float = typename Traits::Float; - using Double = typename Traits::Double; - using Integer = typename Traits::Integer; - using Vec4 = typename Traits::Vec4; - using Mask = typename Traits::Mask; -}; // struct SIMDBase - -using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>; -using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>; -using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>; - -template <typename SIMD_T> -using CompareType = typename 
SIMD_T::CompareType; -template <typename SIMD_T> -using ScaleFactor = typename SIMD_T::ScaleFactor; -template <typename SIMD_T> -using RoundMode = typename SIMD_T::RoundMode; -template <typename SIMD_T> -using Float = typename SIMD_T::Float; -template <typename SIMD_T> -using Double = typename SIMD_T::Double; -template <typename SIMD_T> -using Integer = typename SIMD_T::Integer; -template <typename SIMD_T> -using Vec4 = typename SIMD_T::Vec4; -template <typename SIMD_T> -using Mask = typename SIMD_T::Mask; - diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl deleted file mode 100644 index 83ce967373c..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl +++ /dev/null @@ -1,593 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. -#endif - -//============================================================================ -// SIMD128 AVX (1) implementation -//============================================================================ - -#define SIMD_WRAPPER_1(op) \ - static SIMDINLINE Float SIMDCALL op(Float a) { return _mm_##op(a); } - -#define SIMD_WRAPPER_2(op) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm_##op(a, b); } - -#define SIMD_DWRAPPER_2(op) \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm_##op(a, b); } - -#define SIMD_WRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - { \ - return _mm_##op(a, b, ImmT); \ - } - -#define SIMD_DWRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - { \ - return _mm_##op(a, b, ImmT); \ - } - -#define SIMD_WRAPPER_3(op) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); } - -#define SIMD_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm_##op(a); } - -#define SIMD_IWRAPPER_1I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - { \ - return intrin(a, ImmT); \ - } -#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op) - -#define SIMD_IWRAPPER_2_(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return intrin(a, b); } - -#define SIMD_IWRAPPER_2(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer 
b) { return _mm_##op(a, b); } - -#define SIMD_IFWRAPPER_2(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return castps_si(intrin(castsi_ps(a), castsi_ps(b))); \ - } - -#define SIMD_IWRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return _mm_##op(a, b, ImmT); \ - } - -//----------------------------------------------------------------------- -// Single precision floating point arithmetic operations -//----------------------------------------------------------------------- -SIMD_WRAPPER_2(add_ps); // return a + b -SIMD_WRAPPER_2(div_ps); // return a / b -SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b -SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b -SIMD_WRAPPER_2(mul_ps); // return a * b -SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a -SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a) -SIMD_WRAPPER_2(sub_ps); // return a - b - -static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c -{ - return add_ps(mul_ps(a, b), c); -} -static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c -{ - return sub_ps(mul_ps(a, b), c); -} - -template <RoundMode RMT> -static SIMDINLINE Float SIMDCALL round_ps(Float a) -{ - return _mm_round_ps(a, static_cast<int>(RMT)); -} - -static SIMDINLINE Float SIMDCALL ceil_ps(Float a) -{ - return round_ps<RoundMode::CEIL_NOEXC>(a); -} -static SIMDINLINE Float SIMDCALL floor_ps(Float a) -{ - return round_ps<RoundMode::FLOOR_NOEXC>(a); -} - -//----------------------------------------------------------------------- -// Integer (various width) arithmetic operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) -SIMD_IWRAPPER_2(add_epi32); // return a + b (int32) -SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) -SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 
0xff : (a + b) (uint8) -SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) -SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) -SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) -SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32) -SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32) - -// return (a * b) & 0xFFFFFFFF -// -// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, -// and store the low 32 bits of the intermediate integers in dst. -SIMD_IWRAPPER_2(mullo_epi32); -SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32) -SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64) -SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) - -//----------------------------------------------------------------------- -// Logical operations -//----------------------------------------------------------------------- -SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) -SIMD_IWRAPPER_2_(and_si, _mm_and_si128); // return a & b (int) -SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) -SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b (int) -SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) -SIMD_IWRAPPER_2_(or_si, _mm_or_si128); // return a | b (int) -SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) -SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128); // return a ^ b (int) - -//----------------------------------------------------------------------- -// Shift operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT -SIMD_IWRAPPER_1I(slli_epi64); // return a << ImmT - -static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32) -{ - int32_t a, count; - a = _mm_extract_epi32(vA, 0); - count = _mm_extract_epi32(vB, 0); - a <<= count; - vA = _mm_insert_epi32(vA, a, 0); - - a = _mm_extract_epi32(vA, 1); - count = 
_mm_extract_epi32(vB, 1); - a <<= count; - vA = _mm_insert_epi32(vA, a, 1); - - a = _mm_extract_epi32(vA, 2); - count = _mm_extract_epi32(vB, 2); - a <<= count; - vA = _mm_insert_epi32(vA, a, 2); - - a = _mm_extract_epi32(vA, 3); - count = _mm_extract_epi32(vB, 3); - a <<= count; - vA = _mm_insert_epi32(vA, a, 3); - - return vA; -} - -SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32) -SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32) -SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint) - -static SIMDINLINE Integer SIMDCALL srl_epi64(Integer a, Integer n) -{ - return _mm_srl_epi64(a, n); -} - -template <int ImmT> // same as srli_si, but with Float cast to int -static SIMDINLINE Float SIMDCALL srlisi_ps(Float a) -{ - return castsi_ps(srli_si<ImmT>(castps_si(a))); -} - -static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32) -{ - int32_t a, count; - a = _mm_extract_epi32(vA, 0); - count = _mm_extract_epi32(vB, 0); - a >>= count; - vA = _mm_insert_epi32(vA, a, 0); - - a = _mm_extract_epi32(vA, 1); - count = _mm_extract_epi32(vB, 1); - a >>= count; - vA = _mm_insert_epi32(vA, a, 1); - - a = _mm_extract_epi32(vA, 2); - count = _mm_extract_epi32(vB, 2); - a >>= count; - vA = _mm_insert_epi32(vA, a, 2); - - a = _mm_extract_epi32(vA, 3); - count = _mm_extract_epi32(vB, 3); - a >>= count; - vA = _mm_insert_epi32(vA, a, 3); - - return vA; -} - -//----------------------------------------------------------------------- -// Conversion operations -//----------------------------------------------------------------------- -static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a) -{ - return _mm_castpd_ps(a); -} - -static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a) -{ - return _mm_castps_si128(a); -} - -static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a) -{ - return _mm_castsi128_pd(a); -} - -static SIMDINLINE Double SIMDCALL 
castps_pd(Float a) // return *(Double*)(&a) -{ - return _mm_castps_pd(a); -} - -static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a) -{ - return _mm_castsi128_ps(a); -} - -static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float) -{ - return _mm_cvtepi32_ps(a); -} - -static SIMDINLINE int32_t SIMDCALL cvtsi128_si32(Integer a) // return a.v[0] -{ - return _mm_cvtsi128_si32(a); -} - -static SIMDINLINE Integer SIMDCALL cvtsi32_si128(int32_t n) // return a[0] = n, a[1]...a[3] = 0 -{ - return _mm_cvtsi32_si128(n); -} - -SIMD_IWRAPPER_1(cvtepu8_epi16); // return (int16)a (uint8 --> int16) -SIMD_IWRAPPER_1(cvtepu8_epi32); // return (int32)a (uint8 --> int32) -SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a (uint16 --> int32) -SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a (uint16 --> int64) -SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a (uint32 --> int64) - -static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32) -{ - return _mm_cvtps_epi32(a); -} - -static SIMDINLINE Integer SIMDCALL - cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32) -{ - return _mm_cvttps_epi32(a); -} - -//----------------------------------------------------------------------- -// Comparison operations -//----------------------------------------------------------------------- -template <CompareType CmpTypeT> -static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b -{ - return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT)); -} -static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) -{ - return cmp_ps<CompareType::LT_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) -{ - return cmp_ps<CompareType::GT_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) -{ - return cmp_ps<CompareType::NEQ_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) -{ - return 
cmp_ps<CompareType::EQ_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) -{ - return cmp_ps<CompareType::GE_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) -{ - return cmp_ps<CompareType::LE_OQ>(a, b); -} - -SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) -SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) -SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) -SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) -SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) -SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) -SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) -SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) -SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32) - -static SIMDINLINE bool SIMDCALL testz_ps(Float a, - Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float) -{ - return 0 != _mm_testz_ps(a, b); -} - -static SIMDINLINE bool SIMDCALL testz_si(Integer a, - Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int) -{ - return 0 != _mm_testz_si128(a, b); -} - -//----------------------------------------------------------------------- -// Blend / shuffle / permute operations -//----------------------------------------------------------------------- -SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) -SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) - -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, - Integer b, - Float mask) // return mask ? b : a (int) -{ - return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask)); -} - -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, - Integer b, - Integer mask) // return mask ? 
b : a (int) -{ - return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask))); -} - -static SIMDINLINE Float SIMDCALL - broadcast_ss(float const* p) // return *p (all elements in vector get same value) -{ - return _mm_broadcast_ss(p); -} - -SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm_packs_epi16 and _mm512_packs_epi16 -SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm_packs_epi32 and _mm512_packs_epi32 -SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16 -SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32 - -static SIMDINLINE Integer SIMDCALL - permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float) -{ - return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz)); -} - -static SIMDINLINE Float SIMDCALL - permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float) -{ - return _mm_permutevar_ps(a, swiz); -} - -SIMD_IWRAPPER_1I(shuffle_epi32); - -template <int ImmT> -static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete; - -SIMD_IWRAPPER_2(shuffle_epi8); -SIMD_DWRAPPER_2I(shuffle_pd); -SIMD_WRAPPER_2I(shuffle_ps); -SIMD_IWRAPPER_2(unpackhi_epi16); - -// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps); -static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b) -{ - return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b))); -} - -SIMD_IWRAPPER_2(unpackhi_epi64); -SIMD_IWRAPPER_2(unpackhi_epi8); -SIMD_DWRAPPER_2(unpackhi_pd); -SIMD_WRAPPER_2(unpackhi_ps); -SIMD_IWRAPPER_2(unpacklo_epi16); -SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps); -SIMD_IWRAPPER_2(unpacklo_epi64); -SIMD_IWRAPPER_2(unpacklo_epi8); -SIMD_DWRAPPER_2(unpacklo_pd); -SIMD_WRAPPER_2(unpacklo_ps); - -//----------------------------------------------------------------------- -// Load / store operations -//----------------------------------------------------------------------- 
-template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) -{ - uint32_t* pOffsets = (uint32_t*)&idx; - Float vResult; - float* pResult = (float*)&vResult; - for (uint32_t i = 0; i < SIMD_WIDTH; ++i) - { - uint32_t offset = pOffsets[i]; - offset = offset * static_cast<uint32_t>(ScaleT); - pResult[i] = *(float const*)(((uint8_t const*)p + offset)); - } - - return vResult; -} - -static SIMDINLINE Float SIMDCALL - load1_ps(float const* p) // return *p (broadcast 1 value to all elements) -{ - return broadcast_ss(p); -} - -static SIMDINLINE Float SIMDCALL - load_ps(float const* p) // return *p (loads SIMD width elements from memory) -{ - return _mm_load_ps(p); -} - -static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p -{ - return _mm_load_si128(&p->v); -} - -static SIMDINLINE Float SIMDCALL - loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem) -{ - return _mm_loadu_ps(p); -} - -static SIMDINLINE Integer SIMDCALL - loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem) -{ - return _mm_lddqu_si128(&p->v); -} - -// for each element: (mask & (1 << 31)) ? 
(i32gather_ps<ScaleT>(p, idx), mask = 0) : old -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) -{ - uint32_t* pOffsets = (uint32_t*)&idx; - Float vResult = old; - float* pResult = (float*)&vResult; - unsigned long index; - uint32_t umask = movemask_ps(mask); - while (_BitScanForward(&index, umask)) - { - umask &= ~(1 << index); - uint32_t offset = pOffsets[index]; - offset = offset * static_cast<uint32_t>(ScaleT); - pResult[index] = *(float const*)(((uint8_t const*)p + offset)); - } - - return vResult; -} - -static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src) -{ - _mm_maskstore_ps(p, mask, src); -} - -static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a) -{ - return static_cast<uint32_t>(_mm_movemask_epi8(a)); -} - -static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a) -{ - return static_cast<uint32_t>(_mm_movemask_pd(a)); -} -static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a) -{ - return static_cast<uint32_t>(_mm_movemask_ps(a)); -} - -static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value) -{ - return _mm_set1_epi32(i); -} - -static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value) -{ - return _mm_set1_epi8(i); -} - -static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) -{ - return _mm_set1_ps(f); -} - -static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) -{ - return _mm_setzero_ps(); -} - -static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) -{ - return _mm_setzero_si128(); -} - -static SIMDINLINE void SIMDCALL - store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory) -{ - _mm_store_ps(p, a); -} - -static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a -{ - _mm_store_si128(&p->v, a); -} - -static SIMDINLINE void SIMDCALL - 
storeu_si(Integer* p, Integer a) // *p = a (same as store_si but allows for unaligned mem) -{ - _mm_storeu_si128(&p->v, a); -} - -static SIMDINLINE void SIMDCALL - stream_ps(float* p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache) -{ - _mm_stream_ps(p, a); -} - -static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0) -{ - return _mm_set_ps(in3, in2, in1, in0); -} - -static SIMDINLINE Integer SIMDCALL set_epi32(int in3, int in2, int in1, int in0) -{ - return _mm_set_epi32(in3, in2, in1, in0); -} - -template <int ImmT> -static SIMDINLINE float SIMDCALL extract_ps(Float a) -{ - int tmp = _mm_extract_ps(a, ImmT); - return *reinterpret_cast<float*>(&tmp); -} - -static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) -{ - Integer vec = set1_epi32(mask); - const Integer bit = set_epi32(0x08, 0x04, 0x02, 0x01); - vec = and_si(vec, bit); - vec = cmplt_epi32(setzero_si(), vec); - return castsi_ps(vec); -} - -#undef SIMD_WRAPPER_1 -#undef SIMD_WRAPPER_2 -#undef SIMD_DWRAPPER_2 -#undef SIMD_DWRAPPER_2I -#undef SIMD_WRAPPER_2I -#undef SIMD_WRAPPER_3 -#undef SIMD_IWRAPPER_1 -#undef SIMD_IWRAPPER_2 -#undef SIMD_IFWRAPPER_2 -#undef SIMD_IWRAPPER_2I -#undef SIMD_IWRAPPER_1 -#undef SIMD_IWRAPPER_1I -#undef SIMD_IWRAPPER_1I_ -#undef SIMD_IWRAPPER_2 -#undef SIMD_IWRAPPER_2_ -#undef SIMD_IWRAPPER_2I diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl deleted file mode 100644 index 0da66ebb56c..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl +++ /dev/null @@ -1,66 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX2_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. -#endif - -//============================================================================ -// SIMD4 AVX (2) implementation -// -// Since this implementation inherits from the AVX (1) implementation, -// the only operations below ones that replace AVX (1) operations. 
-// Only 2 shifts and 2 gathers were introduced with AVX 2 -// Also, add native support for FMA operations -//============================================================================ -#define SIMD_WRAPPER_3(op) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); } - -SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c -SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c - -static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32) -{ - return _mm_sllv_epi32(vA, vB); -} - -static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32) -{ - return _mm_srlv_epi32(vA, vB); -} - -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) -{ - return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT)); -} - -// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) -{ - return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT)); -} - -#undef SIMD_WRAPPER_3 diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl deleted file mode 100644 index b076daa080a..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl +++ /dev/null @@ -1,368 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX512_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. -#endif - -//============================================================================ -// SIMD128 AVX (512) implementation -// -// Since this implementation inherits from the AVX (2) implementation, -// the only operations below ones that replace AVX (2) operations. -// These use native AVX512 instructions with masking to enable a larger -// register set. 
-//============================================================================ - -private: -static SIMDINLINE __m512 __conv(Float r) -{ - return _mm512_castps128_ps512(r.v); -} -static SIMDINLINE __m512d __conv(Double r) -{ - return _mm512_castpd128_pd512(r.v); -} -static SIMDINLINE __m512i __conv(Integer r) -{ - return _mm512_castsi128_si512(r.v); -} -static SIMDINLINE Float __conv(__m512 r) -{ - return _mm512_castps512_ps128(r); -} -static SIMDINLINE Double __conv(__m512d r) -{ - return _mm512_castpd512_pd128(r); -} -static SIMDINLINE Integer __conv(__m512i r) -{ - return _mm512_castsi512_si128(r); -} - -public: -#define SIMD_WRAPPER_1_(op, intrin, mask) \ - static SIMDINLINE Float SIMDCALL op(Float a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \ - } -#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf)) - -#define SIMD_WRAPPER_1I_(op, intrin, mask) \ - template <int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \ - } -#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf)) - -#define SIMD_WRAPPER_2_(op, intrin, mask) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \ - } -#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf)) - -#define SIMD_WRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - { \ - return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \ - } - -#define SIMD_WRAPPER_3_(op, intrin, mask) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \ - } -#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf)) - -#define SIMD_DWRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - { \ - return 
__conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT)); \ - } - -#define SIMD_IWRAPPER_1_(op, intrin, mask) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \ - } -#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xf)) - -#define SIMD_IWRAPPER_1I_(op, intrin, mask) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \ - } -#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf)) - -#define SIMD_IWRAPPER_2_(op, intrin, mask) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \ - } -#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xf)) - -#define SIMD_IWRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \ - } - -//----------------------------------------------------------------------- -// Single precision floating point arithmetic operations -//----------------------------------------------------------------------- -SIMD_WRAPPER_2(add_ps); // return a + b -SIMD_WRAPPER_2(div_ps); // return a / b -SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c -SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c -SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b -SIMD_WRAPPER_2(min_ps); // return (a < b) ? 
a : b -SIMD_WRAPPER_2(mul_ps); // return a * b -SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xf)); // return 1.0f / a -SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xf)); // return 1.0f / sqrt(a) -SIMD_WRAPPER_2(sub_ps); // return a - b - -//----------------------------------------------------------------------- -// Integer (various width) arithmetic operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32) -SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32) -SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32) -SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32) -SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32) -SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32) -SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32) - -// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8) -// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) - -// return (a * b) & 0xFFFFFFFF -// -// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, -// and store the low 32 bits of the intermediate integers in dst. -SIMD_IWRAPPER_2_32(mullo_epi32); -SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32) - -// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64) -// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 
0 : (a - b) (uint8) - -//----------------------------------------------------------------------- -// Logical operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xf)); // return a & b (int) -SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b (int) -SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xf)); // return a | b (int) -SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xf)); // return a ^ b (int) - -//----------------------------------------------------------------------- -// Shift operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT -SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32) -SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32) -SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32) -SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32) - -// use AVX2 version -// SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint) - -//----------------------------------------------------------------------- -// Conversion operations (Use AVX2 versions) -//----------------------------------------------------------------------- -// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16) -// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32) -// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32) -// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64) -// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64) - -//----------------------------------------------------------------------- -// Comparison operations (Use AVX2 versions -//----------------------------------------------------------------------- -// SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8) -// SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16) 
-// SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32) -// SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64) -// SIMD_IWRAPPER_2_CMP(cmpgt_epi8,); // return a > b (int8) -// SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16) -// SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32) -// SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64) -// -// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32) -//{ -// return cmpgt_epi32(b, a); -//} - -//----------------------------------------------------------------------- -// Blend / shuffle / permute operations -//----------------------------------------------------------------------- -// SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 -// and _mm512_packs_epi16 SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation -// for _mm256_packs_epi32 and _mm512_packs_epi32 SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> -// uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 -// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for -// _mm256_packus_epi32 and _mm512_packus_epi32 SIMD_IWRAPPER_2_(permute_epi32, -// permutevar8x32_epi32); - -// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for -// each 32-bit lane i (float) -//{ -// return _mm256_permutevar8x32_ps(a, swiz); -//} - -SIMD_IWRAPPER_1I_32(shuffle_epi32); -// template<int ImmT> -// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) -//{ -// return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b))); -//} -// SIMD_IWRAPPER_2(shuffle_epi8); -SIMD_IWRAPPER_2_32(unpackhi_epi32); -SIMD_IWRAPPER_2_32(unpacklo_epi32); - -// SIMD_IWRAPPER_2_16(unpackhi_epi16); -// SIMD_IWRAPPER_2_64(unpackhi_epi64); -// SIMD_IWRAPPER_2_8(unpackhi_epi8); -// SIMD_IWRAPPER_2_16(unpacklo_epi16); -// SIMD_IWRAPPER_2_64(unpacklo_epi64); -// 
SIMD_IWRAPPER_2_8(unpacklo_epi8); - -//----------------------------------------------------------------------- -// Load / store operations -//----------------------------------------------------------------------- -static SIMDINLINE Float SIMDCALL - load_ps(float const* p) // return *p (loads SIMD width elements from memory) -{ - return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p)); -} - -static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p -{ - return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p)); -} - -static SIMDINLINE Float SIMDCALL - loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem) -{ - return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p)); -} - -static SIMDINLINE Integer SIMDCALL - loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem) -{ - return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p)); -} - -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) -{ - return __conv(_mm512_mask_i32gather_ps( - _mm512_setzero_ps(), __mmask16(0xf), __conv(idx), p, static_cast<int>(ScaleT))); -} - -// for each element: (mask & (1 << 31)) ? 
(i32gather_ps<ScaleT>(p, idx), mask = 0) : old -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) -{ - __mmask16 m = 0xf; - m = _mm512_mask_test_epi32_mask( - m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000)); - return __conv( - _mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT))); -} - -// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a) -// { -// __mmask64 m = 0xffffull; -// return static_cast<uint32_t>( -// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80))); -// } - -static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src) -{ - __mmask16 m = 0xf; - m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000)); - _mm512_mask_storeu_ps(p, m, __conv(src)); -} - -static SIMDINLINE void SIMDCALL - store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory) -{ - _mm512_mask_storeu_ps(p, __mmask16(0xf), __conv(a)); -} - -static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a -{ - _mm512_mask_storeu_epi32(p, __mmask16(0xf), __conv(a)); -} - -static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) -{ - return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xf), -1))); -} - -//======================================================================= -// Legacy interface (available only in SIMD256 width) -//======================================================================= - -#undef SIMD_WRAPPER_1_ -#undef SIMD_WRAPPER_1 -#undef SIMD_WRAPPER_1I_ -#undef SIMD_WRAPPER_1I -#undef SIMD_WRAPPER_2_ -#undef SIMD_WRAPPER_2 -#undef SIMD_WRAPPER_2I -#undef SIMD_WRAPPER_3_ -#undef SIMD_WRAPPER_3 -#undef SIMD_DWRAPPER_1_ -#undef SIMD_DWRAPPER_1 -#undef SIMD_DWRAPPER_1I_ -#undef SIMD_DWRAPPER_1I -#undef SIMD_DWRAPPER_2_ -#undef SIMD_DWRAPPER_2 -#undef SIMD_DWRAPPER_2I -#undef SIMD_IWRAPPER_1_ -#undef 
SIMD_IWRAPPER_1_8 -#undef SIMD_IWRAPPER_1_16 -#undef SIMD_IWRAPPER_1_32 -#undef SIMD_IWRAPPER_1_64 -#undef SIMD_IWRAPPER_1I_ -#undef SIMD_IWRAPPER_1I_8 -#undef SIMD_IWRAPPER_1I_16 -#undef SIMD_IWRAPPER_1I_32 -#undef SIMD_IWRAPPER_1I_64 -#undef SIMD_IWRAPPER_2_ -#undef SIMD_IWRAPPER_2_8 -#undef SIMD_IWRAPPER_2_16 -#undef SIMD_IWRAPPER_2_32 -#undef SIMD_IWRAPPER_2_64 -#undef SIMD_IWRAPPER_2I -//#undef SIMD_IWRAPPER_2I_8 -//#undef SIMD_IWRAPPER_2I_16 -//#undef SIMD_IWRAPPER_2I_32 -//#undef SIMD_IWRAPPER_2I_64 diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl deleted file mode 100644 index 16e59c4decb..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl +++ /dev/null @@ -1,196 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX512_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. -#endif - -//============================================================================ -// SIMD128 AVX (512) implementation -// -// Since this implementation inherits from the AVX (2) implementation, -// the only operations below ones that replace AVX (2) operations. -// These use native AVX512 instructions with masking to enable a larger -// register set. -//============================================================================ - -#define SIMD_WRAPPER_1_(op, intrin, mask) \ - static SIMDINLINE Float SIMDCALL op(Float a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \ - } -#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf)) - -#define SIMD_WRAPPER_1I_(op, intrin, mask) \ - template <int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \ - } -#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf)) - -#define SIMD_WRAPPER_2_(op, intrin, mask) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \ - } -#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf)) - -#define SIMD_WRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - { \ - return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \ - } - -#define SIMD_WRAPPER_3_(op, intrin, mask) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \ - { \ - return 
__conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \ - } -#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf)) - -#define SIMD_DWRAPPER_1_(op, intrin, mask) \ - static SIMDINLINE Double SIMDCALL op(Double a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \ - } -#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3)) - -#define SIMD_DWRAPPER_1I_(op, intrin, mask) \ - template <int ImmT> \ - static SIMDINLINE Double SIMDCALL op(Double a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \ - } -#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3)) - -#define SIMD_DWRAPPER_2_(op, intrin, mask) \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \ - } -#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3)) - -#define SIMD_DWRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - { \ - return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT)); \ - } - -#define SIMD_IWRAPPER_1_(op, intrin, mask) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \ - } -#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull)) -#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff)) -#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3)) - -#define SIMD_IWRAPPER_1I_(op, intrin, mask) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \ - } -#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull)) -#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff)) -#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3)) - -#define SIMD_IWRAPPER_2_(op, intrin, mask) \ - 
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \ - } -#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull)) -#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff)) -#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3)) - -#define SIMD_IWRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \ - } - -SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8) -SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) -SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64) -SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) -SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and - // _mm512_packs_epi16 -SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and - // _mm512_packs_epi32 -SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and - // _mm512_packus_epi16 -SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and - // _mm512_packus_epi32 -SIMD_IWRAPPER_2_16(unpackhi_epi16); -SIMD_IWRAPPER_2_64(unpackhi_epi64); -SIMD_IWRAPPER_2_8(unpackhi_epi8); -SIMD_IWRAPPER_2_16(unpacklo_epi16); -SIMD_IWRAPPER_2_64(unpacklo_epi64); -SIMD_IWRAPPER_2_8(unpacklo_epi8); - -static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a) -{ - __mmask64 m = 0xffffull; - return static_cast<uint32_t>(_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80))); -} - -#undef SIMD_WRAPPER_1_ -#undef SIMD_WRAPPER_1 -#undef SIMD_WRAPPER_1I_ -#undef SIMD_WRAPPER_1I -#undef SIMD_WRAPPER_2_ -#undef SIMD_WRAPPER_2 -#undef SIMD_WRAPPER_2I -#undef SIMD_WRAPPER_3_ -#undef SIMD_WRAPPER_3 -#undef SIMD_DWRAPPER_1_ -#undef 
SIMD_DWRAPPER_1 -#undef SIMD_DWRAPPER_1I_ -#undef SIMD_DWRAPPER_1I -#undef SIMD_DWRAPPER_2_ -#undef SIMD_DWRAPPER_2 -#undef SIMD_DWRAPPER_2I -#undef SIMD_IWRAPPER_1_ -#undef SIMD_IWRAPPER_1_8 -#undef SIMD_IWRAPPER_1_16 -#undef SIMD_IWRAPPER_1_32 -#undef SIMD_IWRAPPER_1_64 -#undef SIMD_IWRAPPER_1I_ -#undef SIMD_IWRAPPER_1I_8 -#undef SIMD_IWRAPPER_1I_16 -#undef SIMD_IWRAPPER_1I_32 -#undef SIMD_IWRAPPER_1I_64 -#undef SIMD_IWRAPPER_2_ -#undef SIMD_IWRAPPER_2_8 -#undef SIMD_IWRAPPER_2_16 -#undef SIMD_IWRAPPER_2_32 -#undef SIMD_IWRAPPER_2_64 -#undef SIMD_IWRAPPER_2I -//#undef SIMD_IWRAPPER_2I_8 -//#undef SIMD_IWRAPPER_2I_16 -//#undef SIMD_IWRAPPER_2I_32 -//#undef SIMD_IWRAPPER_2I_64 diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl deleted file mode 100644 index 1b6592e2003..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl +++ /dev/null @@ -1,34 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX512_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. -#endif - -//============================================================================ -// SIMD128 AVX (512) implementation for Knights Family -// -// Since this implementation inherits from the AVX512Base implementation, -// the only operations below ones that replace AVX512F / AVX512CD operations -// These use native AVX512 instructions with masking to enable a larger -// register set. -//============================================================================ diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl deleted file mode 100644 index d0c3ecd4cf3..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl +++ /dev/null @@ -1,826 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. 
-#endif - -using SIMD128T = SIMD128Impl::AVXImpl; - -//============================================================================ -// SIMD256 AVX (1) implementation -//============================================================================ - -#define SIMD_WRAPPER_1(op) \ - static SIMDINLINE Float SIMDCALL op(Float const& a) { return _mm256_##op(a); } - -#define SIMD_WRAPPER_2(op) \ - static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \ - { \ - return _mm256_##op(a, b); \ - } - -#define SIMD_DWRAPPER_2(op) \ - static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \ - { \ - return _mm256_##op(a, b); \ - } - -#define SIMD_WRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \ - { \ - return _mm256_##op(a, b, ImmT); \ - } - -#define SIMD_DWRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \ - { \ - return _mm256_##op(a, b, ImmT); \ - } - -#define SIMD_WRAPPER_3(op) \ - static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \ - { \ - return _mm256_##op(a, b, c); \ - } - -#define SIMD_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); } - -#define SIMD_IWRAPPER_2(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ - { \ - return _mm256_##op(a, b); \ - } - -#define SIMD_IFWRAPPER_2(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ - { \ - return castps_si(intrin(castsi_ps(a), castsi_ps(b))); \ - } - -#define SIMD_IFWRAPPER_2I(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ - { \ - return castps_si(intrin(castsi_ps(a), castsi_ps(b), ImmT)); \ - } - -#define SIMD_IWRAPPER_2I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ - { \ - 
return _mm256_##intrin(a, b, ImmT); \ - } -#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) - -#define SIMD_IWRAPPER_3(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \ - { \ - return _mm256_##op(a, b, c); \ - } - -// emulated integer simd -#define SIMD_EMU_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ - { \ - return Integer{ \ - SIMD128T::op(a.v4[0]), \ - SIMD128T::op(a.v4[1]), \ - }; \ - } -#define SIMD_EMU_IWRAPPER_1L(op, shift) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ - { \ - return Integer{ \ - SIMD128T::op(a.v4[0]), \ - SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \ - }; \ - } \ - static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer const& a) \ - { \ - return Integer{ \ - SIMD128T::op(a), \ - SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \ - }; \ - } - -#define SIMD_EMU_IWRAPPER_1I(op) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ - { \ - return Integer{ \ - SIMD128T::template op<ImmT>(a.v4[0]), \ - SIMD128T::template op<ImmT>(a.v4[1]), \ - }; \ - } - -#define SIMD_EMU_IWRAPPER_2(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ - { \ - return Integer{ \ - SIMD128T::op(a.v4[0], b.v4[0]), \ - SIMD128T::op(a.v4[1], b.v4[1]), \ - }; \ - } - -#define SIMD_EMU_IWRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ - { \ - return Integer{ \ - SIMD128T::template op<ImmT>(a.v4[0], b.v[0]), \ - SIMD128T::template op<ImmT>(a.v4[1], b.v[1]), \ - }; \ - } - -//----------------------------------------------------------------------- -// Single precision floating point arithmetic operations -//----------------------------------------------------------------------- -SIMD_WRAPPER_2(add_ps); // return a + b -SIMD_WRAPPER_2(div_ps); // return a / b - -static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a, 
- Float const& b, - Float const& c) // return (a * b) + c -{ - return add_ps(mul_ps(a, b), c); -} - -static SIMDINLINE Float SIMDCALL fmsub_ps(Float const& a, - Float const& b, - Float const& c) // return (a * b) - c -{ - return sub_ps(mul_ps(a, b), c); -} - -SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b -SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b -SIMD_WRAPPER_2(mul_ps); // return a * b -SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a -SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a) -SIMD_WRAPPER_2(sub_ps); // return a - b - -template <RoundMode RMT> -static SIMDINLINE Float SIMDCALL round_ps(Float const& a) -{ - return _mm256_round_ps(a, static_cast<int>(RMT)); -} - -static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a) -{ - return round_ps<RoundMode::CEIL_NOEXC>(a); -} -static SIMDINLINE Float SIMDCALL floor_ps(Float const& a) -{ - return round_ps<RoundMode::FLOOR_NOEXC>(a); -} - -//----------------------------------------------------------------------- -// Integer (various width) arithmetic operations -//----------------------------------------------------------------------- -SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) -SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32) -SIMD_EMU_IWRAPPER_2(add_epi8); // return a + b (int8) -SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) -SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) -SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) -SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) -SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32) -SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32) - -// return (a * b) & 0xFFFFFFFF -// -// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, -// and store the low 32 bits of the intermediate integers in dst. 
-SIMD_EMU_IWRAPPER_2(mullo_epi32); -SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32) -SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64) -SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) - -//----------------------------------------------------------------------- -// Logical operations -//----------------------------------------------------------------------- -SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) -SIMD_IFWRAPPER_2(and_si, _mm256_and_ps); // return a & b (int) -SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) -SIMD_IFWRAPPER_2(andnot_si, _mm256_andnot_ps); // return (~a) & b (int) -SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) -SIMD_IFWRAPPER_2(or_si, _mm256_or_ps); // return a | b (int) -SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) -SIMD_IFWRAPPER_2(xor_si, _mm256_xor_ps); // return a ^ b (int) - -//----------------------------------------------------------------------- -// Shift operations -//----------------------------------------------------------------------- -SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT - -static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const& vA, - Integer const& vCount) // return a << b (uint32) -{ - int32_t aHi, aLow, countHi, countLow; - __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); - __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); - __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); - __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0)); - - aHi = _mm_extract_epi32(vAHi, 0); - countHi = _mm_extract_epi32(vCountHi, 0); - aHi <<= countHi; - vAHi = _mm_insert_epi32(vAHi, aHi, 0); - - aLow = _mm_extract_epi32(vALow, 0); - countLow = _mm_extract_epi32(vCountLow, 0); - aLow <<= countLow; - vALow = _mm_insert_epi32(vALow, aLow, 0); - - aHi = _mm_extract_epi32(vAHi, 
1); - countHi = _mm_extract_epi32(vCountHi, 1); - aHi <<= countHi; - vAHi = _mm_insert_epi32(vAHi, aHi, 1); - - aLow = _mm_extract_epi32(vALow, 1); - countLow = _mm_extract_epi32(vCountLow, 1); - aLow <<= countLow; - vALow = _mm_insert_epi32(vALow, aLow, 1); - - aHi = _mm_extract_epi32(vAHi, 2); - countHi = _mm_extract_epi32(vCountHi, 2); - aHi <<= countHi; - vAHi = _mm_insert_epi32(vAHi, aHi, 2); - - aLow = _mm_extract_epi32(vALow, 2); - countLow = _mm_extract_epi32(vCountLow, 2); - aLow <<= countLow; - vALow = _mm_insert_epi32(vALow, aLow, 2); - - aHi = _mm_extract_epi32(vAHi, 3); - countHi = _mm_extract_epi32(vCountHi, 3); - aHi <<= countHi; - vAHi = _mm_insert_epi32(vAHi, aHi, 3); - - aLow = _mm_extract_epi32(vALow, 3); - countLow = _mm_extract_epi32(vCountLow, 3); - aLow <<= countLow; - vALow = _mm_insert_epi32(vALow, aLow, 3); - - __m256i ret = _mm256_set1_epi32(0); - ret = _mm256_insertf128_si256(ret, vAHi, 1); - ret = _mm256_insertf128_si256(ret, vALow, 0); - return ret; -} - -SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32) -SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32) -SIMD_EMU_IWRAPPER_1I(srli_si); // return a >> (ImmT*8) (uint) - -template <int ImmT> // same as srli_si, but with Float cast to int -static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a) -{ - return castsi_ps(srli_si<ImmT>(castps_si(a))); -} - -static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const& vA, - Integer const& vCount) // return a >> b (uint32) -{ - int32_t aHi, aLow, countHi, countLow; - __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); - __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); - __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); - __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0)); - - aHi = _mm_extract_epi32(vAHi, 0); - countHi = _mm_extract_epi32(vCountHi, 0); - aHi >>= countHi; 
- vAHi = _mm_insert_epi32(vAHi, aHi, 0); - - aLow = _mm_extract_epi32(vALow, 0); - countLow = _mm_extract_epi32(vCountLow, 0); - aLow >>= countLow; - vALow = _mm_insert_epi32(vALow, aLow, 0); - - aHi = _mm_extract_epi32(vAHi, 1); - countHi = _mm_extract_epi32(vCountHi, 1); - aHi >>= countHi; - vAHi = _mm_insert_epi32(vAHi, aHi, 1); - - aLow = _mm_extract_epi32(vALow, 1); - countLow = _mm_extract_epi32(vCountLow, 1); - aLow >>= countLow; - vALow = _mm_insert_epi32(vALow, aLow, 1); - - aHi = _mm_extract_epi32(vAHi, 2); - countHi = _mm_extract_epi32(vCountHi, 2); - aHi >>= countHi; - vAHi = _mm_insert_epi32(vAHi, aHi, 2); - - aLow = _mm_extract_epi32(vALow, 2); - countLow = _mm_extract_epi32(vCountLow, 2); - aLow >>= countLow; - vALow = _mm_insert_epi32(vALow, aLow, 2); - - aHi = _mm_extract_epi32(vAHi, 3); - countHi = _mm_extract_epi32(vCountHi, 3); - aHi >>= countHi; - vAHi = _mm_insert_epi32(vAHi, aHi, 3); - - aLow = _mm_extract_epi32(vALow, 3); - countLow = _mm_extract_epi32(vCountLow, 3); - aLow >>= countLow; - vALow = _mm_insert_epi32(vALow, aLow, 3); - - __m256i ret = _mm256_set1_epi32(0); - ret = _mm256_insertf128_si256(ret, vAHi, 1); - ret = _mm256_insertf128_si256(ret, vALow, 0); - return ret; -} - -//----------------------------------------------------------------------- -// Conversion operations -//----------------------------------------------------------------------- -static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a) -{ - return _mm256_castpd_ps(a); -} - -static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a) -{ - return _mm256_castps_si256(a); -} - -static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a) -{ - return _mm256_castsi256_pd(a); -} - -static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a) -{ - return _mm256_castps_pd(a); -} - -static SIMDINLINE Integer SIMDCALL castpd_si(Double const& a) // return *(Integer*)(&a) -{ - 
return _mm256_castpd_si256(a); -} - -static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a) -{ - return _mm256_castsi256_ps(a); -} - -static SIMDINLINE Float SIMDCALL - cvtepi32_ps(Integer const& a) // return (float)a (int32 --> float) -{ - return _mm256_cvtepi32_ps(a); -} - -SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8); // return (int16)a (uint8 --> int16) -SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4); // return (int32)a (uint8 --> int32) -SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a (uint16 --> int32) -SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a (uint16 --> int64) -SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a (uint32 --> int64) - -static SIMDINLINE Integer SIMDCALL - cvtps_epi32(Float const& a) // return (int32)a (float --> int32) -{ - return _mm256_cvtps_epi32(a); -} - -static SIMDINLINE Integer SIMDCALL - cvttps_epi32(Float const& a) // return (int32)a (rnd_to_zero(float) --> int32) -{ - return _mm256_cvttps_epi32(a); -} - -//----------------------------------------------------------------------- -// Comparison operations -//----------------------------------------------------------------------- -template <CompareType CmpTypeT> -static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b -{ - return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT)); -} -static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b) -{ - return cmp_ps<CompareType::LT_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b) -{ - return cmp_ps<CompareType::GT_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b) -{ - return cmp_ps<CompareType::NEQ_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b) -{ - return cmp_ps<CompareType::EQ_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b) -{ - return cmp_ps<CompareType::GE_OQ>(a, 
b); -} -static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b) -{ - return cmp_ps<CompareType::LE_OQ>(a, b); -} - -SIMD_EMU_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) -SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) -SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) -SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) -SIMD_EMU_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) -SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) -SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) -SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) -SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32) - -static SIMDINLINE bool SIMDCALL - testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float) -{ - return 0 != _mm256_testz_ps(a, b); -} - -static SIMDINLINE bool SIMDCALL - testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int) -{ - return 0 != _mm256_testz_si256(a, b); -} - -//----------------------------------------------------------------------- -// Blend / shuffle / permute operations -//----------------------------------------------------------------------- -SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) -SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a (int32) -SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) - -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a, - Integer const& b, - Float const& mask) // return mask ? b : a (int) -{ - return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask)); -} - -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a, - Integer const& b, - Integer const& mask) // return mask ? 
b : a (int) -{ - return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask))); -} - -static SIMDINLINE Float SIMDCALL - broadcast_ss(float const* p) // return *p (all elements in vector get same value) -{ - return _mm256_broadcast_ss(p); -} - -SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 -SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 -SIMD_EMU_IWRAPPER_2( - packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 -SIMD_EMU_IWRAPPER_2( - packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 - -template <int ImmT> -static SIMDINLINE Float SIMDCALL permute_ps(Float const& a) -{ - return _mm256_permute_ps(a, ImmT); -} - -static SIMDINLINE Integer SIMDCALL permute_epi32( - Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32) -{ - Integer result; - - // Ugly slow implementation - uint32_t const* pA = reinterpret_cast<uint32_t const*>(&a); - uint32_t const* pSwiz = reinterpret_cast<uint32_t const*>(&swiz); - uint32_t* pResult = reinterpret_cast<uint32_t*>(&result); - - for (uint32_t i = 0; i < SIMD_WIDTH; ++i) - { - pResult[i] = pA[0xF & pSwiz[i]]; - } - - return result; -} - -static SIMDINLINE Float SIMDCALL - permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float) -{ - Float result; - - // Ugly slow implementation - float const* pA = reinterpret_cast<float const*>(&a); - uint32_t const* pSwiz = reinterpret_cast<uint32_t const*>(&swiz); - float* pResult = reinterpret_cast<float*>(&result); - - for (uint32_t i = 0; i < SIMD_WIDTH; ++i) - { - pResult[i] = pA[0xF & pSwiz[i]]; - } - - return result; -} - -SIMD_WRAPPER_2I(permute2f128_ps); -SIMD_DWRAPPER_2I(permute2f128_pd); -SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256); - -SIMD_EMU_IWRAPPER_1I(shuffle_epi32); - -template <int ImmT> -static SIMDINLINE Integer 
SIMDCALL shuffle_epi64(Integer const& a, Integer const& b) -{ - return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b))); -} -SIMD_EMU_IWRAPPER_2(shuffle_epi8); -SIMD_DWRAPPER_2I(shuffle_pd); -SIMD_WRAPPER_2I(shuffle_ps); -SIMD_EMU_IWRAPPER_2(unpackhi_epi16); -SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps); -SIMD_EMU_IWRAPPER_2(unpackhi_epi64); -SIMD_EMU_IWRAPPER_2(unpackhi_epi8); -SIMD_DWRAPPER_2(unpackhi_pd); -SIMD_WRAPPER_2(unpackhi_ps); -SIMD_EMU_IWRAPPER_2(unpacklo_epi16); -SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps); -SIMD_EMU_IWRAPPER_2(unpacklo_epi64); -SIMD_EMU_IWRAPPER_2(unpacklo_epi8); -SIMD_DWRAPPER_2(unpacklo_pd); -SIMD_WRAPPER_2(unpacklo_ps); - -//----------------------------------------------------------------------- -// Load / store operations -//----------------------------------------------------------------------- -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) -{ - uint32_t* pOffsets = (uint32_t*)&idx; - Float vResult; - float* pResult = (float*)&vResult; - for (uint32_t i = 0; i < SIMD_WIDTH; ++i) - { - uint32_t offset = pOffsets[i]; - offset = offset * static_cast<uint32_t>(ScaleT); - pResult[i] = *(float const*)(((uint8_t const*)p + offset)); - } - - return vResult; -} - -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL -sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) -{ - return i32gather_ps<ScaleT>(p, idx); -} - -static SIMDINLINE Float SIMDCALL - load1_ps(float const* p) // return *p (broadcast 1 value to all elements) -{ - return broadcast_ss(p); -} - -static SIMDINLINE Float SIMDCALL - load_ps(float const* p) // return *p (loads SIMD width elements from memory) -{ - return _mm256_load_ps(p); -} - -static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p -{ - return 
_mm256_load_si256(&p->v); -} - -static SIMDINLINE Float SIMDCALL - loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem) -{ - return _mm256_loadu_ps(p); -} - -static SIMDINLINE Integer SIMDCALL - loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem) -{ - return _mm256_lddqu_si256(&p->v); -} - -// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) -{ - uint32_t* pOffsets = (uint32_t*)&idx; - Float vResult = old; - float* pResult = (float*)&vResult; - unsigned long index = 0; - uint32_t umask = movemask_ps(mask); - while (_BitScanForward(&index, umask)) - { - umask &= ~(1 << index); - uint32_t offset = pOffsets[index]; - offset = offset * static_cast<uint32_t>(ScaleT); - pResult[index] = *(float const*)(((uint8_t const*)p + offset)); - } - - return vResult; -} - -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL -sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) -{ - return mask_i32gather_ps<ScaleT>(old, p, idx, mask); -} - -static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src) -{ - _mm256_maskstore_ps(p, mask, src); -} - -static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a) -{ - return SIMD128T::movemask_epi8(a.v4[0]) | (SIMD128T::movemask_epi8(a.v4[1]) << 16); -} - -static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a) -{ - return static_cast<uint32_t>(_mm256_movemask_pd(a)); -} -static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a) -{ - return static_cast<uint32_t>(_mm256_movemask_ps(a)); -} - -static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value) -{ - return _mm256_set1_epi32(i); -} - -static 
SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value) -{ - return _mm256_set1_epi8(i); -} - -static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) -{ - return _mm256_set1_ps(f); -} - -static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) -{ - return _mm256_setzero_ps(); -} - -static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) -{ - return _mm256_setzero_si256(); -} - -static SIMDINLINE void SIMDCALL - store_ps(float* p, Float const& a) // *p = a (stores all elements contiguously in memory) -{ - _mm256_store_ps(p, a); -} - -static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a -{ - _mm256_store_si256(&p->v, a); -} - -static SIMDINLINE void SIMDCALL - stream_ps(float* p, Float const& a) // *p = a (same as store_ps, but doesn't keep memory in cache) -{ - _mm256_stream_ps(p, a); -} - -//======================================================================= -// Legacy interface (available only in SIMD256 width) -//======================================================================= - -static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const* p) -{ - return _mm256_broadcast_ps(&p->v); -} - -template <int ImmT> -static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const& a) -{ - return _mm256_extractf128_pd(a, ImmT); -} - -template <int ImmT> -static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float const& a) -{ - return _mm256_extractf128_ps(a, ImmT); -} - -template <int ImmT> -static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const& a) -{ - return _mm256_extractf128_si256(a, ImmT); -} - -template <int ImmT> -static SIMDINLINE Double SIMDCALL insertf128_pd(Double const& a, SIMD128Impl::Double const& b) -{ - return _mm256_insertf128_pd(a, b, ImmT); -} - -template <int ImmT> -static SIMDINLINE Float SIMDCALL insertf128_ps(Float const& a, SIMD128Impl::Float const& b) -{ - return 
_mm256_insertf128_ps(a, b, ImmT); -} - -template <int ImmT> -static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const& a, SIMD128Impl::Integer const& b) -{ - return _mm256_insertf128_si256(a, b, ImmT); -} - -#ifndef _mm256_set_m128i -#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \ - _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1) -#endif - -#ifndef _mm256_loadu2_m128i -#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \ - /* SIMD128Impl::Integer const* */ loaddr) \ - _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr)) -#endif - -static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, - SIMD128Impl::Integer const* plo) -{ - return _mm256_loadu2_m128i(&phi->v, &plo->v); -} - -static SIMDINLINE Integer SIMDCALL - set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) -{ - return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0); -} - -static SIMDINLINE Float SIMDCALL - set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) -{ - return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0); -} - -static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer* phi, - SIMD128Impl::Integer* plo, - Integer const& src) -{ - _mm256_storeu2_m128i(&phi->v, &plo->v, src); -} - -static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) -{ - Integer vec = set1_epi32(mask); - const Integer bit = set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - vec = and_si(vec, bit); - vec = cmplt_epi32(setzero_si(), vec); - return castsi_ps(vec); -} - -#undef SIMD_WRAPPER_1 -#undef SIMD_WRAPPER_2 -#undef SIMD_DWRAPPER_2 -#undef SIMD_DWRAPPER_2I -#undef SIMD_WRAPPER_2I -#undef SIMD_WRAPPER_3 -#undef SIMD_IWRAPPER_1 -#undef SIMD_IWRAPPER_2 -#undef SIMD_IFWRAPPER_2 -#undef SIMD_IFWRAPPER_2I -#undef SIMD_IWRAPPER_2I -#undef SIMD_IWRAPPER_2I_ -#undef SIMD_IWRAPPER_2_ -#undef SIMD_IWRAPPER_3 -#undef SIMD_EMU_IWRAPPER_1 -#undef 
SIMD_EMU_IWRAPPER_1I -#undef SIMD_EMU_IWRAPPER_2 -#undef SIMD_EMU_IWRAPPER_2I diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl deleted file mode 100644 index 8fce96dcea4..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl +++ /dev/null @@ -1,255 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX2_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. 
-#endif - -//============================================================================ -// SIMD256 AVX (2) implementation -// -// Since this implementation inherits from the AVX (1) implementation, -// the only operations below ones that replace AVX (1) operations. -// Mostly these are integer operations that are no longer emulated with SSE -//============================================================================ - -#define SIMD_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); } - -#define SIMD_IWRAPPER_1L(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ - { \ - return _mm256_##op(_mm256_castsi256_si128(a)); \ - } - -#define SIMD_IWRAPPER_1I(op) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ - { \ - return _mm256_##op(a, ImmT); \ - } - -#define SIMD_IWRAPPER_1I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ - { \ - return _mm256_##intrin(a, ImmT); \ - } - -#define SIMD_IWRAPPER_2_(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ - { \ - return _mm256_##intrin(a, b); \ - } - -#define SIMD_IWRAPPER_2(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ - { \ - return _mm256_##op(a, b); \ - } - -#define SIMD_IWRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ - { \ - return _mm256_##op(a, b, ImmT); \ - } - -#define SIMD_IWRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ - { \ - return _mm256_##op(a, b, ImmT); \ - } - - -//----------------------------------------------------------------------- -// Floating point arithmetic operations -//----------------------------------------------------------------------- -static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a, - Float const& b, - Float const& 
c) // return (a * b) + c -{ - return _mm256_fmadd_ps(a, b, c); -} - -//----------------------------------------------------------------------- -// Integer (various width) arithmetic operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) -SIMD_IWRAPPER_2(add_epi32); // return a + b (int32) -SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) -SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) -SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) -SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) -SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) -SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32) -SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32) - -// return (a * b) & 0xFFFFFFFF -// -// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, -// and store the low 32 bits of the intermediate integers in dst. -SIMD_IWRAPPER_2(mullo_epi32); -SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32) -SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64) -SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) - -//----------------------------------------------------------------------- -// Logical operations -//----------------------------------------------------------------------- -#if _MSC_VER >= 1920 // && _MSC_FULL_VER < [some_fixed_version] -// Some versions of MSVC 2019 don't handle constant folding of and_si() correctly. -// Using and_ps instead inhibits the compiler's constant folding and actually issues -// the and intrinsic even though both inputs are constant values. 
-#else -// Use native integer and intrinsic -SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int) -#endif -SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int) -SIMD_IWRAPPER_2_(or_si, or_si256); // return a | b (int) -SIMD_IWRAPPER_2_(xor_si, xor_si256); // return a ^ b (int) - -//----------------------------------------------------------------------- -// Shift operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT -SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32) -SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32) -SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32) -SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32) -SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint) - -template <int ImmT> // same as srli_si, but with Float cast to int -static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a) -{ - return castsi_ps(srli_si<ImmT>(castps_si(a))); -} - -//----------------------------------------------------------------------- -// Conversion operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_1L(cvtepu8_epi16); // return (int16)a (uint8 --> int16) -SIMD_IWRAPPER_1L(cvtepu8_epi32); // return (int32)a (uint8 --> int32) -SIMD_IWRAPPER_1L(cvtepu16_epi32); // return (int32)a (uint16 --> int32) -SIMD_IWRAPPER_1L(cvtepu16_epi64); // return (int64)a (uint16 --> int64) -SIMD_IWRAPPER_1L(cvtepu32_epi64); // return (int64)a (uint32 --> int64) - -//----------------------------------------------------------------------- -// Comparison operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) -SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) -SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) -SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) -SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b 
(int8) -SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) -SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) -SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) - -static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer const& a, - Integer const& b) // return a < b (int32) -{ - return cmpgt_epi32(b, a); -} - -//----------------------------------------------------------------------- -// Blend / shuffle / permute operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32) -SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 -SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 -SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 -SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 - -template <int ImmT> -static SIMDINLINE Float SIMDCALL permute_ps(Float const& a) -{ - return _mm256_permute_ps(a, ImmT); -} - -SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32); - -static SIMDINLINE Float SIMDCALL - permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float) -{ - return _mm256_permutevar8x32_ps(a, swiz); -} - -SIMD_IWRAPPER_1I(shuffle_epi32); -template <int ImmT> -static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b) -{ - return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b))); -} -SIMD_IWRAPPER_2(shuffle_epi8); -SIMD_IWRAPPER_2(unpackhi_epi16); -SIMD_IWRAPPER_2(unpackhi_epi32); -SIMD_IWRAPPER_2(unpackhi_epi64); -SIMD_IWRAPPER_2(unpackhi_epi8); -SIMD_IWRAPPER_2(unpacklo_epi16); -SIMD_IWRAPPER_2(unpacklo_epi32); -SIMD_IWRAPPER_2(unpacklo_epi64); -SIMD_IWRAPPER_2(unpacklo_epi8); - -//----------------------------------------------------------------------- -// Load / store operations 
-//----------------------------------------------------------------------- -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) -{ - return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT)); -} - -#if _MSC_VER == 1920 // && _MSC_FULL_VER < [some_fixed_version] -// Don't use _mm256_mask_i32gather_ps(), the compiler doesn't preserve the mask register -// correctly in early versions of MSVC 2019 -#else -// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) -{ - // g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256() - // Only for this intrinsic - not sure why. :( - return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT)); -} -#endif - -static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a) -{ - return static_cast<uint32_t>(_mm256_movemask_epi8(a)); -} - -//======================================================================= -// Legacy interface (available only in SIMD256 width) -//======================================================================= - -#undef SIMD_IWRAPPER_1 -#undef SIMD_IWRAPPER_1L -#undef SIMD_IWRAPPER_1I -#undef SIMD_IWRAPPER_1I_ -#undef SIMD_IWRAPPER_2_ -#undef SIMD_IWRAPPER_2 -#undef SIMD_IWRAPPER_2I -#undef SIMD_IWRAPPER_2I diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl deleted file mode 100644 index 4c883b11a25..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl +++ /dev/null @@ -1,349 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 
Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX512_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. -#endif - -//============================================================================ -// SIMD256 AVX (512) implementation -// -// Since this implementation inherits from the AVX (2) implementation, -// the only operations below ones that replace AVX (2) operations. -// These use native AVX512 instructions with masking to enable a larger -// register set. 
-//============================================================================ - -private: -static SIMDINLINE __m512 __conv(Float r) -{ - return _mm512_castps256_ps512(r.v); -} -static SIMDINLINE __m512d __conv(Double r) -{ - return _mm512_castpd256_pd512(r.v); -} -static SIMDINLINE __m512i __conv(Integer r) -{ - return _mm512_castsi256_si512(r.v); -} -static SIMDINLINE Float __conv(__m512 r) -{ - return _mm512_castps512_ps256(r); -} -static SIMDINLINE Double __conv(__m512d r) -{ - return _mm512_castpd512_pd256(r); -} -static SIMDINLINE Integer __conv(__m512i r) -{ - return _mm512_castsi512_si256(r); -} - -public: -#define SIMD_WRAPPER_1_(op, intrin, mask) \ - static SIMDINLINE Float SIMDCALL op(Float a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \ - } -#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xff)) - -#define SIMD_WRAPPER_1I_(op, intrin, mask) \ - template <int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \ - } -#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xff)) - -#define SIMD_WRAPPER_2_(op, intrin, mask) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \ - } -#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xff)) - -#define SIMD_WRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - { \ - return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT)); \ - } - -#define SIMD_WRAPPER_3_(op, intrin, mask) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \ - } -#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff)) - -#define SIMD_DWRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - { \ - return 
__conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \ - } - -#define SIMD_IWRAPPER_1_(op, intrin, mask) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \ - } -#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff)) - -#define SIMD_IWRAPPER_1I_(op, intrin, mask) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \ - } -#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff)) - -#define SIMD_IWRAPPER_2_(op, intrin, mask) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \ - } -#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff)) - -#define SIMD_IWRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT)); \ - } - -//----------------------------------------------------------------------- -// Single precision floating point arithmetic operations -//----------------------------------------------------------------------- -SIMD_WRAPPER_2(add_ps); // return a + b -SIMD_WRAPPER_2(div_ps); // return a / b -SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c -SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c -SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b -SIMD_WRAPPER_2(min_ps); // return (a < b) ? 
a : b -SIMD_WRAPPER_2(mul_ps); // return a * b -SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xff)); // return 1.0f / a -SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xff)); // return 1.0f / sqrt(a) -SIMD_WRAPPER_2(sub_ps); // return a - b - -//----------------------------------------------------------------------- -// Integer (various width) arithmetic operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32) -SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32) -SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32) -SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32) -SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32) -SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32) -SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32) - -// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8) -// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) - -// return (a * b) & 0xFFFFFFFF -// -// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, -// and store the low 32 bits of the intermediate integers in dst. -SIMD_IWRAPPER_2_32(mullo_epi32); -SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32) - -// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64) -// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 
0 : (a - b) (uint8) - -//----------------------------------------------------------------------- -// Logical operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xff)); // return a & b (int) -SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // return (~a) & b (int) -SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xff)); // return a | b (int) -SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xff)); // return a ^ b (int) - -//----------------------------------------------------------------------- -// Shift operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT -SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32) -SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32) -SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32) -SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32) - -// use AVX2 version -// SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint) - -//----------------------------------------------------------------------- -// Conversion operations (Use AVX2 versions) -//----------------------------------------------------------------------- -// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16) -// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32) -// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32) -// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64) -// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64) - -//----------------------------------------------------------------------- -// Comparison operations (Use AVX2 versions -//----------------------------------------------------------------------- -// SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8) -// SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b 
(int16) -// SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32) -// SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64) -// SIMD_IWRAPPER_2_CMP(cmpgt_epi8,); // return a > b (int8) -// SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16) -// SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32) -// SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64) -// -// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32) -//{ -// return cmpgt_epi32(b, a); -//} - -//----------------------------------------------------------------------- -// Blend / shuffle / permute operations -//----------------------------------------------------------------------- -// SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 -// and _mm512_packs_epi16 SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation -// for _mm256_packs_epi32 and _mm512_packs_epi32 SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> -// uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 -// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for -// _mm256_packus_epi32 and _mm512_packus_epi32 - -// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32); - -// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for -// each 32-bit lane i (float) -//{ -// return _mm256_permutevar8x32_ps(a, swiz); -//} - -SIMD_IWRAPPER_1I_32(shuffle_epi32); -// template<int ImmT> -// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) -//{ -// return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b))); -//} -// SIMD_IWRAPPER_2(shuffle_epi8); -SIMD_IWRAPPER_2_32(unpackhi_epi32); -SIMD_IWRAPPER_2_32(unpacklo_epi32); - -// SIMD_IWRAPPER_2_16(unpackhi_epi16); -// SIMD_IWRAPPER_2_64(unpackhi_epi64); -// SIMD_IWRAPPER_2_8(unpackhi_epi8); -// SIMD_IWRAPPER_2_16(unpacklo_epi16); -// SIMD_IWRAPPER_2_64(unpacklo_epi64); -// 
SIMD_IWRAPPER_2_8(unpacklo_epi8); - -//----------------------------------------------------------------------- -// Load / store operations -//----------------------------------------------------------------------- -static SIMDINLINE Float SIMDCALL - load_ps(float const* p) // return *p (loads SIMD width elements from memory) -{ - return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p)); -} - -static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p -{ - return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p)); -} - -static SIMDINLINE Float SIMDCALL - loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem) -{ - return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p)); -} - -static SIMDINLINE Integer SIMDCALL - loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem) -{ - return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p)); -} - -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) -{ - return __conv(_mm512_mask_i32gather_ps( - _mm512_setzero_ps(), __mmask16(0xff), __conv(idx), p, static_cast<int>(ScaleT))); -} - -// for each element: (mask & (1 << 31)) ? 
(i32gather_ps<ScaleT>(p, idx), mask = 0) : old -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) -{ - __mmask16 m = 0xff; - m = _mm512_mask_test_epi32_mask( - m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000)); - return __conv( - _mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT))); -} - -// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a) -// { -// __mmask64 m = 0xffffffffull; -// return static_cast<uint32_t>( -// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80))); -// } - -static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src) -{ - __mmask16 m = 0xff; - m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000)); - _mm512_mask_storeu_ps(p, m, __conv(src)); -} - -static SIMDINLINE void SIMDCALL - store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory) -{ - _mm512_mask_storeu_ps(p, __mmask16(0xff), __conv(a)); -} - -static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a -{ - _mm512_mask_storeu_epi32(p, __mmask16(0xff), __conv(a)); -} - -static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) -{ - return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xff), -1))); -} - -//======================================================================= -// Legacy interface (available only in SIMD256 width) -//======================================================================= - -#undef SIMD_WRAPPER_1_ -#undef SIMD_WRAPPER_1 -#undef SIMD_WRAPPER_1I_ -#undef SIMD_WRAPPER_1I -#undef SIMD_WRAPPER_2_ -#undef SIMD_WRAPPER_2 -#undef SIMD_WRAPPER_2I -#undef SIMD_WRAPPER_3_ -#undef SIMD_WRAPPER_3 -#undef SIMD_IWRAPPER_1_ -#undef SIMD_IWRAPPER_1_32 -#undef SIMD_IWRAPPER_1I_ -#undef SIMD_IWRAPPER_1I_32 -#undef SIMD_IWRAPPER_2_ -#undef SIMD_IWRAPPER_2_32 -#undef SIMD_IWRAPPER_2I diff --git 
a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl deleted file mode 100644 index 1acdc7e07ff..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl +++ /dev/null @@ -1,129 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX512_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. 
-#endif - -//============================================================================ -// SIMD256 AVX (512) implementation for Core processors -// -// Since this implementation inherits from the AVX (2) implementation, -// the only operations below ones that replace AVX (2) operations. -// These use native AVX512 instructions with masking to enable a larger -// register set. -//============================================================================ - -#define SIMD_DWRAPPER_1_(op, intrin, mask) \ - static SIMDINLINE Double SIMDCALL op(Double a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \ - } -#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf)) - -#define SIMD_DWRAPPER_1I_(op, intrin, mask) \ - template <int ImmT> \ - static SIMDINLINE Double SIMDCALL op(Double a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \ - } -#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf)) - -#define SIMD_DWRAPPER_2_(op, intrin, mask) \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \ - } -#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf)) - -#define SIMD_IWRAPPER_1_(op, intrin, mask) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \ - } -#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull)) -#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff)) -#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf)) - -#define SIMD_IWRAPPER_1I_(op, intrin, mask) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \ - } -#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull)) -#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff)) 
-#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf)) - -#define SIMD_IWRAPPER_2_(op, intrin, mask) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \ - } -#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull)) -#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff)) -#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf)) - -SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8) -SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) -SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64) -SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) -SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and - // _mm512_packs_epi16 -SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and - // _mm512_packs_epi32 -SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and - // _mm512_packus_epi16 -SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and - // _mm512_packus_epi32 -SIMD_IWRAPPER_2_16(unpackhi_epi16); -SIMD_IWRAPPER_2_64(unpackhi_epi64); -SIMD_IWRAPPER_2_8(unpackhi_epi8); -SIMD_IWRAPPER_2_16(unpacklo_epi16); -SIMD_IWRAPPER_2_64(unpacklo_epi64); -SIMD_IWRAPPER_2_8(unpacklo_epi8); - -static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a) -{ - __mmask64 m = 0xffffffffull; - return static_cast<uint32_t>(_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80))); -} - -#undef SIMD_DWRAPPER_1_ -#undef SIMD_DWRAPPER_1 -#undef SIMD_DWRAPPER_1I_ -#undef SIMD_DWRAPPER_1I -#undef SIMD_DWRAPPER_2_ -#undef SIMD_DWRAPPER_2 -#undef SIMD_DWRAPPER_2I -#undef SIMD_IWRAPPER_1_ -#undef SIMD_IWRAPPER_1_8 -#undef SIMD_IWRAPPER_1_16 -#undef SIMD_IWRAPPER_1_64 -#undef SIMD_IWRAPPER_1I_ -#undef 
SIMD_IWRAPPER_1I_8 -#undef SIMD_IWRAPPER_1I_16 -#undef SIMD_IWRAPPER_1I_64 -#undef SIMD_IWRAPPER_2_ -#undef SIMD_IWRAPPER_2_8 -#undef SIMD_IWRAPPER_2_16 -#undef SIMD_IWRAPPER_2_64 diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl deleted file mode 100644 index 52b6ca2b61e..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl +++ /dev/null @@ -1,34 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX512_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. 
-#endif - -//============================================================================ -// SIMD256 AVX (512) implementation for Knights Family -// -// Since this implementation inherits from the AVX (2) implementation, -// the only operations below ones that replace AVX (2) operations. -// These use native AVX512 instructions with masking to enable a larger -// register set. -//============================================================================ diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl deleted file mode 100644 index 5053275e8d6..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl +++ /dev/null @@ -1,699 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX512_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. -#endif - -#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) -// gcc as of 7.1 was missing these intrinsics -#ifndef _mm512_cmpneq_ps_mask -#define _mm512_cmpneq_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_NEQ_UQ) -#endif - -#ifndef _mm512_cmplt_ps_mask -#define _mm512_cmplt_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_LT_OS) -#endif - -#ifndef _mm512_cmplt_pd_mask -#define _mm512_cmplt_pd_mask(a, b) _mm512_cmp_pd_mask((a), (b), _CMP_LT_OS) -#endif - -#endif - -//============================================================================ -// SIMD16 AVX512 (F) implementation (compatible with Knights and Core -// processors) -// -//============================================================================ - -static const int TARGET_SIMD_WIDTH = 16; -using SIMD256T = SIMD256Impl::AVX2Impl; - -#define SIMD_WRAPPER_1_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); } - -#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op) - -#define SIMD_WRAPPER_2_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); } -#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op) - -#define SIMD_WRAPPERI_2_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - { \ - return _mm512_castsi512_ps( \ - _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \ - } - -#define SIMD_DWRAPPER_2(op) \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); } - -#define 
SIMD_WRAPPER_2I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - { \ - return _mm512_##intrin(a, b, ImmT); \ - } -#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op) - -#define SIMD_DWRAPPER_2I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - { \ - return _mm512_##intrin(a, b, ImmT); \ - } -#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op) - -#define SIMD_WRAPPER_3(op) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); } - -#define SIMD_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); } -#define SIMD_IWRAPPER_1_8(op) \ - static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); } - -#define SIMD_IWRAPPER_1_4(op) \ - static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); } - -#define SIMD_IWRAPPER_1I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - { \ - return intrin(a, ImmT); \ - } -#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op) - -#define SIMD_IWRAPPER_2_(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); } -#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op) - -#define SIMD_IWRAPPER_2_CMP(op, cmp) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); } - -#define SIMD_IFWRAPPER_2(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \ - } - -#define SIMD_IWRAPPER_2I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return _mm512_##intrin(a, b, ImmT); \ - } -#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) - -private: -static SIMDINLINE Integer vmask(__mmask16 m) -{ - return _mm512_maskz_set1_epi32(m, 
-1); -} - -static SIMDINLINE Integer vmask(__mmask8 m) -{ - return _mm512_maskz_set1_epi64(m, -1LL); -} - -public: -//----------------------------------------------------------------------- -// Single precision floating point arithmetic operations -//----------------------------------------------------------------------- -SIMD_WRAPPER_2(add_ps); // return a + b -SIMD_WRAPPER_2(div_ps); // return a / b -SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c -SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c -SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b -SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b -SIMD_WRAPPER_2(mul_ps); // return a * b -SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps); // return 1.0f / a -SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps); // return 1.0f / sqrt(a) -SIMD_WRAPPER_2(sub_ps); // return a - b - -template <RoundMode RMT> -static SIMDINLINE Float SIMDCALL round_ps(Float a) -{ - return _mm512_roundscale_ps(a, static_cast<int>(RMT)); -} - -static SIMDINLINE Float SIMDCALL ceil_ps(Float a) -{ - return round_ps<RoundMode::CEIL_NOEXC>(a); -} -static SIMDINLINE Float SIMDCALL floor_ps(Float a) -{ - return round_ps<RoundMode::FLOOR_NOEXC>(a); -} - -//----------------------------------------------------------------------- -// Integer (various width) arithmetic operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) -SIMD_IWRAPPER_2(add_epi32); // return a + b (int32) -// SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) -// SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) -SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) -SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) -SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) -SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? 
a : b (uint32) -SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32) - -// return (a * b) & 0xFFFFFFFF -// -// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, -// and store the low 32 bits of the intermediate integers in dst. -SIMD_IWRAPPER_2(mullo_epi32); -SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32) -SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64) -// SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) - -//----------------------------------------------------------------------- -// Logical operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_2_(and_si, and_si512); // return a & b (int) -SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b (int) -SIMD_IWRAPPER_2_(or_si, or_si512); // return a | b (int) -SIMD_IWRAPPER_2_(xor_si, xor_si512); // return a ^ b (int) - -// SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) -// SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) -// SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) -// SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) - -//----------------------------------------------------------------------- -// Shift operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT -SIMD_IWRAPPER_2(sllv_epi32); -SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32) -SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32) - -#if 0 -SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint) - -template<int ImmT> // same as srli_si, but with Float cast to int -static SIMDINLINE Float SIMDCALL srlisi_ps(Float a) -{ - return castsi_ps(srli_si<ImmT>(castps_si(a))); -} -#endif - -SIMD_IWRAPPER_2(srlv_epi32); - -//----------------------------------------------------------------------- -// Conversion operations 
-//----------------------------------------------------------------------- -static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a) -{ - return _mm512_castpd_ps(a); -} - -static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a) -{ - return _mm512_castps_si512(a); -} - -static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a) -{ - return _mm512_castsi512_pd(a); -} - -static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a) -{ - return _mm512_castps_pd(a); -} - -static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a) -{ - return _mm512_castpd_si512(a); -} - -static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a) -{ - return _mm512_castsi512_ps(a); -} - -static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float) -{ - return _mm512_cvtepi32_ps(a); -} - -// SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16) -SIMD_IWRAPPER_1_4(cvtepu8_epi32); // return (int32)a (uint8 --> int32) -SIMD_IWRAPPER_1_8(cvtepu16_epi32); // return (int32)a (uint16 --> int32) -SIMD_IWRAPPER_1_4(cvtepu16_epi64); // return (int64)a (uint16 --> int64) -SIMD_IWRAPPER_1_8(cvtepu32_epi64); // return (int64)a (uint32 --> int64) - -static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32) -{ - return _mm512_cvtps_epi32(a); -} - -static SIMDINLINE Integer SIMDCALL - cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32) -{ - return _mm512_cvttps_epi32(a); -} - -//----------------------------------------------------------------------- -// Comparison operations -//----------------------------------------------------------------------- -template <CompareType CmpTypeT> -static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b) -{ - return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT)); -} - -template <CompareType CmpTypeT> -static SIMDINLINE Float 
SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b -{ - // Legacy vector mask generator - __mmask16 result = cmp_ps_mask<CmpTypeT>(a, b); - return castsi_ps(vmask(result)); -} - -static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) -{ - return cmp_ps<CompareType::LT_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) -{ - return cmp_ps<CompareType::GT_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) -{ - return cmp_ps<CompareType::NEQ_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) -{ - return cmp_ps<CompareType::EQ_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) -{ - return cmp_ps<CompareType::GE_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) -{ - return cmp_ps<CompareType::LE_OQ>(a, b); -} - -template <CompareTypeInt CmpTypeT> -static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b) -{ - // Legacy vector mask generator - __mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT)); - return vmask(result); -} -template <CompareTypeInt CmpTypeT> -static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b) -{ - // Legacy vector mask generator - __mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<const int>(CmpTypeT)); - return vmask(result); -} - -// SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8) -// SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16) -SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>); // return a == b (int32) -SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>); // return a == b (int64) -// SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8) -// SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16) -SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>); // return a > b (int32) 
-SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>); // return a > b (int64) -SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>); // return a < b (int32) - -static SIMDINLINE bool SIMDCALL testz_ps(Float a, - Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float) -{ - return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b)))); -} - -static SIMDINLINE bool SIMDCALL testz_si(Integer a, - Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int) -{ - return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b))); -} - -//----------------------------------------------------------------------- -// Blend / shuffle / permute operations -//----------------------------------------------------------------------- -template <int ImmT> -static SIMDINLINE Float blend_ps(Float a, Float b) // return ImmT ? b : a (float) -{ - return _mm512_mask_blend_ps(__mmask16(ImmT), a, b); -} - -template <int ImmT> -static SIMDINLINE Integer blend_epi32(Integer a, Integer b) // return ImmT ? b : a (int32) -{ - return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b); -} - -static SIMDINLINE Float blendv_ps(Float a, Float b, Float mask) // return mask ? b : a (float) -{ - return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b); -} - -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, - Integer b, - Float mask) // return mask ? b : a (int) -{ - return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask)); -} - -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, - Integer b, - Integer mask) // return mask ? 
b : a (int) -{ - return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask))); -} - -static SIMDINLINE Float SIMDCALL - broadcast_ss(float const* p) // return *p (all elements in vector get same value) -{ - return _mm512_set1_ps(*p); -} - -template <int imm> -static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a) -{ - return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm)); -} - -template <int imm> -static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a) -{ - return _mm512_extractf64x4_pd(a, imm); -} - -template <int imm> -static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a) -{ - return _mm512_extracti64x4_epi64(a, imm); -} - -template <int imm> -static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b) -{ - return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm)); -} - -template <int imm> -static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b) -{ - return _mm512_insertf64x4(a, b, imm); -} - -template <int imm> -static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b) -{ - return _mm512_inserti64x4(a, b, imm); -} - -// SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 and -// _mm512_packs_epi16 SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 -// and _mm512_packs_epi32 SIMD_IWRAPPER_2(packus_epi16); // See documentation for -// _mm512_packus_epi16 and _mm512_packus_epi16 SIMD_IWRAPPER_2(packus_epi32); // See documentation -// for _mm512_packus_epi32 and _mm512_packus_epi32 - -template <int ImmT> -static SIMDINLINE Float SIMDCALL permute_ps(Float const& a) -{ - return _mm512_permute_ps(a, ImmT); -} - -static SIMDINLINE Integer SIMDCALL - permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float) -{ - return _mm512_permutexvar_epi32(swiz, a); -} - -static SIMDINLINE Float SIMDCALL - permute_ps(Float a, Integer swiz) // return 
a[swiz[i]] for each 32-bit lane i (float) -{ - return _mm512_permutexvar_ps(swiz, a); -} - -SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4); -SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2); -SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4); - -SIMD_IWRAPPER_1I(shuffle_epi32); - -// SIMD_IWRAPPER_2(shuffle_epi8); -SIMD_DWRAPPER_2I(shuffle_pd); -SIMD_WRAPPER_2I(shuffle_ps); - -template <int ImmT> -static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) -{ - return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b))); -} - -SIMD_IWRAPPER_2(unpackhi_epi16); - -// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps); -static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b) -{ - return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b))); -} - -SIMD_IWRAPPER_2(unpackhi_epi64); -// SIMD_IWRAPPER_2(unpackhi_epi8); -SIMD_DWRAPPER_2(unpackhi_pd); -SIMD_WRAPPER_2(unpackhi_ps); -// SIMD_IWRAPPER_2(unpacklo_epi16); -SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps); -SIMD_IWRAPPER_2(unpacklo_epi64); -// SIMD_IWRAPPER_2(unpacklo_epi8); -SIMD_DWRAPPER_2(unpacklo_pd); -SIMD_WRAPPER_2(unpacklo_ps); - -//----------------------------------------------------------------------- -// Load / store operations -//----------------------------------------------------------------------- -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) -{ - return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT)); -} - -static SIMDINLINE Float SIMDCALL - load1_ps(float const* p) // return *p (broadcast 1 value to all elements) -{ - return broadcast_ss(p); -} - -static SIMDINLINE Float SIMDCALL - load_ps(float const* p) // return *p (loads SIMD width elements from memory) -{ - return _mm512_load_ps(p); -} - -static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p -{ - return _mm512_load_si512(&p->v); -} - -static SIMDINLINE 
Float SIMDCALL - loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem) -{ - return _mm512_loadu_ps(p); -} - -static SIMDINLINE Integer SIMDCALL - loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem) -{ - return _mm512_loadu_si512(p); -} - -// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) -{ - __mmask16 k = _mm512_test_epi32_mask(castps_si(mask), set1_epi32(0x80000000)); - - return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT)); -} - -static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src) -{ - Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si()); - _mm512_mask_store_ps(p, m, src); -} - -// static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a) -//{ -// __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si()); -// return static_cast<uint64_t>(m); -//} - -static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a) -{ - __mmask8 m = _mm512_test_epi64_mask(castpd_si(a), set1_epi64(0x8000000000000000LL)); - return static_cast<uint32_t>(m); -} -static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a) -{ - __mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x80000000)); - return static_cast<uint32_t>(m); -} - -static SIMDINLINE Integer SIMDCALL set1_epi64(long long i) // return i (all elements are same value) -{ - return _mm512_set1_epi64(i); -} - -static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value) -{ - return _mm512_set1_epi32(i); -} - -static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value) -{ - return _mm512_set1_epi8(i); -} - -static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) -{ - return _mm512_set1_ps(f); -} - -static SIMDINLINE 
Double SIMDCALL setzero_pd() // return 0 (double) -{ - return _mm512_setzero_pd(); -} - -static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) -{ - return _mm512_setzero_ps(); -} - -static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) -{ - return _mm512_setzero_si512(); -} - -static SIMDINLINE void SIMDCALL - store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory) -{ - _mm512_store_ps(p, a); -} - -static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a -{ - _mm512_store_si512(&p->v, a); -} - -static SIMDINLINE void SIMDCALL - storeu_si(Integer* p, Integer a) // *p = a (same as store_si but allows for unaligned mem) -{ - _mm512_storeu_si512(&p->v, a); -} - -static SIMDINLINE void SIMDCALL - stream_ps(float* p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache) -{ - _mm512_stream_ps(p, a); -} - -static SIMDINLINE Integer SIMDCALL set_epi32(int i15, - int i14, - int i13, - int i12, - int i11, - int i10, - int i9, - int i8, - int i7, - int i6, - int i5, - int i4, - int i3, - int i2, - int i1, - int i0) -{ - return _mm512_set_epi32(i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0); -} - -static SIMDINLINE Integer SIMDCALL - set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) -{ - return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0); -} - -static SIMDINLINE Float SIMDCALL set_ps(float i15, - float i14, - float i13, - float i12, - float i11, - float i10, - float i9, - float i8, - float i7, - float i6, - float i5, - float i4, - float i3, - float i2, - float i1, - float i0) -{ - return _mm512_set_ps(i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0); -} - -static SIMDINLINE Float SIMDCALL - set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) -{ - return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0); -} - -static SIMDINLINE Float SIMDCALL 
vmask_ps(int32_t mask) -{ - return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1))); -} - -#undef SIMD_WRAPPER_1_ -#undef SIMD_WRAPPER_1 -#undef SIMD_WRAPPER_2 -#undef SIMD_WRAPPER_2_ -#undef SIMD_WRAPPERI_2_ -#undef SIMD_DWRAPPER_2 -#undef SIMD_DWRAPPER_2I -#undef SIMD_WRAPPER_2I_ -#undef SIMD_WRAPPER_3_ -#undef SIMD_WRAPPER_2I -#undef SIMD_WRAPPER_3 -#undef SIMD_IWRAPPER_1 -#undef SIMD_IWRAPPER_2 -#undef SIMD_IFWRAPPER_2 -#undef SIMD_IWRAPPER_2I -#undef SIMD_IWRAPPER_1 -#undef SIMD_IWRAPPER_1I -#undef SIMD_IWRAPPER_1I_ -#undef SIMD_IWRAPPER_2 -#undef SIMD_IWRAPPER_2_ -#undef SIMD_IWRAPPER_2I diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl deleted file mode 100644 index 82aa2bb4173..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl +++ /dev/null @@ -1,186 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX512_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. -#endif - -//============================================================================ -// SIMD16 AVX512 (F) implementation for Core processors -// -//============================================================================ - -#define SIMD_WRAPPER_1_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); } - -#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op) - -#define SIMD_WRAPPER_2_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); } -#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op) - -#define SIMD_WRAPPERI_2_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - { \ - return _mm512_castsi512_ps( \ - _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \ - } - -#define SIMD_DWRAPPER_2(op) \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); } - -#define SIMD_WRAPPER_2I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - { \ - return _mm512_##intrin(a, b, ImmT); \ - } -#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op) - -#define SIMD_DWRAPPER_2I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - { \ - return _mm512_##intrin(a, b, ImmT); \ - } -#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op) - -#define SIMD_WRAPPER_3(op) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); } - -#define 
SIMD_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); } -#define SIMD_IWRAPPER_1_8(op) \ - static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); } - -#define SIMD_IWRAPPER_1_4(op) \ - static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); } - -#define SIMD_IWRAPPER_1I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - { \ - return intrin(a, ImmT); \ - } -#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op) - -#define SIMD_IWRAPPER_2_(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); } -#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op) - -#define SIMD_IWRAPPER_2_CMP(op, cmp) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); } - -#define SIMD_IFWRAPPER_2(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \ - } - -#define SIMD_IWRAPPER_2I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return _mm512_##intrin(a, b, ImmT); \ - } -#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) - -private: -static SIMDINLINE Integer vmask(__mmask32 m) -{ - return _mm512_maskz_set1_epi16(m, -1); -} -static SIMDINLINE Integer vmask(__mmask64 m) -{ - return _mm512_maskz_set1_epi8(m, -1); -} - -public: -SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) -SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) -SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 
0 : (a - b) (uint8) - -SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) -SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) -SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) -SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) - -SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16) - -template <CompareTypeInt CmpTypeT> -static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b) -{ - // Legacy vector mask generator - __mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast<const int>(CmpTypeT)); - return vmask(result); -} -template <CompareTypeInt CmpTypeT> -static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b) -{ - // Legacy vector mask generator - __mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast<const int>(CmpTypeT)); - return vmask(result); -} - -SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8) -SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16) -SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8) -SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16) - -SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 -SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 -SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 -SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 - -SIMD_IWRAPPER_2(unpackhi_epi8); // See documentation for _mm512_unpackhi_epi8 -SIMD_IWRAPPER_2(unpacklo_epi16); // See documentation for _mm512_unpacklo_epi16 -SIMD_IWRAPPER_2(unpacklo_epi8); // See documentation for _mm512_unpacklo_epi8 - -SIMD_IWRAPPER_2(shuffle_epi8); - -static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a) -{ - __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si()); - return static_cast<uint64_t>(m); -} - -#undef SIMD_WRAPPER_1_ -#undef 
SIMD_WRAPPER_1 -#undef SIMD_WRAPPER_2 -#undef SIMD_WRAPPER_2_ -#undef SIMD_WRAPPERI_2_ -#undef SIMD_DWRAPPER_2 -#undef SIMD_DWRAPPER_2I -#undef SIMD_WRAPPER_2I_ -#undef SIMD_WRAPPER_3_ -#undef SIMD_WRAPPER_2I -#undef SIMD_WRAPPER_3 -#undef SIMD_IWRAPPER_1 -#undef SIMD_IWRAPPER_2 -#undef SIMD_IFWRAPPER_2 -#undef SIMD_IWRAPPER_2I -#undef SIMD_IWRAPPER_1 -#undef SIMD_IWRAPPER_1I -#undef SIMD_IWRAPPER_1I_ -#undef SIMD_IWRAPPER_2 -#undef SIMD_IWRAPPER_2_ -#undef SIMD_IWRAPPER_2I diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl deleted file mode 100644 index 9ec3ff6c6b1..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl +++ /dev/null @@ -1,132 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX512_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. -#endif - -//============================================================================ -// SIMD16 AVX512 (F) implementation for Knights Family Processors -// -//============================================================================ - -#define SIMD_WRAPPER_1_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); } - -#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op) - -#define SIMD_WRAPPER_2_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); } -#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op) - -#define SIMD_WRAPPERI_2_(op, intrin) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - { \ - return _mm512_castsi512_ps( \ - _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \ - } - -#define SIMD_DWRAPPER_2(op) \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); } - -#define SIMD_WRAPPER_2I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ - { \ - return _mm512_##intrin(a, b, ImmT); \ - } -#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op) - -#define SIMD_DWRAPPER_2I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ - { \ - return _mm512_##intrin(a, b, ImmT); \ - } -#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op) - -#define SIMD_WRAPPER_3(op) \ - static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); } - -#define 
SIMD_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); } -#define SIMD_IWRAPPER_1_8(op) \ - static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); } - -#define SIMD_IWRAPPER_1_4(op) \ - static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); } - -#define SIMD_IWRAPPER_1I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a) \ - { \ - return intrin(a, ImmT); \ - } -#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op) - -#define SIMD_IWRAPPER_2_(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); } -#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op) - -#define SIMD_IWRAPPER_2_CMP(op, cmp) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); } - -#define SIMD_IFWRAPPER_2(op, intrin) \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \ - } - -#define SIMD_IWRAPPER_2I_(op, intrin) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ - { \ - return _mm512_##intrin(a, b, ImmT); \ - } -#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) - -SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int) -SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int) -SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int) -SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int) - -#undef SIMD_WRAPPER_1_ -#undef SIMD_WRAPPER_1 -#undef SIMD_WRAPPER_2 -#undef SIMD_WRAPPER_2_ -#undef SIMD_WRAPPERI_2_ -#undef SIMD_DWRAPPER_2 -#undef SIMD_DWRAPPER_2I -#undef SIMD_WRAPPER_2I_ -#undef SIMD_WRAPPER_3_ -#undef SIMD_WRAPPER_2I -#undef SIMD_WRAPPER_3 -#undef SIMD_IWRAPPER_1 -#undef SIMD_IWRAPPER_2 -#undef SIMD_IFWRAPPER_2 -#undef SIMD_IWRAPPER_2I -#undef 
SIMD_IWRAPPER_1 -#undef SIMD_IWRAPPER_1I -#undef SIMD_IWRAPPER_1I_ -#undef SIMD_IWRAPPER_2 -#undef SIMD_IWRAPPER_2_ -#undef SIMD_IWRAPPER_2I diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl deleted file mode 100644 index f9d4b8c3902..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl +++ /dev/null @@ -1,27 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX512_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. 
-#endif - -// Implement mask-enabled SIMD functions diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl deleted file mode 100644 index f9d4b8c3902..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl +++ /dev/null @@ -1,27 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX512_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. 
-#endif - -// Implement mask-enabled SIMD functions diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl deleted file mode 100644 index f9d4b8c3902..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl +++ /dev/null @@ -1,27 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX512_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. 
-#endif - -// Implement mask-enabled SIMD functions diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl deleted file mode 100644 index ec905505dc4..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl +++ /dev/null @@ -1,852 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. 
-#endif - -//============================================================================ -// SIMD16 AVX (1) implementation -//============================================================================ - -static const int TARGET_SIMD_WIDTH = 8; -using SIMD128T = SIMD128Impl::AVXImpl; - -#define SIMD_WRAPPER_1(op) \ - static SIMDINLINE Float SIMDCALL op(Float const& a) \ - { \ - return Float{ \ - SIMD256T::op(a.v8[0]), \ - SIMD256T::op(a.v8[1]), \ - }; \ - } - -#define SIMD_WRAPPER_2(op) \ - static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \ - { \ - return Float{ \ - SIMD256T::op(a.v8[0], b.v8[0]), \ - SIMD256T::op(a.v8[1], b.v8[1]), \ - }; \ - } - -#define SIMD_WRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \ - { \ - return Float{ \ - SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \ - SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \ - }; \ - } - -#define SIMD_WRAPPER_2I_1(op) \ - template <int ImmT> \ - static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \ - { \ - return Float{ \ - SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \ - SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \ - }; \ - } - -#define SIMD_WRAPPER_3(op) \ - static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \ - { \ - return Float{ \ - SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \ - SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \ - }; \ - } - -#define SIMD_IWRAPPER_1(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ - { \ - return Integer{ \ - SIMD256T::op(a.v8[0]), \ - SIMD256T::op(a.v8[1]), \ - }; \ - } - -#define SIMD_IWRAPPER_2(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ - { \ - return Integer{ \ - SIMD256T::op(a.v8[0], b.v8[0]), \ - SIMD256T::op(a.v8[1], b.v8[1]), \ - }; \ - } - -#define SIMD_IWRAPPER_2I(op) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL 
op(Integer const& a, Integer const& b) \ - { \ - return Integer{ \ - SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \ - SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \ - }; \ - } - -#define SIMD_IWRAPPER_2I_1(op) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ - { \ - return Integer{ \ - SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \ - SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \ - }; \ - } - -#define SIMD_IWRAPPER_2I_2(op) \ - template <int ImmT> \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ - { \ - return Integer{ \ - SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]), \ - SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]), \ - }; \ - } - -#define SIMD_IWRAPPER_3(op) \ - static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \ - { \ - return Integer{ \ - SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \ - SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \ - }; \ - } - -//----------------------------------------------------------------------- -// Single precision floating point arithmetic operations -//----------------------------------------------------------------------- -SIMD_WRAPPER_2(add_ps); // return a + b -SIMD_WRAPPER_2(div_ps); // return a / b -SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c -SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c -SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b -SIMD_WRAPPER_2(min_ps); // return (a < b) ? 
a : b -SIMD_WRAPPER_2(mul_ps); // return a * b -SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a -SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a) -SIMD_WRAPPER_2(sub_ps); // return a - b - -template <RoundMode RMT> -static SIMDINLINE Float SIMDCALL round_ps(Float const& a) -{ - return Float{ - SIMD256T::template round_ps<RMT>(a.v8[0]), - SIMD256T::template round_ps<RMT>(a.v8[1]), - }; -} - -static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a) -{ - return round_ps<RoundMode::CEIL_NOEXC>(a); -} -static SIMDINLINE Float SIMDCALL floor_ps(Float const& a) -{ - return round_ps<RoundMode::FLOOR_NOEXC>(a); -} - -//----------------------------------------------------------------------- -// Integer (various width) arithmetic operations -//----------------------------------------------------------------------- -SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) -SIMD_IWRAPPER_2(add_epi32); // return a + b (int32) -SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) -SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) -SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) -SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) -SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) -SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32) -SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32) - -// return (a * b) & 0xFFFFFFFF -// -// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, -// and store the low 32 bits of the intermediate integers in dst. -SIMD_IWRAPPER_2(mullo_epi32); -SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32) -SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64) -SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 
0 : (a - b) (uint8) - -//----------------------------------------------------------------------- -// Logical operations -//----------------------------------------------------------------------- -SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) -SIMD_IWRAPPER_2(and_si); // return a & b (int) -SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) -SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int) -SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) -SIMD_IWRAPPER_2(or_si); // return a | b (int) -SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) -SIMD_IWRAPPER_2(xor_si); // return a ^ b (int) - -//----------------------------------------------------------------------- -// Shift operations -//----------------------------------------------------------------------- -template <int ImmT> -static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a) // return a << ImmT -{ - return Integer{ - SIMD256T::template slli_epi32<ImmT>(a.v8[0]), - SIMD256T::template slli_epi32<ImmT>(a.v8[1]), - }; -} - -SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32) - -template <int ImmT> -static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a) // return a >> ImmT (int32) -{ - return Integer{ - SIMD256T::template srai_epi32<ImmT>(a.v8[0]), - SIMD256T::template srai_epi32<ImmT>(a.v8[1]), - }; -} - -template <int ImmT> -static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a) // return a >> ImmT (uint32) -{ - return Integer{ - SIMD256T::template srli_epi32<ImmT>(a.v8[0]), - SIMD256T::template srli_epi32<ImmT>(a.v8[1]), - }; -} - -template <int ImmT> // for each 128-bit lane: -static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a) // return a >> (ImmT*8) (uint) -{ - return Integer{ - SIMD256T::template srli_si<ImmT>(a.v8[0]), - SIMD256T::template srli_si<ImmT>(a.v8[1]), - }; -} -template <int ImmT> -static SIMDINLINE Float SIMDCALL - srlisi_ps(Float const& a) // same as srli_si, but with Float cast to int -{ - 
return Float{ - SIMD256T::template srlisi_ps<ImmT>(a.v8[0]), - SIMD256T::template srlisi_ps<ImmT>(a.v8[1]), - }; -} - -SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32) - -//----------------------------------------------------------------------- -// Conversion operations -//----------------------------------------------------------------------- -static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a) -{ - return Float{ - SIMD256T::castpd_ps(a.v8[0]), - SIMD256T::castpd_ps(a.v8[1]), - }; -} - -static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a) -{ - return Integer{ - SIMD256T::castps_si(a.v8[0]), - SIMD256T::castps_si(a.v8[1]), - }; -} - -static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a) -{ - return Double{ - SIMD256T::castsi_pd(a.v8[0]), - SIMD256T::castsi_pd(a.v8[1]), - }; -} - -static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a) -{ - return Double{ - SIMD256T::castps_pd(a.v8[0]), - SIMD256T::castps_pd(a.v8[1]), - }; -} - -static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a) -{ - return Float{ - SIMD256T::castsi_ps(a.v8[0]), - SIMD256T::castsi_ps(a.v8[1]), - }; -} - -static SIMDINLINE Float SIMDCALL - cvtepi32_ps(Integer const& a) // return (float)a (int32 --> float) -{ - return Float{ - SIMD256T::cvtepi32_ps(a.v8[0]), - SIMD256T::cvtepi32_ps(a.v8[1]), - }; -} - -static SIMDINLINE Integer SIMDCALL - cvtepu8_epi16(SIMD256Impl::Integer const& a) // return (int16)a (uint8 --> int16) -{ - return Integer{ - SIMD256T::cvtepu8_epi16(a.v4[0]), - SIMD256T::cvtepu8_epi16(a.v4[1]), - }; -} - -static SIMDINLINE Integer SIMDCALL - cvtepu8_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint8 --> int32) -{ - return Integer{ - SIMD256T::cvtepu8_epi32(a.v4[0]), - SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])), - }; -} - -static SIMDINLINE Integer SIMDCALL - 
cvtepu16_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint16 --> int32) -{ - return Integer{ - SIMD256T::cvtepu16_epi32(a.v4[0]), - SIMD256T::cvtepu16_epi32(a.v4[1]), - }; -} - -static SIMDINLINE Integer SIMDCALL - cvtepu16_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint16 --> int64) -{ - return Integer{ - SIMD256T::cvtepu16_epi64(a.v4[0]), - SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])), - }; -} - -static SIMDINLINE Integer SIMDCALL - cvtepu32_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint32 --> int64) -{ - return Integer{ - SIMD256T::cvtepu32_epi64(a.v4[0]), - SIMD256T::cvtepu32_epi64(a.v4[1]), - }; -} - -static SIMDINLINE Integer SIMDCALL - cvtps_epi32(Float const& a) // return (int32)a (float --> int32) -{ - return Integer{ - SIMD256T::cvtps_epi32(a.v8[0]), - SIMD256T::cvtps_epi32(a.v8[1]), - }; -} - -static SIMDINLINE Integer SIMDCALL - cvttps_epi32(Float const& a) // return (int32)a (rnd_to_zero(float) --> int32) -{ - return Integer{ - SIMD256T::cvtps_epi32(a.v8[0]), - SIMD256T::cvtps_epi32(a.v8[1]), - }; -} - -//----------------------------------------------------------------------- -// Comparison operations -//----------------------------------------------------------------------- -template <CompareType CmpTypeT> -static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b -{ - return Float{ - SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]), - SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]), - }; -} -static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b) -{ - return cmp_ps<CompareType::LT_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b) -{ - return cmp_ps<CompareType::GT_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b) -{ - return cmp_ps<CompareType::NEQ_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b) -{ - 
return cmp_ps<CompareType::EQ_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b) -{ - return cmp_ps<CompareType::GE_OQ>(a, b); -} -static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b) -{ - return cmp_ps<CompareType::LE_OQ>(a, b); -} - -template <CompareType CmpTypeT> -static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b) -{ - return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b))); -} - -SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) -SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) -SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) -SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) -SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) -SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) -SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) -SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) -SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32) - -static SIMDINLINE bool SIMDCALL - testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float) -{ - return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & SIMD256T::testz_ps(a.v8[1], b.v8[1])); -} - -static SIMDINLINE bool SIMDCALL - testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int) -{ - return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & SIMD256T::testz_si(a.v8[1], b.v8[1])); -} - -//----------------------------------------------------------------------- -// Blend / shuffle / permute operations -//----------------------------------------------------------------------- -SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) -SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32) -SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a, - Integer const& b, - Float const& mask) // return mask ? 
b : a (int) -{ - return Integer{ - SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]), - SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]), - }; -} - -static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a, - Integer const& b, - Integer const& mask) // return mask ? b : a (int) -{ - return Integer{ - SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]), - SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]), - }; -} - -static SIMDINLINE Float SIMDCALL - broadcast_ss(float const* p) // return *p (all elements in vector get same value) -{ - float f = *p; - return Float{ - SIMD256T::set1_ps(f), - SIMD256T::set1_ps(f), - }; -} - -template <int imm> -static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a) -{ - SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); - return a.v8[imm]; -} - -template <int imm> -static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a) -{ - SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); - return a.v8[imm]; -} - -template <int imm> -static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a) -{ - SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); - return a.v8[imm]; -} - -template <int imm> -static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b) -{ - SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); - Float r = a; - r.v8[imm] = b; - return r; -} - -template <int imm> -static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b) -{ - SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); - Double r = a; - r.v8[imm] = b; - return r; -} - -template <int imm> -static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b) -{ - SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); - Integer r = a; - r.v8[imm] = b; - return r; -} - -SIMD_IWRAPPER_2(packs_epi16); // See documentation for 
_mm256_packs_epi16 and _mm512_packs_epi16 -SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 -SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 -SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 - -template <int ImmT> -static SIMDINLINE Float SIMDCALL permute_ps(Float const& a) -{ - return Float{ - SIMD256T::template permute_ps<ImmT>(a.v8[0]), - SIMD256T::template permute_ps<ImmT>(a.v8[1]), - }; -} - -static SIMDINLINE Integer SIMDCALL permute_epi32( - Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32) -{ - return castps_si(permute_ps(castsi_ps(a), swiz)); -} - -static SIMDINLINE Float SIMDCALL - permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float) -{ - const auto mask = SIMD256T::set1_epi32(7); - - auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask)); - auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask)); - - auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask)); - auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask)); - - return Float{ - SIMD256T::blendv_ps( - lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))), - SIMD256T::blendv_ps( - hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))), - }; -} - -// All of the 512-bit permute2f128_XX intrinsics do the following: -// -// SELECT4(src, control) { -// CASE(control[1:0]) -// 0 : tmp[127:0] : = src[127:0] -// 1 : tmp[127:0] : = src[255:128] -// 2 : tmp[127:0] : = src[383:256] -// 3 : tmp[127:0] : = src[511:384] -// ESAC -// RETURN tmp[127:0] -// } -// -// dst[127:0] : = SELECT4(a[511:0], imm8[1:0]) -// dst[255:128] : = SELECT4(a[511:0], imm8[3:2]) -// dst[383:256] : = SELECT4(b[511:0], imm8[5:4]) -// dst[511:384] : = SELECT4(b[511:0], imm8[7:6]) 
-// dst[MAX:512] : = 0 -// -// Since the 256-bit AVX instructions use a 4-bit control field (instead -// of 2-bit for AVX512), we need to expand the control bits sent to the -// AVX instructions for emulation. -// -template <int shuf> -static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b) -{ - return Float{ - SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], - a.v8[1]), - SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], - b.v8[1]), - }; -} - -template <int shuf> -static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b) -{ - return Double{ - SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], - a.v8[1]), - SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], - b.v8[1]), - }; -} - -template <int shuf> -static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b) -{ - return Integer{ - SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], - a.v8[1]), - SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], - b.v8[1]), - }; -} - -SIMD_IWRAPPER_2I_1(shuffle_epi32); -SIMD_IWRAPPER_2I_2(shuffle_epi64); -SIMD_IWRAPPER_2(shuffle_epi8); -SIMD_WRAPPER_2I_1(shuffle_pd); -SIMD_WRAPPER_2I_1(shuffle_ps); -SIMD_IWRAPPER_2(unpackhi_epi16); -SIMD_IWRAPPER_2(unpackhi_epi32); -SIMD_IWRAPPER_2(unpackhi_epi64); -SIMD_IWRAPPER_2(unpackhi_epi8); -SIMD_WRAPPER_2(unpackhi_pd); -SIMD_WRAPPER_2(unpackhi_ps); -SIMD_IWRAPPER_2(unpacklo_epi16); -SIMD_IWRAPPER_2(unpacklo_epi32); -SIMD_IWRAPPER_2(unpacklo_epi64); -SIMD_IWRAPPER_2(unpacklo_epi8); -SIMD_WRAPPER_2(unpacklo_pd); -SIMD_WRAPPER_2(unpacklo_ps); - -//----------------------------------------------------------------------- -// Load / store operations -//----------------------------------------------------------------------- -template 
<ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) -{ - return Float{ - SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]), - SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]), - }; -} - -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) -{ - return Float{ - SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[0]), - SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[1]), - }; -} - -static SIMDINLINE Float SIMDCALL - load1_ps(float const* p) // return *p (broadcast 1 value to all elements) -{ - return broadcast_ss(p); -} - -static SIMDINLINE Float SIMDCALL - load_ps(float const* p) // return *p (loads SIMD width elements from memory) -{ - return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)}; -} - -static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p -{ - return Integer{ - SIMD256T::load_si(&p->v8[0]), - SIMD256T::load_si(&p->v8[1]), - }; -} - -static SIMDINLINE Float SIMDCALL - loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem) -{ - return Float{SIMD256T::loadu_ps(p), SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)}; -} - -static SIMDINLINE Integer SIMDCALL - loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem) -{ - return Integer{ - SIMD256T::loadu_si(&p->v8[0]), - SIMD256T::loadu_si(&p->v8[1]), - }; -} - -// for each element: (mask & (1 << 31)) ? 
(i32gather_ps<ScaleT>(p, idx), mask = 0) : old -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) -{ - return Float{ - SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]), - SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]), - }; -} - -template <ScaleFactor ScaleT = ScaleFactor::SF_1> -static SIMDINLINE Float SIMDCALL - sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) -{ - return Float{ - SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]), - SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]), - }; -} - -static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src) -{ - SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]); - SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]); -} - -static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a) -{ - uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0])); - mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4); - - return mask; -} - -static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a) -{ - uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0])); - mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2); - - return mask; -} -static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a) -{ - uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0])); - mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH; - - return mask; -} - -static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value) -{ - return Integer{SIMD256T::set1_epi32(i), SIMD256T::set1_epi32(i)}; -} - -static SIMDINLINE 
Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value) -{ - return Integer{SIMD256T::set1_epi8(i), SIMD256T::set1_epi8(i)}; -} - -static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) -{ - return Float{SIMD256T::set1_ps(f), SIMD256T::set1_ps(f)}; -} - -static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) -{ - return Float{SIMD256T::setzero_ps(), SIMD256T::setzero_ps()}; -} - -static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) -{ - return Integer{SIMD256T::setzero_si(), SIMD256T::setzero_si()}; -} - -static SIMDINLINE void SIMDCALL - store_ps(float* p, Float const& a) // *p = a (stores all elements contiguously in memory) -{ - SIMD256T::store_ps(p, a.v8[0]); - SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]); -} - -static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a -{ - SIMD256T::store_si(&p->v8[0], a.v8[0]); - SIMD256T::store_si(&p->v8[1], a.v8[1]); -} - -static SIMDINLINE void SIMDCALL - stream_ps(float* p, Float const& a) // *p = a (same as store_ps, but doesn't keep memory in cache) -{ - SIMD256T::stream_ps(p, a.v8[0]); - SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]); -} - -static SIMDINLINE Integer SIMDCALL set_epi32(int i15, - int i14, - int i13, - int i12, - int i11, - int i10, - int i9, - int i8, - int i7, - int i6, - int i5, - int i4, - int i3, - int i2, - int i1, - int i0) -{ - return Integer{SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0), - SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8)}; -} - -static SIMDINLINE Integer SIMDCALL - set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) -{ - return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0); -} - -static SIMDINLINE Float SIMDCALL set_ps(float i15, - float i14, - float i13, - float i12, - float i11, - float i10, - float i9, - float i8, - float i7, - float i6, - float i5, - float i4, - float i3, - float i2, - float i1, - 
float i0) -{ - return Float{SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0), - SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8)}; -} - -static SIMDINLINE Float SIMDCALL - set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) -{ - return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0); -} - -static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) -{ - return Float{SIMD256T::vmask_ps(mask), SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)}; -} - -#undef SIMD_WRAPPER_1 -#undef SIMD_WRAPPER_2 -#undef SIMD_WRAPPER_2I -#undef SIMD_WRAPPER_2I_1 -#undef SIMD_WRAPPER_3 -#undef SIMD_IWRAPPER_1 -#undef SIMD_IWRAPPER_2 -#undef SIMD_IWRAPPER_2I -#undef SIMD_IWRAPPER_2I_1 -#undef SIMD_IWRAPPER_3 diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl deleted file mode 100644 index 473934824ee..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl +++ /dev/null @@ -1,27 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#if !defined(__SIMD_LIB_AVX_HPP__) -#error Do not include this file directly, use "simdlib.hpp" instead. -#endif - -// no backwards compatibility for simd mask-enabled functions diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp deleted file mode 100644 index 3d31b39ee55..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp +++ /dev/null @@ -1,332 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#pragma once -#if 0 -//=========================================================================== -// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures. -//=========================================================================== -struct SIMD256 // or SIMD4 or SIMD16 -{ - //======================================================================= - // SIMD Types - // - // These typedefs are examples. The SIMD256 and SIMD16 implementations will - // use different base types with this same naming. 
- using Float = __m256; // Packed single-precision float vector - using Double = __m256d; // Packed double-precision float vector - using Integer = __m256i; // Packed integer vector (mutable element widths) - using Mask = uint8_t; // Integer representing mask bits - - //======================================================================= - // Standard interface - // (available in both SIMD256 and SIMD16 widths) - //======================================================================= - - //----------------------------------------------------------------------- - // Single precision floating point arithmetic operations - //----------------------------------------------------------------------- - static Float add_ps(Float a, Float b); // return a + b - static Float div_ps(Float a, Float b); // return a / b - static Float fmadd_ps(Float a, Float b, Float c); // return (a * b) + c - static Float fmsub_ps(Float a, Float b, Float c); // return (a * b) - c - static Float max_ps(Float a, Float b); // return (a > b) ? a : b - static Float min_ps(Float a, Float b); // return (a < b) ? a : b - static Float mul_ps(Float a, Float b); // return a * b - static Float rcp_ps(Float a); // return 1.0f / a - static Float rsqrt_ps(Float a); // return 1.0f / sqrt(a) - static Float sub_ps(Float a, Float b); // return a - b - - enum class RoundMode - { - TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5) - TO_NEG_INF = 0x01, // Round to negative infinity - TO_POS_INF = 0x02, // Round to positive infinity - TO_ZERO = 0x03, // Round to 0 a.k.a. 
truncate - CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register - - RAISE_EXC = 0x00, // Raise exception on overflow - NO_EXC = 0x08, // Suppress exceptions - - NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC), - NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC), - FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC), - FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC), - CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC), - CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC), - TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC), - TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC), - RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC), - NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC), - }; - - // return round_func(a) - // - // round_func is chosen on the RMT template parameter. See the documentation - // for the RoundMode enumeration above. - template <RoundMode RMT> - static Float round_ps(Float a); // return round(a) - - - //----------------------------------------------------------------------- - // Integer (various width) arithmetic operations - //----------------------------------------------------------------------- - static Integer abs_epi32(Integer a); // return absolute_value(a) (int32) - static Integer add_epi32(Integer a, Integer b); // return a + b (int32) - static Integer add_epi8(Integer a, Integer b); // return a + b (int8) - static Integer adds_epu8(Integer a, Integer b); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) - static Integer max_epi32(Integer a, Integer b); // return (a > b) ? a : b (int32) - static Integer max_epu32(Integer a, Integer b); // return (a > b) ? a : b (uint32) - static Integer min_epi32(Integer a, Integer b); // return (a < b) ? a : b (int32) - static Integer min_epu32(Integer a, Integer b); // return (a < b) ? 
a : b (uint32) - static Integer mul_epi32(Integer a, Integer b); // return a * b (int32) - - // return (a * b) & 0xFFFFFFFF - // - // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, - // and store the low 32 bits of the intermediate integers in dst. - static Float mullo_epi32(Integer a, Integer b); - - static Integer sub_epi32(Integer a, Integer b); // return a - b (int32) - static Integer sub_epi64(Integer a, Integer b); // return a - b (int64) - static Integer subs_epu8(Integer a, Integer b); // return (b > a) ? 0 : (a - b) (uint8) - - //----------------------------------------------------------------------- - // Logical operations - //----------------------------------------------------------------------- - static Float and_ps(Float a, Float b); // return a & b (float treated as int) - static Integer and_si(Integer a, Integer b); // return a & b (int) - static Float andnot_ps(Float a, Float b); // return (~a) & b (float treated as int) - static Integer andnot_si(Integer a, Integer b); // return (~a) & b (int) - static Float or_ps(Float a, Float b); // return a | b (float treated as int) - static Float or_si(Integer a, Integer b); // return a | b (int) - static Float xor_ps(Float a, Float b); // return a ^ b (float treated as int) - static Integer xor_si(Integer a, Integer b); // return a ^ b (int) - - //----------------------------------------------------------------------- - // Shift operations - //----------------------------------------------------------------------- - template<int ImmT> - static Integer slli_epi32(Integer a); // return a << ImmT - static Integer sllv_epi32(Integer a, Integer b); // return a << b - template<int ImmT> - static Integer srai_epi32(Integer a); // return a >> ImmT (int32) - template<int ImmT> - static Integer srli_epi32(Integer a); // return a >> ImmT (uint32) - template<int ImmT> // for each 128-bit lane: - static Integer srli_si(Integer a); // return a >> (ImmT*8) (uint) - template<int 
ImmT> - static Float srlisi_ps(Float a); // same as srli_si, but with Float cast to int - static Integer srlv_epi32(Integer a, Integer b); // return a >> b (uint32) - - //----------------------------------------------------------------------- - // Conversion operations - //----------------------------------------------------------------------- - static Float castpd_ps(Double a); // return *(Float*)(&a) - static Integer castps_si(Float a); // return *(Integer*)(&a) - static Double castsi_pd(Integer a); // return *(Double*)(&a) - static Double castps_pd(Float a); // return *(Double*)(&a) - static Float castsi_ps(Integer a); // return *(Float*)(&a) - static Float cvtepi32_ps(Integer a); // return (float)a (int32 --> float) - static Integer cvtepu8_epi16(Integer a); // return (int16)a (uint8 --> int16) - static Integer cvtepu8_epi32(Integer a); // return (int32)a (uint8 --> int32) - static Integer cvtepu16_epi32(Integer a); // return (int32)a (uint16 --> int32) - static Integer cvtepu16_epi64(Integer a); // return (int64)a (uint16 --> int64) - static Integer cvtepu32_epi64(Integer a); // return (int64)a (uint32 --> int64) - static Integer cvtps_epi32(Float a); // return (int32)a (float --> int32) - static Integer cvttps_epi32(Float a); // return (int32)a (rnd_to_zero(float) --> int32) - - //----------------------------------------------------------------------- - // Comparison operations - //----------------------------------------------------------------------- - - // Comparison types used with cmp_ps: - // - ordered comparisons are always false if either operand is NaN - // - unordered comparisons are always true if either operand is NaN - // - signaling comparisons raise an exception if either operand is NaN - // - non-signaling comparisons will never raise an exception - // - // Ordered: return (a != NaN) && (b != NaN) && (a cmp b) - // Unordered: return (a == NaN) || (b == NaN) || (a cmp b) - enum class CompareType - { - EQ_OQ = 0x00, // Equal (ordered, 
nonsignaling) - LT_OS = 0x01, // Less-than (ordered, signaling) - LE_OS = 0x02, // Less-than-or-equal (ordered, signaling) - UNORD_Q = 0x03, // Unordered (nonsignaling) - NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling) - NLT_US = 0x05, // Not-less-than (unordered, signaling) - NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling) - ORD_Q = 0x07, // Ordered (nonsignaling) - EQ_UQ = 0x08, // Equal (unordered, non-signaling) - NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling) - NGT_US = 0x0A, // Not-greater-than (unordered, signaling) - FALSE_OQ = 0x0B, // False (ordered, nonsignaling) - NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling) - GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling) - GT_OS = 0x0E, // Greater-than (ordered, signaling) - TRUE_UQ = 0x0F, // True (unordered, non-signaling) - EQ_OS = 0x10, // Equal (ordered, signaling) - LT_OQ = 0x11, // Less-than (ordered, nonsignaling) - LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling) - UNORD_S = 0x13, // Unordered (signaling) - NEQ_US = 0x14, // Not-equal (unordered, signaling) - NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling) - NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling) - ORD_S = 0x17, // Ordered (signaling) - EQ_US = 0x18, // Equal (unordered, signaling) - NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling) - NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling) - FALSE_OS = 0x1B, // False (ordered, signaling) - NEQ_OS = 0x1C, // Not-equal (ordered, signaling) - GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling) - GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling) - TRUE_US = 0x1F, // True (unordered, signaling) - }; - - // return a (CmpTypeT) b (float) - // - // See documentation for CompareType above for valid values for CmpTypeT. 
- template<CompareType CmpTypeT> - static Float cmp_ps(Float a, Float b); // return a (CmtTypeT) b (see above) - static Float cmpgt_ps(Float a, Float b); // return cmp_ps<CompareType::GT_OQ>(a, b) - static Float cmple_ps(Float a, Float b); // return cmp_ps<CompareType::LE_OQ>(a, b) - static Float cmplt_ps(Float a, Float b); // return cmp_ps<CompareType::LT_OQ>(a, b) - static Float cmpneq_ps(Float a, Float b); // return cmp_ps<CompareType::NEQ_OQ>(a, b) - static Float cmpeq_ps(Float a, Float b); // return cmp_ps<CompareType::EQ_OQ>(a, b) - static Float cmpge_ps(Float a, Float b); // return cmp_ps<CompareType::GE_OQ>(a, b) - static Integer cmpeq_epi8(Integer a, Integer b); // return a == b (int8) - static Integer cmpeq_epi16(Integer a, Integer b); // return a == b (int16) - static Integer cmpeq_epi32(Integer a, Integer b); // return a == b (int32) - static Integer cmpeq_epi64(Integer a, Integer b); // return a == b (int64) - static Integer cmpgt_epi8(Integer a, Integer b); // return a > b (int8) - static Integer cmpgt_epi16(Integer a, Integer b); // return a > b (int16) - static Integer cmpgt_epi32(Integer a, Integer b); // return a > b (int32) - static Integer cmpgt_epi64(Integer a, Integer b); // return a > b (int64) - static Integer cmplt_epi32(Integer a, Integer b); // return a < b (int32) - static bool testz_ps(Float a, Float b); // return all_lanes_zero(a & b) ? 1 : 0 (float) - static bool testz_si(Integer a, Integer b); // return all_lanes_zero(a & b) ? 1 : 0 (int) - - //----------------------------------------------------------------------- - // Blend / shuffle / permute operations - //----------------------------------------------------------------------- - template<int ImmT> - static Float blend_ps(Float a, Float b); // return ImmT ? b : a (float) - static Integer blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int) - static Float blendv_ps(Float a, Float b, Float mask); // return mask ? 
b : a (float) - static Float broadcast_ss(float const *p); // return *p (all elements in vector get same value) - static Integer packs_epi16(Integer a, Integer b); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 - static Integer packs_epi32(Integer a, Integer b); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 - static Integer packus_epi16(Integer a, Integer b); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 - static Integer packus_epi32(Integer a, Integer b); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 - static Float permute_epi32(Integer a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (int32) - static Float permute_ps(Float a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (float) - template<int SwizT> - static Integer shuffle_epi32(Integer a, Integer b); - template<int SwizT> - static Integer shuffle_epi64(Integer a, Integer b); - static Integer shuffle_epi8(Integer a, Integer b); - template<int SwizT> - static Float shuffle_pd(Double a, Double b); - template<int SwizT> - static Float shuffle_ps(Float a, Float b); - static Integer unpackhi_epi16(Integer a, Integer b); - static Integer unpackhi_epi32(Integer a, Integer b); - static Integer unpackhi_epi64(Integer a, Integer b); - static Integer unpackhi_epi8(Integer a, Integer b); - static Float unpackhi_pd(Double a, Double b); - static Float unpackhi_ps(Float a, Float b); - static Integer unpacklo_epi16(Integer a, Integer b); - static Integer unpacklo_epi32(Integer a, Integer b); - static Integer unpacklo_epi64(Integer a, Integer b); - static Integer unpacklo_epi8(Integer a, Integer b); - static Float unpacklo_pd(Double a, Double b); - static Float unpacklo_ps(Float a, Float b); - - //----------------------------------------------------------------------- - // Load / store operations - //----------------------------------------------------------------------- - enum class ScaleFactor - { - SF_1, // No 
scaling - SF_2, // Scale offset by 2 - SF_4, // Scale offset by 4 - SF_8, // Scale offset by 8 - }; - - template<ScaleFactor ScaleT = ScaleFactor::SF_1> - static Float i32gather_ps(float const* p, Integer idx); // return *(float*)(((int8*)p) + (idx * ScaleT)) - static Float load1_ps(float const *p); // return *p (broadcast 1 value to all elements) - static Float load_ps(float const *p); // return *p (loads SIMD width elements from memory) - static Integer load_si(Integer const *p); // return *p - static Float loadu_ps(float const *p); // return *p (same as load_ps but allows for unaligned mem) - static Integer loadu_si(Integer const *p); // return *p (same as load_si but allows for unaligned mem) - - // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old - template<int ScaleT> - static Float mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask); - - static void maskstore_ps(float *p, Integer mask, Float src); - static int movemask_epi8(Integer a); - static int movemask_pd(Double a); - static int movemask_ps(Float a); - static Integer set1_epi32(int i); // return i (all elements are same value) - static Integer set1_epi8(char i); // return i (all elements are same value) - static Float set1_ps(float f); // return f (all elements are same value) - static Float setzero_ps(); // return 0 (float) - static Integer setzero_si(); // return 0 (integer) - static void store_ps(float *p, Float a); // *p = a (stores all elements contiguously in memory) - static void store_si(Integer *p, Integer a); // *p = a - static void stream_ps(float *p, Float a); // *p = a (same as store_ps, but doesn't keep memory in cache) - - //======================================================================= - // Legacy interface (available only in SIMD256 width) - //======================================================================= - - static Float broadcast_ps(__m128 const *p); - template<int ImmT> - static __m128d extractf128_pd(Double a); 
- template<int ImmT> - static __m128 extractf128_ps(Float a); - template<int ImmT> - static __m128i extractf128_si(Integer a); - template<int ImmT> - static Double insertf128_pd(Double a, __m128d b); - template<int ImmT> - static Float insertf128_ps(Float a, __m128 b); - template<int ImmT> - static Integer insertf128_si(Integer a, __m128i b); - static Integer loadu2_si(__m128 const* phi, __m128 const* plo); - template<int ImmT> - static Double permute2f128_pd(Double a, Double b); - template<int ImmT> - static Float permute2f128_ps(Float a, Float b); - template<int ImmT> - static Integer permute2f128_si(Integer a, Integer b); - static Integer set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0); - static void storeu2_si(__m128i *phi, __m128i *plo, Integer src); - - //======================================================================= - // Advanced masking interface (currently available only in SIMD16 width) - //======================================================================= -}; -#endif // #if 0 diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp deleted file mode 100644 index 3ef847d4ca4..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp +++ /dev/null @@ -1,457 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ -#pragma once - -#if !defined(__cplusplus) -#error C++ compilation required -#endif - -#include <immintrin.h> -#include <inttypes.h> -#include <stdint.h> - -#define SIMD_ARCH_AVX 0 -#define SIMD_ARCH_AVX2 1 -#define SIMD_ARCH_AVX512 2 - -#if !defined(SIMD_ARCH) -#define SIMD_ARCH SIMD_ARCH_AVX -#endif - -#if defined(_MSC_VER) -#define SIMDCALL __vectorcall -#define SIMDINLINE __forceinline -#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_ -#else -#define SIMDCALL -#define SIMDINLINE inline -#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_))) -#endif - -// For documentation, please see the following include... 
-// #include "simdlib_interface.hpp" - -namespace SIMDImpl -{ - enum class CompareType - { - EQ_OQ = 0x00, // Equal (ordered, nonsignaling) - LT_OS = 0x01, // Less-than (ordered, signaling) - LE_OS = 0x02, // Less-than-or-equal (ordered, signaling) - UNORD_Q = 0x03, // Unordered (nonsignaling) - NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling) - NLT_US = 0x05, // Not-less-than (unordered, signaling) - NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling) - ORD_Q = 0x07, // Ordered (nonsignaling) - EQ_UQ = 0x08, // Equal (unordered, non-signaling) - NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling) - NGT_US = 0x0A, // Not-greater-than (unordered, signaling) - FALSE_OQ = 0x0B, // False (ordered, nonsignaling) - NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling) - GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling) - GT_OS = 0x0E, // Greater-than (ordered, signaling) - TRUE_UQ = 0x0F, // True (unordered, non-signaling) - EQ_OS = 0x10, // Equal (ordered, signaling) - LT_OQ = 0x11, // Less-than (ordered, nonsignaling) - LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling) - UNORD_S = 0x13, // Unordered (signaling) - NEQ_US = 0x14, // Not-equal (unordered, signaling) - NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling) - NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling) - ORD_S = 0x17, // Ordered (signaling) - EQ_US = 0x18, // Equal (unordered, signaling) - NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling) - NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling) - FALSE_OS = 0x1B, // False (ordered, signaling) - NEQ_OS = 0x1C, // Not-equal (ordered, signaling) - GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling) - GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling) - TRUE_US = 0x1F, // True (unordered, signaling) - }; - -#if SIMD_ARCH >= SIMD_ARCH_AVX512 - enum class CompareTypeInt - { - EQ = _MM_CMPINT_EQ, // Equal - LT = _MM_CMPINT_LT, // Less than - LE = 
_MM_CMPINT_LE, // Less than or Equal - NE = _MM_CMPINT_NE, // Not Equal - GE = _MM_CMPINT_GE, // Greater than or Equal - GT = _MM_CMPINT_GT, // Greater than - }; -#endif // SIMD_ARCH >= SIMD_ARCH_AVX512 - - enum class ScaleFactor - { - SF_1 = 1, // No scaling - SF_2 = 2, // Scale offset by 2 - SF_4 = 4, // Scale offset by 4 - SF_8 = 8, // Scale offset by 8 - }; - - enum class RoundMode - { - TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + 0.5) - TO_NEG_INF = 0x01, // Round to negative infinity - TO_POS_INF = 0x02, // Round to positive infinity - TO_ZERO = 0x03, // Round to 0 a.k.a. truncate - CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register - - RAISE_EXC = 0x00, // Raise exception on overflow - NO_EXC = 0x08, // Suppress exceptions - - NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC), - NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC), - FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC), - FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC), - CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC), - CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC), - TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC), - TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC), - RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC), - NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC), - }; - - struct Traits - { - using CompareType = SIMDImpl::CompareType; - using ScaleFactor = SIMDImpl::ScaleFactor; - using RoundMode = SIMDImpl::RoundMode; - }; - - // Attribute, 4-dimensional attribute in SIMD SOA layout - template <typename Float, typename Integer, typename Double> - union Vec4 - { - Float v[4]; - Integer vi[4]; - Double vd[4]; - struct - { - Float x; - Float y; - Float z; - Float w; - }; - SIMDINLINE Float& SIMDCALL operator[](const int i) { return v[i]; } - SIMDINLINE 
Float const& SIMDCALL operator[](const int i) const { return v[i]; } - SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const& in) - { - v[0] = in.v[0]; - v[1] = in.v[1]; - v[2] = in.v[2]; - v[3] = in.v[3]; - return *this; - } - }; - - namespace SIMD128Impl - { - union Float - { - SIMDINLINE Float() = default; - SIMDINLINE Float(__m128 in) : v(in) {} - SIMDINLINE Float& SIMDCALL operator=(__m128 in) - { - v = in; - return *this; - } - SIMDINLINE Float& SIMDCALL operator=(Float const& in) - { - v = in.v; - return *this; - } - SIMDINLINE SIMDCALL operator __m128() const { return v; } - - SIMDALIGN(__m128, 16) v; - }; - - union Integer - { - SIMDINLINE Integer() = default; - SIMDINLINE Integer(__m128i in) : v(in) {} - SIMDINLINE Integer& SIMDCALL operator=(__m128i in) - { - v = in; - return *this; - } - SIMDINLINE Integer& SIMDCALL operator=(Integer const& in) - { - v = in.v; - return *this; - } - SIMDINLINE SIMDCALL operator __m128i() const { return v; } - - SIMDALIGN(__m128i, 16) v; - }; - - union Double - { - SIMDINLINE Double() = default; - SIMDINLINE Double(__m128d in) : v(in) {} - SIMDINLINE Double& SIMDCALL operator=(__m128d in) - { - v = in; - return *this; - } - SIMDINLINE Double& SIMDCALL operator=(Double const& in) - { - v = in.v; - return *this; - } - SIMDINLINE SIMDCALL operator __m128d() const { return v; } - - SIMDALIGN(__m128d, 16) v; - }; - - using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>; - using Mask = uint8_t; - - static const uint32_t SIMD_WIDTH = 4; - } // namespace SIMD128Impl - - namespace SIMD256Impl - { - union Float - { - SIMDINLINE Float() = default; - SIMDINLINE Float(__m256 in) : v(in) {} - SIMDINLINE Float(SIMD128Impl::Float const& in_lo, - SIMD128Impl::Float const& in_hi = _mm_setzero_ps()) - { - v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1); - } - SIMDINLINE Float& SIMDCALL operator=(__m256 in) - { - v = in; - return *this; - } - SIMDINLINE Float& SIMDCALL operator=(Float const& in) - { - v = in.v; - return *this; - 
} - SIMDINLINE SIMDCALL operator __m256() const { return v; } - - SIMDALIGN(__m256, 32) v; - SIMD128Impl::Float v4[2]; - }; - - union Integer - { - SIMDINLINE Integer() = default; - SIMDINLINE Integer(__m256i in) : v(in) {} - SIMDINLINE Integer(SIMD128Impl::Integer const& in_lo, - SIMD128Impl::Integer const& in_hi = _mm_setzero_si128()) - { - v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1); - } - SIMDINLINE Integer& SIMDCALL operator=(__m256i in) - { - v = in; - return *this; - } - SIMDINLINE Integer& SIMDCALL operator=(Integer const& in) - { - v = in.v; - return *this; - } - SIMDINLINE SIMDCALL operator __m256i() const { return v; } - - SIMDALIGN(__m256i, 32) v; - SIMD128Impl::Integer v4[2]; - }; - - union Double - { - SIMDINLINE Double() = default; - SIMDINLINE Double(__m256d const& in) : v(in) {} - SIMDINLINE Double(SIMD128Impl::Double const& in_lo, - SIMD128Impl::Double const& in_hi = _mm_setzero_pd()) - { - v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1); - } - SIMDINLINE Double& SIMDCALL operator=(__m256d in) - { - v = in; - return *this; - } - SIMDINLINE Double& SIMDCALL operator=(Double const& in) - { - v = in.v; - return *this; - } - SIMDINLINE SIMDCALL operator __m256d() const { return v; } - - SIMDALIGN(__m256d, 32) v; - SIMD128Impl::Double v4[2]; - }; - - using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>; - using Mask = uint8_t; - - static const uint32_t SIMD_WIDTH = 8; - } // namespace SIMD256Impl - - namespace SIMD512Impl - { -#if !(defined(__AVX512F__) || defined(_ZMMINTRIN_H_INCLUDED)) - // Define AVX512 types if not included via immintrin.h. - // All data members of these types are ONLY to viewed - // in a debugger. Do NOT access them via code! 
- union __m512 - { - private: - float m512_f32[16]; - }; - struct __m512d - { - private: - double m512d_f64[8]; - }; - - union __m512i - { - private: - int8_t m512i_i8[64]; - int16_t m512i_i16[32]; - int32_t m512i_i32[16]; - int64_t m512i_i64[8]; - uint8_t m512i_u8[64]; - uint16_t m512i_u16[32]; - uint32_t m512i_u32[16]; - uint64_t m512i_u64[8]; - }; - - using __mmask16 = uint16_t; -#endif - -#if defined(__INTEL_COMPILER) || (SIMD_ARCH >= SIMD_ARCH_AVX512) -#define SIMD_ALIGNMENT_BYTES 64 -#else -#define SIMD_ALIGNMENT_BYTES 32 -#endif - - union Float - { - SIMDINLINE Float() = default; - SIMDINLINE Float(__m512 in) : v(in) {} - SIMDINLINE Float(SIMD256Impl::Float const& in_lo, - SIMD256Impl::Float const& in_hi = _mm256_setzero_ps()) - { - v8[0] = in_lo; - v8[1] = in_hi; - } - SIMDINLINE Float& SIMDCALL operator=(__m512 in) - { - v = in; - return *this; - } - SIMDINLINE Float& SIMDCALL operator=(Float const& in) - { -#if SIMD_ARCH >= SIMD_ARCH_AVX512 - v = in.v; -#else - v8[0] = in.v8[0]; - v8[1] = in.v8[1]; -#endif - return *this; - } - SIMDINLINE SIMDCALL operator __m512() const { return v; } - - SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v; - SIMD256Impl::Float v8[2]; - }; - - union Integer - { - SIMDINLINE Integer() = default; - SIMDINLINE Integer(__m512i in) : v(in) {} - SIMDINLINE Integer(SIMD256Impl::Integer const& in_lo, - SIMD256Impl::Integer const& in_hi = _mm256_setzero_si256()) - { - v8[0] = in_lo; - v8[1] = in_hi; - } - SIMDINLINE Integer& SIMDCALL operator=(__m512i in) - { - v = in; - return *this; - } - SIMDINLINE Integer& SIMDCALL operator=(Integer const& in) - { -#if SIMD_ARCH >= SIMD_ARCH_AVX512 - v = in.v; -#else - v8[0] = in.v8[0]; - v8[1] = in.v8[1]; -#endif - return *this; - } - - SIMDINLINE SIMDCALL operator __m512i() const { return v; } - - SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v; - SIMD256Impl::Integer v8[2]; - }; - - union Double - { - SIMDINLINE Double() = default; - SIMDINLINE Double(__m512d in) : v(in) {} - SIMDINLINE 
Double(SIMD256Impl::Double const& in_lo, - SIMD256Impl::Double const& in_hi = _mm256_setzero_pd()) - { - v8[0] = in_lo; - v8[1] = in_hi; - } - SIMDINLINE Double& SIMDCALL operator=(__m512d in) - { - v = in; - return *this; - } - SIMDINLINE Double& SIMDCALL operator=(Double const& in) - { -#if SIMD_ARCH >= SIMD_ARCH_AVX512 - v = in.v; -#else - v8[0] = in.v8[0]; - v8[1] = in.v8[1]; -#endif - return *this; - } - - SIMDINLINE SIMDCALL operator __m512d() const { return v; } - - SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v; - SIMD256Impl::Double v8[2]; - }; - - typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64); - using Mask = __mmask16; - - static const uint32_t SIMD_WIDTH = 16; - -#undef SIMD_ALIGNMENT_BYTES - } // namespace SIMD512Impl -} // namespace SIMDImpl diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp deleted file mode 100644 index 0f5382044c2..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp +++ /dev/null @@ -1,299 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ - -#include "common/os.h" -#include <stdarg.h> -#include <stdio.h> -#include <assert.h> -#include <algorithm> -#include <mutex> - -#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS - -#if defined(_MSC_VER) -#pragma comment(lib, "user32.lib") -#endif // _WIN32 - -namespace ConsoleUtils -{ - enum class TextColor - { - BLACK = 0, -#if defined(_WIN32) - RED = 4, - GREEN = 2, - BLUE = 1, -#else - RED = 1, - GREEN = 2, - BLUE = 4, -#endif // _WIN32 - PURPLE = static_cast<uint32_t>(RED) | static_cast<uint32_t>(BLUE), - CYAN = static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE), - YELLOW = static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN), - WHITE = - static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE), - }; - - enum class TextStyle - { - NORMAL = 0, - INTENSITY = 1, - }; - - void SetTextColor(FILE* stream, - TextColor color = TextColor::WHITE, - TextStyle style = TextStyle::NORMAL) - { -#if defined(_WIN32) - - HANDLE hConsoleHandle = nullptr; - if (stream == stderr) - { - hConsoleHandle = GetStdHandle(STD_ERROR_HANDLE); - } - else if (stream == stdout) - { - hConsoleHandle = GetStdHandle(STD_OUTPUT_HANDLE); - } - else - { - // Not a console stream, do nothing - return; - } - - WORD textAttributes = static_cast<WORD>(color); - if (style == TextStyle::INTENSITY) - { - textAttributes |= FOREGROUND_INTENSITY; - } - SetConsoleTextAttribute(hConsoleHandle, 
textAttributes); - -#else // !_WIN32 - - // Print ANSI codes - uint32_t cc = - 30 + ((style == TextStyle::INTENSITY) ? 60 : 0) + static_cast<uint32_t>(color); - fprintf(stream, "\033[0m\033[%d;%dm", static_cast<uint32_t>(style), cc); - -#endif - } - - void ResetTextColor(FILE* stream) - { -#if defined(_WIN32) - - SetTextColor(stream); - -#else // !_WIN32 - - // Print ANSI codes - fprintf(stream, "\033[0m"); - -#endif - } - - static std::mutex g_stderrMutex; -} // namespace ConsoleUtils - -bool SwrAssert(bool chkDebugger, - bool& enabled, - const char* pExpression, - const char* pFileName, - uint32_t lineNum, - const char* pFunction, - const char* pFmtString, - ...) -{ - using namespace ConsoleUtils; - std::lock_guard<std::mutex> l(g_stderrMutex); - - SetTextColor(stderr, TextColor::CYAN, TextStyle::NORMAL); - - fprintf(stderr, "%s(%d): ", pFileName, lineNum); - - SetTextColor(stderr, TextColor::RED, TextStyle::INTENSITY); - - fprintf(stderr, "ASSERT: %s\n", pExpression); - - SetTextColor(stderr, TextColor::CYAN, TextStyle::INTENSITY); - fprintf(stderr, "\t%s\n", pFunction); - - if (pFmtString) - { - SetTextColor(stderr, TextColor::YELLOW, TextStyle::INTENSITY); - fprintf(stderr, "\t"); - va_list args; - va_start(args, pFmtString); - vfprintf(stderr, pFmtString, args); - va_end(args); - fprintf(stderr, "\n"); - } - ResetTextColor(stderr); - fflush(stderr); - -#if defined(_WIN32) - static const int MAX_MESSAGE_LEN = 2048; - char msgBuf[MAX_MESSAGE_LEN]; - - sprintf_s(msgBuf, "%s(%d): ASSERT: %s\n", pFileName, lineNum, pExpression); - msgBuf[MAX_MESSAGE_LEN - 2] = '\n'; - msgBuf[MAX_MESSAGE_LEN - 1] = 0; - OutputDebugStringA(msgBuf); - - sprintf_s(msgBuf, "\t%s\n", pFunction); - msgBuf[MAX_MESSAGE_LEN - 2] = '\n'; - msgBuf[MAX_MESSAGE_LEN - 1] = 0; - OutputDebugStringA(msgBuf); - - int offset = 0; - - if (pFmtString) - { - va_list args; - va_start(args, pFmtString); - offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), sizeof(msgBuf), pFmtString, args); - va_end(args); - - 
if (offset < 0) - { - return true; - } - - OutputDebugStringA("\t"); - OutputDebugStringA(msgBuf); - OutputDebugStringA("\n"); - } - - if (enabled && KNOB_ENABLE_ASSERT_DIALOGS) - { - int retval = sprintf_s(&msgBuf[offset], - MAX_MESSAGE_LEN - offset, - "\n\n" - "File: %s\n" - "Line: %d\n" - "\n" - "Expression: %s\n\n" - "Cancel: Disable this assert for the remainder of the process\n" - "Try Again: Break into the debugger\n" - "Continue: Continue execution (but leave assert enabled)", - pFileName, - lineNum, - pExpression); - - if (retval < 0) - { - return true; - } - - offset += retval; - - if (!IsDebuggerPresent()) - { - sprintf_s(&msgBuf[offset], - MAX_MESSAGE_LEN - offset, - "\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a " - "program crash!"); - } - - retval = MessageBoxA(nullptr, - msgBuf, - "Assert Failed", - MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION | MB_SETFOREGROUND); - - switch (retval) - { - case IDCANCEL: - enabled = false; - return false; - - case IDTRYAGAIN: - return true; - - case IDCONTINUE: - return false; - } - } - else - { - return (IsDebuggerPresent() || !chkDebugger) && enabled; - } -#endif // _WIN32 - - return enabled; -} - -void SwrTrace( - const char* pFileName, uint32_t lineNum, const char* pFunction, const char* pFmtString, ...) 
-{ - using namespace ConsoleUtils; - std::lock_guard<std::mutex> l(g_stderrMutex); - - SetTextColor(stderr, TextColor::CYAN, TextStyle::NORMAL); - - fprintf(stderr, "%s(%d): TRACE in %s:\n", pFileName, lineNum, pFunction); - - if (pFmtString) - { - SetTextColor(stderr, TextColor::PURPLE, TextStyle::INTENSITY); - fprintf(stderr, "\t"); - va_list args; - va_start(args, pFmtString); - vfprintf(stderr, pFmtString, args); - va_end(args); - fprintf(stderr, "\n"); - } - ResetTextColor(stderr); - fflush(stderr); - -#if defined(_WIN32) - static const int MAX_MESSAGE_LEN = 2048; - char msgBuf[MAX_MESSAGE_LEN]; - - sprintf_s(msgBuf, "%s(%d): TRACE in %s\n", pFileName, lineNum, pFunction); - msgBuf[MAX_MESSAGE_LEN - 2] = '\n'; - msgBuf[MAX_MESSAGE_LEN - 1] = 0; - OutputDebugStringA(msgBuf); - - int offset = 0; - - if (pFmtString) - { - va_list args; - va_start(args, pFmtString); - offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), sizeof(msgBuf), pFmtString, args); - va_end(args); - - if (offset < 0) - { - return; - } - - OutputDebugStringA("\t"); - OutputDebugStringA(msgBuf); - OutputDebugStringA("\n"); - } -#endif // _WIN32 -} - -#endif // SWR_ENABLE_ASSERTS diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h deleted file mode 100644 index cd9854f2549..00000000000 --- a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h +++ /dev/null @@ -1,242 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ - -#ifndef __SWR_ASSERT_H__ -#define __SWR_ASSERT_H__ - -#if !defined(__SWR_OS_H__) -#error swr_assert.h should not be included directly, please include "common/os.h" instead. -#endif - -//============================================================================= -// -// MACROS defined in this file: -// -// - SWR_ASSUME(expression, ...): Tell compiler that the expression is true. -// Helps with static code analysis as well. -// DO NOT USE if code after this dynamically -// checks for errors and handles them. The -// compiler may optimize out the error check. -// -// - SWR_ASSERT(expression, ...): Inform the user is expression is false. -// This check is only conditionally made, -// usually only in debug mode. 
-// -// - SWR_REL_ASSERT(expression, ...): Unconditionally enabled version of SWR_ASSERT -// -// - SWR_ASSUME_ASSERT(expression, ...): Conditionally enabled SWR_ASSERT. Uses -// SWR_ASSUME if SWR_ASSERT is disabled. -// DO NOT USE in combination with actual -// error checking (see SWR_ASSUME) -// -// - SWR_REL_ASSUME_ASSERT(expression, ...): Same as SWR_REL_ASSERT. -// -//============================================================================= - -// Stupid preprocessor tricks to avoid -Wall / -W4 warnings -#if defined(_MSC_VER) -#define _SWR_WARN_DISABLE __pragma(warning(push)) __pragma(warning(disable : 4127)) -#define _SWR_WARN_RESTORE __pragma(warning(pop)) -#else // ! MSVC compiler -#define _SWR_WARN_DISABLE -#define _SWR_WARN_RESTORE -#endif - -#define _SWR_MACRO_START \ - do \ - { -#define _SWR_MACRO_END \ - _SWR_WARN_DISABLE \ - } \ - while (0) \ - _SWR_WARN_RESTORE - -#if defined(_MSC_VER) -#define SWR_ASSUME(e, ...) \ - _SWR_MACRO_START __assume(e); \ - _SWR_MACRO_END -#elif defined(__clang__) -#define SWR_ASSUME(e, ...) \ - _SWR_MACRO_START __builtin_assume(e); \ - _SWR_MACRO_END -#elif defined(__GNUC__) -#define SWR_ASSUME(e, ...) \ - _SWR_MACRO_START((e) ? ((void)0) : __builtin_unreachable()); \ - _SWR_MACRO_END -#else -#define SWR_ASSUME(e, ...) \ - _SWR_MACRO_START ASSUME(e); \ - _SWR_MACRO_END -#endif - -#if !defined(SWR_ENABLE_ASSERTS) - -#if !defined(NDEBUG) -#define SWR_ENABLE_ASSERTS 1 -#else -#define SWR_ENABLE_ASSERTS 0 -#endif // _DEBUG - -#endif // SWR_ENABLE_ASSERTS - -#if !defined(SWR_ENABLE_REL_ASSERTS) -#define SWR_ENABLE_REL_ASSERTS 1 -#endif - -#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS -#include "assert.h" - -#if !defined(__cplusplus) - -#pragma message("C++ is required for SWR Asserts, falling back to assert.h") - -#if SWR_ENABLE_ASSERTS -#define SWR_ASSERT(e, ...) assert(e) -#endif - -#if SWR_ENABLE_REL_ASSERTS -#define SWR_REL_ASSERT(e, ...) 
assert(e) -#endif - -#else - -bool SwrAssert(bool chkDebugger, - bool& enabled, - const char* pExpression, - const char* pFileName, - uint32_t lineNum, - const char* function, - const char* pFmtString = nullptr, - ...); - -void SwrTrace( - const char* pFileName, uint32_t lineNum, const char* function, const char* pFmtString, ...); - -#define _SWR_ASSERT(chkDebugger, e, ...) \ - _SWR_MACRO_START \ - bool expFailed = !(e); \ - if (expFailed) \ - { \ - static bool swrAssertEnabled = true; \ - expFailed = SwrAssert( \ - chkDebugger, swrAssertEnabled, #e, __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \ - if (expFailed) \ - { \ - DEBUGBREAK; \ - } \ - } \ - _SWR_MACRO_END - -#define _SWR_INVALID(chkDebugger, ...) \ - _SWR_MACRO_START \ - static bool swrAssertEnabled = true; \ - bool expFailed = SwrAssert( \ - chkDebugger, swrAssertEnabled, "", __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \ - if (expFailed) \ - { \ - DEBUGBREAK; \ - } \ - _SWR_MACRO_END - -#define _SWR_TRACE(_fmtstr, ...) SwrTrace(__FILE__, __LINE__, __FUNCTION__, _fmtstr, ##__VA_ARGS__); - -#if SWR_ENABLE_ASSERTS -#define SWR_ASSERT(e, ...) _SWR_ASSERT(true, e, ##__VA_ARGS__) -#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSERT(e, ##__VA_ARGS__) -#define SWR_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__) -#endif // SWR_ENABLE_ASSERTS - -#if SWR_ENABLE_REL_ASSERTS -#define SWR_REL_ASSERT(e, ...) _SWR_ASSERT(false, e, ##__VA_ARGS__) -#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_REL_ASSERT(e, ##__VA_ARGS__) -#define SWR_REL_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__) - -// SWR_INVALID is always enabled -// Funky handling to allow 0 arguments with g++/gcc -// This is needed because you can't "swallow commas" with ##_VA_ARGS__ unless -// there is a first argument to the macro. So having a macro that can optionally -// accept 0 arguments is tricky. -#define _SWR_INVALID_0() _SWR_INVALID(false) -#define _SWR_INVALID_1(...) 
_SWR_INVALID(false, ##__VA_ARGS__) -#define _SWR_INVALID_VARGS_(_10, _9, _8, _7, _6, _5, _4, _3, _2, _1, N, ...) N -#define _SWR_INVALID_VARGS(...) _SWR_INVALID_VARGS_(__VA_ARGS__, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) -#define _SWR_INVALID_VARGS_0() 1, 2, 3, 4, 5, 6, 7, 9, 9, 10 -#define _SWR_INVALID_CONCAT_(a, b) a##b -#define _SWR_INVALID_CONCAT(a, b) _SWR_INVALID_CONCAT_(a, b) -#define SWR_INVALID(...) \ - _SWR_INVALID_CONCAT(_SWR_INVALID_, _SWR_INVALID_VARGS(_SWR_INVALID_VARGS_0 __VA_ARGS__())) \ - (__VA_ARGS__) - -#define SWR_STATIC_ASSERT(expression, ...) \ - static_assert((expression), "Failed:\n " #expression "\n " __VA_ARGS__); - -#endif // SWR_ENABLE_REL_ASSERTS - -#endif // C++ - -#endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS - -// Needed to allow passing bitfield members to sizeof() in disabled asserts -template <typename T> -static bool SwrSizeofWorkaround(T) -{ - return false; -} - -#if !SWR_ENABLE_ASSERTS -#define SWR_ASSERT(e, ...) \ - _SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \ - _SWR_MACRO_END -#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__) -#define SWR_TRACE(_fmtstr, ...) \ - _SWR_MACRO_START(void)(0); \ - _SWR_MACRO_END -#endif - -#if !SWR_ENABLE_REL_ASSERTS -#define SWR_REL_ASSERT(e, ...) \ - _SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \ - _SWR_MACRO_END -#define SWR_INVALID(...) \ - _SWR_MACRO_START(void)(0); \ - _SWR_MACRO_END -#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__) -#define SWR_REL_TRACE(_fmtstr, ...) \ - _SWR_MACRO_START(void)(0); \ - _SWR_MACRO_END -#define SWR_STATIC_ASSERT(e, ...) 
\ - _SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \ - _SWR_MACRO_END -#endif - -#if defined(_MSC_VER) -#define SWR_FUNCTION_DECL __FUNCSIG__ -#elif (defined(__GNUC__) || defined(__clang__)) -#define SWR_FUNCTION_DECL __PRETTY_FUNCTION__ -#else -#define SWR_FUNCTION_DECL __FUNCTION__ -#endif - -#define SWR_NOT_IMPL SWR_INVALID("%s not implemented", SWR_FUNCTION_DECL) - -#endif //__SWR_ASSERT_H__ diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp deleted file mode 100644 index bee257d7723..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ /dev/null @@ -1,1802 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file api.cpp - * - * @brief API implementation - * - ******************************************************************************/ - -#include <cfloat> -#include <cmath> -#include <cstdio> -#include <new> - -#include "core/api.h" -#include "core/backend.h" -#include "core/context.h" -#include "core/depthstencil.h" -#include "core/frontend.h" -#include "core/rasterizer.h" -#include "core/rdtsc_core.h" -#include "core/threads.h" -#include "core/tilemgr.h" -#include "core/clip.h" -#include "core/utils.h" -#include "core/tileset.h" - -#include "common/os.h" - -static const SWR_RECT g_MaxScissorRect = {0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y}; - -void SetupDefaultState(SWR_CONTEXT* pContext); - -static INLINE SWR_CONTEXT* GetContext(HANDLE hContext) -{ - return (SWR_CONTEXT*)hContext; -} - -void WakeAllThreads(SWR_CONTEXT* pContext) -{ - pContext->FifosNotEmpty.notify_all(); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Create SWR Context. -/// @param pCreateInfo - pointer to creation info. 
-HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo) -{ - void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4); - memset(pContextMem, 0, sizeof(SWR_CONTEXT)); - SWR_CONTEXT* pContext = new (pContextMem) SWR_CONTEXT(); - - pContext->privateStateSize = pCreateInfo->privateStateSize; - - // initialize callback functions - pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; - pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; - pContext->pfnTranslateGfxptrForRead = pCreateInfo->pfnTranslateGfxptrForRead; - pContext->pfnTranslateGfxptrForWrite = pCreateInfo->pfnTranslateGfxptrForWrite; - pContext->pfnMakeGfxPtr = pCreateInfo->pfnMakeGfxPtr; - pContext->pfnCreateMemoryContext = pCreateInfo->pfnCreateMemoryContext; - pContext->pfnDestroyMemoryContext = pCreateInfo->pfnDestroyMemoryContext; - pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset; - pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats; - pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE; - pContext->pfnUpdateStreamOut = pCreateInfo->pfnUpdateStreamOut; - - - pContext->hExternalMemory = pCreateInfo->hExternalMemory; - - pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT; - if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0) - { - pContext->MAX_DRAWS_IN_FLIGHT = pCreateInfo->MAX_DRAWS_IN_FLIGHT; - } - - pContext->dcRing.Init(pContext->MAX_DRAWS_IN_FLIGHT); - pContext->dsRing.Init(pContext->MAX_DRAWS_IN_FLIGHT); - - pContext->pMacroTileManagerArray = - (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * pContext->MAX_DRAWS_IN_FLIGHT, 64); - pContext->pDispatchQueueArray = - (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * pContext->MAX_DRAWS_IN_FLIGHT, 64); - - for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc) - { - pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); - new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena); - new (&pContext->pDispatchQueueArray[dc]) 
DispatchQueue(); - - pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); - } - - if (pCreateInfo->pThreadInfo) - { - pContext->threadInfo = *pCreateInfo->pThreadInfo; - } - else - { - pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS; - pContext->threadInfo.BASE_NUMA_NODE = KNOB_BASE_NUMA_NODE; - pContext->threadInfo.BASE_CORE = KNOB_BASE_CORE; - pContext->threadInfo.BASE_THREAD = KNOB_BASE_THREAD; - pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES; - pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE; - pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE; - pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED; - } - - if (pCreateInfo->pApiThreadInfo) - { - pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo; - } - else - { - pContext->apiThreadInfo.bindAPIThread0 = true; - pContext->apiThreadInfo.numAPIReservedThreads = 1; - pContext->apiThreadInfo.numAPIThreadsPerCore = 1; - } - - if (pCreateInfo->pWorkerPrivateState) - { - pContext->workerPrivateState = *pCreateInfo->pWorkerPrivateState; - } - - memset((void*)&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); - memset((void*)&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); - new (&pContext->WaitLock) std::mutex(); - new (&pContext->FifosNotEmpty) std::condition_variable(); - - CreateThreadPool(pContext, &pContext->threadPool); - - if (pContext->apiThreadInfo.bindAPIThread0) - { - BindApiThread(pContext, 0); - } - - if (pContext->threadInfo.SINGLE_THREADED) - { - pContext->pSingleThreadLockedTiles = new TileSet(); - } - - pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads]; - pContext->pStats = - (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64); - -#if defined(KNOB_ENABLE_AR) - // Setup ArchRast thread contexts which includes +1 for API thread. 
- pContext->pArContext = new HANDLE[pContext->NumWorkerThreads + 1]; - pContext->pArContext[pContext->NumWorkerThreads] = - ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API); -#endif - -#if defined(KNOB_ENABLE_RDTSC) - pContext->pBucketMgr = new BucketManager(pCreateInfo->contextName); - RDTSC_RESET(pContext->pBucketMgr); - RDTSC_INIT(pContext->pBucketMgr, 0); -#endif - - // Allocate scratch space for workers. - ///@note We could lazily allocate this but its rather small amount of memory. - for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) - { -#if defined(_WIN32) - uint32_t numaNode = - pContext->threadPool.pThreadData ? pContext->threadPool.pThreadData[i].numaId : 0; - pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(GetCurrentProcess(), - nullptr, - KNOB_WORKER_SCRATCH_SPACE_SIZE, - MEM_RESERVE | MEM_COMMIT, - PAGE_READWRITE, - numaNode); -#else - pContext->ppScratch[i] = - (uint8_t*)AlignedMalloc(KNOB_WORKER_SCRATCH_SPACE_SIZE, KNOB_SIMD_WIDTH * 4); -#endif - -#if defined(KNOB_ENABLE_AR) - // Initialize worker thread context for ArchRast. 
- pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER); - - SWR_WORKER_DATA* pWorkerData = (SWR_WORKER_DATA*)pContext->threadPool.pThreadData[i].pWorkerPrivateData; - pWorkerData->hArContext = pContext->pArContext[i]; -#endif - - - } - -#if defined(KNOB_ENABLE_AR) - // cache the API thread event manager, for use with sim layer - pCreateInfo->hArEventManager = pContext->pArContext[pContext->NumWorkerThreads]; -#endif - - // State setup AFTER context is fully initialized - SetupDefaultState(pContext); - - // initialize hot tile manager - pContext->pHotTileMgr = new HotTileMgr(); - - // pass pointer to bucket manager back to caller -#ifdef KNOB_ENABLE_RDTSC - pCreateInfo->pBucketMgr = pContext->pBucketMgr; -#endif - - pCreateInfo->contextSaveSize = sizeof(API_STATE); - - StartThreadPool(pContext, &pContext->threadPool); - - return (HANDLE)pContext; -} - -void CopyState(DRAW_STATE& dst, const DRAW_STATE& src) -{ - memcpy((void*)&dst.state, (void*)&src.state, sizeof(API_STATE)); -} - -template <bool IsDraw> -void QueueWork(SWR_CONTEXT* pContext) -{ - DRAW_CONTEXT* pDC = pContext->pCurDrawContext; - uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT; - - if (IsDraw) - { - pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex]; - pDC->pTileMgr->initialize(); - } - - // Each worker thread looks at a DC for both FE and BE work at different times and so we - // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers - // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and - // then moved on if all work is done.) 
- pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads; - - if (IsDraw) - { - InterlockedIncrement(&pContext->drawsOutstandingFE); - } - - _ReadWriteBarrier(); - { - std::unique_lock<std::mutex> lock(pContext->WaitLock); - pContext->dcRing.Enqueue(); - } - - if (pContext->threadInfo.SINGLE_THREADED) - { - uint32_t mxcsr = SetOptimalVectorCSR(); - - if (IsDraw) - { - uint32_t curDraw[2] = {pContext->pCurDrawContext->drawId, - pContext->pCurDrawContext->drawId}; - WorkOnFifoFE(pContext, 0, curDraw[0]); - WorkOnFifoBE(pContext, 0, curDraw[1], *pContext->pSingleThreadLockedTiles, 0, 0); - } - else - { - uint32_t curDispatch = pContext->pCurDrawContext->drawId; - WorkOnCompute(pContext, 0, curDispatch); - } - - // Dequeue the work here, if not already done, since we're single threaded (i.e. no - // workers). - while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) - { - } - - // restore csr - RestoreVectorCSR(mxcsr); - } - else - { - RDTSC_BEGIN(pContext->pBucketMgr, APIDrawWakeAllThreads, pDC->drawId); - WakeAllThreads(pContext); - RDTSC_END(pContext->pBucketMgr, APIDrawWakeAllThreads, 1); - } - - // Set current draw context to NULL so that next state call forces a new draw context to be - // created and populated. - pContext->pPrevDrawContext = pContext->pCurDrawContext; - pContext->pCurDrawContext = nullptr; -} - -INLINE void QueueDraw(SWR_CONTEXT* pContext) -{ - QueueWork<true>(pContext); -} - -INLINE void QueueDispatch(SWR_CONTEXT* pContext) -{ - QueueWork<false>(pContext); -} - -DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT* pContext, bool isSplitDraw = false) -{ - RDTSC_BEGIN(pContext->pBucketMgr, APIGetDrawContext, 0); - // If current draw context is null then need to obtain a new draw context to use from ring. - if (pContext->pCurDrawContext == nullptr) - { - // Need to wait for a free entry. 
- while (pContext->dcRing.IsFull()) - { - _mm_pause(); - } - - uint64_t curDraw = pContext->dcRing.GetHead(); - uint32_t dcIndex = curDraw % pContext->MAX_DRAWS_IN_FLIGHT; - - if ((pContext->frameCount - pContext->lastFrameChecked) > 2 || - (curDraw - pContext->lastDrawChecked) > 0x10000) - { - // Take this opportunity to clean-up old arena allocations - pContext->cachingArenaAllocator.FreeOldBlocks(); - - pContext->lastFrameChecked = pContext->frameCount; - pContext->lastDrawChecked = curDraw; - } - - DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; - pContext->pCurDrawContext = pCurDrawContext; - - // Assign next available entry in DS ring to this DC. - uint32_t dsIndex = pContext->curStateId % pContext->MAX_DRAWS_IN_FLIGHT; - pCurDrawContext->pState = &pContext->dsRing[dsIndex]; - - // Copy previous state to current state. - if (pContext->pPrevDrawContext) - { - DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext; - - // If we're splitting our draw then we can just use the same state from the previous - // draw. In this case, we won't increment the DS ring index so the next non-split - // draw can receive the state. - if (isSplitDraw == false) - { - CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState); - - // Should have been cleaned up previously - SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true); - - pCurDrawContext->pState->pPrivateState = nullptr; - - pContext->curStateId++; // Progress state ring index forward. - } - else - { - // If its a split draw then just copy the state pointer over - // since its the same draw. - pCurDrawContext->pState = pPrevDrawContext->pState; - SWR_ASSERT(pPrevDrawContext->cleanupState == false); - } - } - else - { - SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true); - pContext->curStateId++; // Progress state ring index forward. 
- } - - SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true); - - // Reset dependency - pCurDrawContext->dependent = false; - pCurDrawContext->dependentFE = false; - - pCurDrawContext->pContext = pContext; - pCurDrawContext->isCompute = false; // Dispatch has to set this to true. - - pCurDrawContext->doneFE = false; - pCurDrawContext->FeLock = 0; - pCurDrawContext->threadsDone = 0; - pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr; - - pCurDrawContext->dynState.Reset(pContext->NumWorkerThreads); - - // Assign unique drawId for this DC - pCurDrawContext->drawId = pContext->dcRing.GetHead(); - - pCurDrawContext->cleanupState = true; - } - else - { - SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC"); - } - - RDTSC_END(pContext->pBucketMgr, APIGetDrawContext, 0); - return pContext->pCurDrawContext; -} - -API_STATE* GetDrawState(SWR_CONTEXT* pContext) -{ - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - SWR_ASSERT(pDC->pState != nullptr); - - return &pDC->pState->state; -} - -void SwrDestroyContext(HANDLE hContext) -{ - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - - pDC->FeWork.type = SHUTDOWN; - pDC->FeWork.pfnWork = ProcessShutdown; - - // enqueue - QueueDraw(pContext); - - DestroyThreadPool(pContext, &pContext->threadPool); - - // free the fifos - for (uint32_t i = 0; i < pContext->MAX_DRAWS_IN_FLIGHT; ++i) - { - AlignedFree(pContext->dcRing[i].dynState.pStats); - delete pContext->dcRing[i].pArena; - delete pContext->dsRing[i].pArena; - pContext->pMacroTileManagerArray[i].~MacroTileMgr(); - pContext->pDispatchQueueArray[i].~DispatchQueue(); - } - - AlignedFree(pContext->pDispatchQueueArray); - AlignedFree(pContext->pMacroTileManagerArray); - - // Free scratch space. 
- for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) - { -#if defined(_WIN32) - VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE); -#else - AlignedFree(pContext->ppScratch[i]); -#endif - -#if defined(KNOB_ENABLE_AR) - ArchRast::DestroyThreadContext(pContext->pArContext[i]); -#endif - } - -#if defined(KNOB_ENABLE_RDTSC) - delete pContext->pBucketMgr; -#endif - - delete[] pContext->ppScratch; - AlignedFree(pContext->pStats); - - delete pContext->pHotTileMgr; - delete pContext->pSingleThreadLockedTiles; - - pContext->~SWR_CONTEXT(); - AlignedFree(GetContext(hContext)); -} - -void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId) -{ - SWR_CONTEXT* pContext = GetContext(hContext); - BindApiThread(pContext, apiThreadId); -} - -void SWR_API SwrSaveState(HANDLE hContext, void* pOutputStateBlock, size_t memSize) -{ - SWR_CONTEXT* pContext = GetContext(hContext); - auto pSrc = GetDrawState(pContext); - assert(pOutputStateBlock && memSize >= sizeof(*pSrc)); - - memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc)); -} - -void SWR_API SwrRestoreState(HANDLE hContext, const void* pStateBlock, size_t memSize) -{ - SWR_CONTEXT* pContext = GetContext(hContext); - auto pDst = GetDrawState(pContext); - assert(pStateBlock && memSize >= sizeof(*pDst)); - - memcpy((void*)pDst, (void*)pStateBlock, sizeof(*pDst)); -} - -void SetupDefaultState(SWR_CONTEXT* pContext) -{ - API_STATE* pState = GetDrawState(pContext); - - pState->rastState.cullMode = SWR_CULLMODE_NONE; - pState->rastState.frontWinding = SWR_FRONTWINDING_CCW; - - pState->depthBoundsState.depthBoundsTestEnable = false; - pState->depthBoundsState.depthBoundsTestMinValue = 0.0f; - pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f; -} - -void SWR_API SwrSync(HANDLE hContext, - PFN_CALLBACK_FUNC pfnFunc, - uint64_t userData, - uint64_t userData2, - uint64_t userData3) -{ - SWR_ASSERT(pfnFunc != nullptr); - - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - - 
RDTSC_BEGIN(pContext->pBucketMgr, APISync, 0); - - pDC->FeWork.type = SYNC; - pDC->FeWork.pfnWork = ProcessSync; - - // Setup callback function - pDC->retireCallback.pfnCallbackFunc = pfnFunc; - pDC->retireCallback.userData = userData; - pDC->retireCallback.userData2 = userData2; - pDC->retireCallback.userData3 = userData3; - - AR_API_EVENT(SwrSyncEvent(pDC->drawId)); - - // enqueue - QueueDraw(pContext); - - RDTSC_END(pContext->pBucketMgr, APISync, 1); -} - -void SwrStallBE(HANDLE hContext) -{ - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - - pDC->dependent = true; -} - -void SwrWaitForIdle(HANDLE hContext) -{ - SWR_CONTEXT* pContext = GetContext(hContext); - - RDTSC_BEGIN(pContext->pBucketMgr, APIWaitForIdle, 0); - - while (!pContext->dcRing.IsEmpty()) - { - _mm_pause(); - } - - RDTSC_END(pContext->pBucketMgr, APIWaitForIdle, 1); -} - -void SwrWaitForIdleFE(HANDLE hContext) -{ - SWR_CONTEXT* pContext = GetContext(hContext); - - RDTSC_BEGIN(pContext->pBucketMgr, APIWaitForIdle, 0); - - while (pContext->drawsOutstandingFE > 0) - { - _mm_pause(); - } - - RDTSC_END(pContext->pBucketMgr, APIWaitForIdle, 1); -} - -void SwrSetVertexBuffers(HANDLE hContext, - uint32_t numBuffers, - const SWR_VERTEX_BUFFER_STATE* pVertexBuffers) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - - for (uint32_t i = 0; i < numBuffers; ++i) - { - const SWR_VERTEX_BUFFER_STATE* pVB = &pVertexBuffers[i]; - pState->vertexBuffers[pVB->index] = *pVB; - } -} - -void SwrSetIndexBuffer(HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - - pState->indexBuffer = *pIndexBuffer; -} - -void SwrSetFetchFunc(HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - - pState->pfnFetchFunc = pfnFetchFunc; -} - -void SwrSetSoFunc(HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex) -{ - API_STATE* pState = 
GetDrawState(GetContext(hContext)); - - SWR_ASSERT(streamIndex < MAX_SO_STREAMS); - - pState->pfnSoFunc[streamIndex] = pfnSoFunc; -} - -void SwrSetSoState(HANDLE hContext, SWR_STREAMOUT_STATE* pSoState) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - - pState->soState = *pSoState; -} - -void SwrSetSoBuffers(HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - - SWR_ASSERT((slot < MAX_SO_STREAMS), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot); - - // remember buffer status in case of future resume StreamOut - if ((pState->soBuffer[slot].pBuffer != 0) && (pSoBuffer->pBuffer == 0)) - pState->soPausedBuffer[slot] = pState->soBuffer[slot]; - - // resume - if (pState->soPausedBuffer[slot].pBuffer == pSoBuffer->pBuffer) - pState->soBuffer[slot] = pState->soPausedBuffer[slot]; - else - pState->soBuffer[slot] = *pSoBuffer; -} - -void SwrSetVertexFunc(HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - - pState->pfnVertexFunc = pfnVertexFunc; -} - -void SwrSetFrontendState(HANDLE hContext, SWR_FRONTEND_STATE* pFEState) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - pState->frontendState = *pFEState; -} - -void SwrSetGsState(HANDLE hContext, SWR_GS_STATE* pGSState) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - pState->gsState = *pGSState; -} - -void SwrSetGsFunc(HANDLE hContext, PFN_GS_FUNC pfnGsFunc) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - pState->pfnGsFunc = pfnGsFunc; -} - -void SwrSetCsFunc(HANDLE hContext, - PFN_CS_FUNC pfnCsFunc, - uint32_t totalThreadsInGroup, - uint32_t totalSpillFillSize, - uint32_t scratchSpaceSizePerWarp, - uint32_t numWarps) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - pState->pfnCsFunc = pfnCsFunc; - pState->totalThreadsInGroup = totalThreadsInGroup; - pState->totalSpillFillSize = totalSpillFillSize; - 
pState->scratchSpaceSizePerWarp = scratchSpaceSizePerWarp; - pState->scratchSpaceNumWarps = numWarps; -} - -void SwrSetTsState(HANDLE hContext, SWR_TS_STATE* pState) -{ - API_STATE* pApiState = GetDrawState(GetContext(hContext)); - pApiState->tsState = *pState; -} - -void SwrSetHsFunc(HANDLE hContext, PFN_HS_FUNC pfnFunc) -{ - API_STATE* pApiState = GetDrawState(GetContext(hContext)); - pApiState->pfnHsFunc = pfnFunc; -} - -void SwrSetDsFunc(HANDLE hContext, PFN_DS_FUNC pfnFunc) -{ - API_STATE* pApiState = GetDrawState(GetContext(hContext)); - pApiState->pfnDsFunc = pfnFunc; -} - -void SwrSetDepthStencilState(HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pDSState) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - - pState->depthStencilState = *pDSState; -} - -void SwrSetBackendState(HANDLE hContext, SWR_BACKEND_STATE* pBEState) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - - pState->backendState = *pBEState; -} - -void SwrSetDepthBoundsState(HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pDBState) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - - pState->depthBoundsState = *pDBState; -} - -void SwrSetPixelShaderState(HANDLE hContext, SWR_PS_STATE* pPSState) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - pState->psState = *pPSState; -} - -void SwrSetBlendState(HANDLE hContext, SWR_BLEND_STATE* pBlendState) -{ - API_STATE* pState = GetDrawState(GetContext(hContext)); - memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE)); -} - -void SwrSetBlendFunc(HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc) -{ - SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS); - API_STATE* pState = GetDrawState(GetContext(hContext)); - pState->pfnBlendFunc[renderTarget] = pfnBlendFunc; -} - -// update guardband multipliers for the viewport -void updateGuardbands(API_STATE* pState) -{ - uint32_t numGbs = pState->backendState.readViewportArrayIndex ? 
KNOB_NUM_VIEWPORTS_SCISSORS : 1; - - for (uint32_t i = 0; i < numGbs; ++i) - { - // guardband center is viewport center - pState->gbState.left[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width; - pState->gbState.right[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width; - pState->gbState.top[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height; - pState->gbState.bottom[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height; - } -} - -void SwrSetRastState(HANDLE hContext, const SWR_RASTSTATE* pRastState) -{ - SWR_CONTEXT* pContext = GetContext(hContext); - API_STATE* pState = GetDrawState(pContext); - - memcpy((void*)&pState->rastState, (void*)pRastState, sizeof(SWR_RASTSTATE)); -} - -void SwrSetViewports(HANDLE hContext, - uint32_t numViewports, - const SWR_VIEWPORT* pViewports, - const SWR_VIEWPORT_MATRICES* pMatrices) -{ - SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of viewports."); - - SWR_CONTEXT* pContext = GetContext(hContext); - API_STATE* pState = GetDrawState(pContext); - - memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports); - // @todo Faster to copy portions of the SOA or just copy all of it? - memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES)); -} - -void SwrSetScissorRects(HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors) -{ - SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of scissor rects."); - - API_STATE* pState = GetDrawState(GetContext(hContext)); - memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(pScissors[0])); -}; - -void SetupMacroTileScissors(DRAW_CONTEXT* pDC) -{ - API_STATE* pState = &pDC->pState->state; - uint32_t numScissors = - pState->backendState.readViewportArrayIndex ? 
KNOB_NUM_VIEWPORTS_SCISSORS : 1; - pState->scissorsTileAligned = true; - - for (uint32_t index = 0; index < numScissors; ++index) - { - SWR_RECT& scissorInFixedPoint = pState->scissorsInFixedPoint[index]; - - // Set up scissor dimensions based on scissor or viewport - if (pState->rastState.scissorEnable) - { - scissorInFixedPoint = pState->scissorRects[index]; - } - else - { - // the vp width and height must be added to origin un-rounded then the result round to - // -inf. The cast to int works for rounding assuming all [left, right, top, bottom] are - // positive. - scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x; - scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width); - scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y; - scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height); - } - - // Clamp to max rect - scissorInFixedPoint &= g_MaxScissorRect; - - // Test for tile alignment - bool tileAligned; - tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0; - tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0; - tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0; - tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0; - - pState->scissorsTileAligned &= tileAligned; - - // Scale to fixed point - scissorInFixedPoint.xmin *= FIXED_POINT_SCALE; - scissorInFixedPoint.xmax *= FIXED_POINT_SCALE; - scissorInFixedPoint.ymin *= FIXED_POINT_SCALE; - scissorInFixedPoint.ymax *= FIXED_POINT_SCALE; - - // Make scissor inclusive - scissorInFixedPoint.xmax -= 1; - scissorInFixedPoint.ymax -= 1; - } -} - - -// templated backend function tables - -void SetupPipeline(DRAW_CONTEXT* pDC) -{ - DRAW_STATE* pState = pDC->pState; - const SWR_RASTSTATE& rastState = pState->state.rastState; - const SWR_PS_STATE& psState = pState->state.psState; - BACKEND_FUNCS& backendFuncs = pState->backendFuncs; - - // setup backend - if (psState.pfnPixelShader == nullptr) - { - 
backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount]; - } - else - { - const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0; - const bool bMultisampleEnable = - ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0; - const uint32_t centroid = - ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; - const uint32_t canEarlyZ = - (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 1 : 0; - SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask; - - // select backend function - switch (psState.shadingRate) - { - case SWR_SHADING_RATE_PIXEL: - if (bMultisampleEnable) - { - // always need to generate I & J per sample for Z interpolation - barycentricsMask = - (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = - gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern] - [psState.inputCoverage][centroid][forcedSampleCount] - [canEarlyZ] - ; - } - else - { - // always need to generate I & J per pixel for Z interpolation - barycentricsMask = - (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); - backendFuncs.pfnBackend = - gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ]; - } - break; - case SWR_SHADING_RATE_SAMPLE: - SWR_ASSERT(rastState.bIsCenterPattern != true); - // always need to generate I & J per sample for Z interpolation - barycentricsMask = - (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = - gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid] - [canEarlyZ]; - break; - default: - SWR_ASSERT(0 && "Invalid shading rate"); - break; - } - } - - SWR_ASSERT(backendFuncs.pfnBackend); - - PFN_PROCESS_PRIMS pfnBinner; -#if USE_SIMD16_FRONTEND - PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16; -#endif - switch (pState->state.topology) - { - case 
TOP_POINT_LIST: - pState->pfnProcessPrims = ClipPoints; - pfnBinner = BinPoints; -#if USE_SIMD16_FRONTEND - pState->pfnProcessPrims_simd16 = ClipPoints_simd16; - pfnBinner_simd16 = BinPoints_simd16; -#endif - break; - case TOP_LINE_LIST: - case TOP_LINE_STRIP: - case TOP_LINE_LOOP: - case TOP_LINE_LIST_ADJ: - case TOP_LISTSTRIP_ADJ: - pState->pfnProcessPrims = ClipLines; - pfnBinner = BinLines; -#if USE_SIMD16_FRONTEND - pState->pfnProcessPrims_simd16 = ClipLines_simd16; - pfnBinner_simd16 = BinLines_simd16; -#endif - break; - default: - pState->pfnProcessPrims = ClipTriangles; - pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0)); -#if USE_SIMD16_FRONTEND - pState->pfnProcessPrims_simd16 = ClipTriangles_simd16; - pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0)); -#endif - break; - }; - - - // Disable clipper if viewport transform is disabled or if clipper is disabled - if (pState->state.frontendState.vpTransformDisable || !pState->state.rastState.clipEnable) - { - pState->pfnProcessPrims = pfnBinner; -#if USE_SIMD16_FRONTEND - pState->pfnProcessPrims_simd16 = pfnBinner_simd16; -#endif - } - - // Disable rasterizer and backend if no pixel, no depth/stencil, and no attributes - if ((pState->state.psState.pfnPixelShader == nullptr) && - (pState->state.depthStencilState.depthTestEnable == FALSE) && - (pState->state.depthStencilState.depthWriteEnable == FALSE) && - (pState->state.depthStencilState.stencilTestEnable == FALSE) && - (pState->state.depthStencilState.stencilWriteEnable == FALSE) && - (pState->state.backendState.numAttributes == 0)) - { - pState->pfnProcessPrims = nullptr; -#if USE_SIMD16_FRONTEND - pState->pfnProcessPrims_simd16 = nullptr; -#endif - } - - if (pState->state.soState.rasterizerDisable == true) - { - pState->pfnProcessPrims = nullptr; -#if USE_SIMD16_FRONTEND - pState->pfnProcessPrims_simd16 = nullptr; -#endif - } - - - // set up the frontend attribute count - pState->state.feNumAttributes = 0; - 
const SWR_BACKEND_STATE& backendState = pState->state.backendState; - if (backendState.swizzleEnable) - { - // attribute swizzling is enabled, iterate over the map and record the max attribute used - for (uint32_t i = 0; i < backendState.numAttributes; ++i) - { - pState->state.feNumAttributes = - std::max(pState->state.feNumAttributes, - (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1); - } - } - else - { - pState->state.feNumAttributes = pState->state.backendState.numAttributes; - } - - if (pState->state.soState.soEnable) - { - uint64_t streamMasks = 0; - for (uint32_t i = 0; i < 4; ++i) - { - streamMasks |= pState->state.soState.streamMasks[i]; - } - - unsigned long maxAttrib; - if (_BitScanReverse64(&maxAttrib, streamMasks)) - { - pState->state.feNumAttributes = - std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1)); - } - } - - // complicated logic to test for cases where we don't need backing hottile memory for a draw - // have to check for the special case where depth/stencil test is enabled but depthwrite is - // disabled. - pState->state.depthHottileEnable = - ((!(pState->state.depthStencilState.depthTestEnable && - !pState->state.depthStencilState.depthWriteEnable && - !pState->state.depthBoundsState.depthBoundsTestEnable && - pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) && - (pState->state.depthStencilState.depthTestEnable || - pState->state.depthStencilState.depthWriteEnable || - pState->state.depthBoundsState.depthBoundsTestEnable)) - ? 
true - : false; - - pState->state.stencilHottileEnable = - (((!(pState->state.depthStencilState.stencilTestEnable && - !pState->state.depthStencilState.stencilWriteEnable && - pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) || - // for stencil we have to check the double sided state as well - (!(pState->state.depthStencilState.doubleSidedStencilTestEnable && - !pState->state.depthStencilState.stencilWriteEnable && - pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) && - (pState->state.depthStencilState.stencilTestEnable || - pState->state.depthStencilState.stencilWriteEnable)) - ? true - : false; - - uint32_t hotTileEnable = pState->state.psState.renderTargetMask; - - // Disable hottile for surfaces with no writes - if (psState.pfnPixelShader != nullptr) - { - unsigned long rt; - uint32_t rtMask = pState->state.psState.renderTargetMask; - while (_BitScanForward(&rt, rtMask)) - { - rtMask &= ~(1 << rt); - - if (pState->state.blendState.renderTarget[rt].writeDisableAlpha && - pState->state.blendState.renderTarget[rt].writeDisableRed && - pState->state.blendState.renderTarget[rt].writeDisableGreen && - pState->state.blendState.renderTarget[rt].writeDisableBlue) - { - hotTileEnable &= ~(1 << rt); - } - } - } - - pState->state.colorHottileEnable = hotTileEnable; - - // Setup depth quantization function - if (pState->state.depthHottileEnable) - { - switch (pState->state.rastState.depthFormat) - { - case R32_FLOAT_X8X24_TYPELESS: - pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT_X8X24_TYPELESS>; - break; - case R32_FLOAT: - pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>; - break; - case R24_UNORM_X8_TYPELESS: - pState->state.pfnQuantizeDepth = QuantizeDepth<R24_UNORM_X8_TYPELESS>; - break; - case R16_UNORM: - pState->state.pfnQuantizeDepth = QuantizeDepth<R16_UNORM>; - break; - default: - SWR_INVALID("Unsupported depth format for depth quantization."); - pState->state.pfnQuantizeDepth = 
QuantizeDepth<R32_FLOAT>; - } - } - else - { - // set up pass-through quantize if depth isn't enabled - pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>; - } - - // Generate guardbands - updateGuardbands(&pState->state); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief InitDraw -/// @param pDC - Draw context to initialize for this draw. -void InitDraw(DRAW_CONTEXT* pDC, bool isSplitDraw) -{ - // We don't need to re-setup the scissors/pipeline state again for split draw. - if (isSplitDraw == false) - { - SetupMacroTileScissors(pDC); - SetupPipeline(pDC); - } - -} - -////////////////////////////////////////////////////////////////////////// -/// @brief We can split the draw for certain topologies for better performance. -/// @param totalVerts - Total vertices for draw -/// @param topology - Topology used for draw -uint32_t MaxVertsPerDraw(DRAW_CONTEXT* pDC, uint32_t totalVerts, PRIMITIVE_TOPOLOGY topology) -{ - API_STATE& state = pDC->pState->state; - - // We can not split draws that have streamout enabled because there is no practical way - // to support multiple threads generating SO data for a single set of buffers. - if (state.soState.soEnable) - { - return totalVerts; - } - - // The Primitive Assembly code can only handle 1 RECT at a time. Specified with only 3 verts. - if (topology == TOP_RECT_LIST) - { - return 3; - } - - // Is split drawing disabled? 
- if (KNOB_DISABLE_SPLIT_DRAW) - { - return totalVerts; - } - - uint32_t vertsPerDraw = totalVerts; - - switch (topology) - { - case TOP_POINT_LIST: - case TOP_TRIANGLE_LIST: - vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW; - break; - - case TOP_PATCHLIST_1: - case TOP_PATCHLIST_2: - case TOP_PATCHLIST_3: - case TOP_PATCHLIST_4: - case TOP_PATCHLIST_5: - case TOP_PATCHLIST_6: - case TOP_PATCHLIST_7: - case TOP_PATCHLIST_8: - case TOP_PATCHLIST_9: - case TOP_PATCHLIST_10: - case TOP_PATCHLIST_11: - case TOP_PATCHLIST_12: - case TOP_PATCHLIST_13: - case TOP_PATCHLIST_14: - case TOP_PATCHLIST_15: - case TOP_PATCHLIST_16: - case TOP_PATCHLIST_17: - case TOP_PATCHLIST_18: - case TOP_PATCHLIST_19: - case TOP_PATCHLIST_20: - case TOP_PATCHLIST_21: - case TOP_PATCHLIST_22: - case TOP_PATCHLIST_23: - case TOP_PATCHLIST_24: - case TOP_PATCHLIST_25: - case TOP_PATCHLIST_26: - case TOP_PATCHLIST_27: - case TOP_PATCHLIST_28: - case TOP_PATCHLIST_29: - case TOP_PATCHLIST_30: - case TOP_PATCHLIST_31: - case TOP_PATCHLIST_32: - if (pDC->pState->state.tsState.tsEnable) - { - uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE; - vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW; - } - break; - default: - // We are not splitting up draws for other topologies. - break; - } - - return vertsPerDraw; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief DrawInstanced -/// @param hContext - Handle passed back from SwrCreateContext -/// @param topology - Specifies topology for draw. -/// @param numVerts - How many vertices to read sequentially from vertex data (per instance). -/// @param startVertex - Specifies start vertex for draw. (vertex data) -/// @param numInstances - How many instances to render. 
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer -/// (instanced data) -void DrawInstanced(HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numVertices, - uint32_t startVertex, - uint32_t numInstances = 1, - uint32_t startInstance = 0) -{ - if (KNOB_TOSS_DRAW) - { - return; - } - - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - - RDTSC_BEGIN(pContext->pBucketMgr, APIDraw, pDC->drawId); - - uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); - uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); - uint32_t remainingVerts = numVertices; - - API_STATE* pState = &pDC->pState->state; - pState->topology = topology; - pState->forceFront = false; - - // disable culling for points/lines - uint32_t oldCullMode = pState->rastState.cullMode; - if (topology == TOP_POINT_LIST) - { - pState->rastState.cullMode = SWR_CULLMODE_NONE; - pState->forceFront = true; - } - else if (topology == TOP_RECT_LIST) - { - pState->rastState.cullMode = SWR_CULLMODE_NONE; - } - - int draw = 0; - while (remainingVerts) - { - uint32_t numVertsForDraw = - (remainingVerts < maxVertsPerDraw) ? remainingVerts : maxVertsPerDraw; - - bool isSplitDraw = (draw > 0) ? 
!KNOB_DISABLE_SPLIT_DRAW : false; - DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw); - InitDraw(pDC, isSplitDraw); - - pDC->FeWork.type = DRAW; - pDC->FeWork.pfnWork = GetProcessDrawFunc(false, // IsIndexed - false, // bEnableCutIndex - pState->tsState.tsEnable, - pState->gsState.gsEnable, - pState->soState.soEnable, - pDC->pState->pfnProcessPrims != nullptr); - pDC->FeWork.desc.draw.numVerts = numVertsForDraw; - pDC->FeWork.desc.draw.startVertex = startVertex; - pDC->FeWork.desc.draw.numInstances = numInstances; - pDC->FeWork.desc.draw.startInstance = startInstance; - pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; - pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw; - - pDC->cleanupState = (remainingVerts == numVertsForDraw); - - // enqueue DC - QueueDraw(pContext); - - AR_API_EVENT(DrawInstancedEvent(pDC->drawId, - topology, - numVertsForDraw, - startVertex, - numInstances, - startInstance, - pState->tsState.tsEnable, - pState->gsState.gsEnable, - pState->soState.soEnable, - pState->gsState.outputTopology, - draw)); - - remainingVerts -= numVertsForDraw; - draw++; - } - - // restore culling state - pDC = GetDrawContext(pContext); - pDC->pState->state.rastState.cullMode = oldCullMode; - - RDTSC_END(pContext->pBucketMgr, APIDraw, numVertices * numInstances); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrDraw -/// @param hContext - Handle passed back from SwrCreateContext -/// @param topology - Specifies topology for draw. -/// @param startVertex - Specifies start vertex in vertex buffer for draw. -/// @param primCount - Number of vertices. 
-void SwrDraw(HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t startVertex, - uint32_t numVertices) -{ - DrawInstanced(hContext, topology, numVertices, startVertex); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrDrawInstanced -/// @param hContext - Handle passed back from SwrCreateContext -/// @param topology - Specifies topology for draw. -/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. -/// @param numInstances - How many instances to render. -/// @param startVertex - Specifies start vertex for draw. (vertex data) -/// @param startInstance - Which instance to start sequentially fetching from in each buffer -/// (instanced data) -void SwrDrawInstanced(HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numVertsPerInstance, - uint32_t numInstances, - uint32_t startVertex, - uint32_t startInstance) -{ - DrawInstanced( - hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief DrawIndexedInstanced -/// @param hContext - Handle passed back from SwrCreateContext -/// @param topology - Specifies topology for draw. -/// @param numIndices - Number of indices to read sequentially from index buffer. -/// @param indexOffset - Starting index into index buffer. -/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. -/// @param numInstances - Number of instances to render. 
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer -/// (instanced data) -void DrawIndexedInstance(HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numIndices, - uint32_t indexOffset, - int32_t baseVertex, - uint32_t numInstances = 1, - uint32_t startInstance = 0) -{ - if (KNOB_TOSS_DRAW) - { - return; - } - - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - API_STATE* pState = &pDC->pState->state; - - RDTSC_BEGIN(pContext->pBucketMgr, APIDrawIndexed, pDC->drawId); - - uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); - uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); - uint32_t remainingIndices = numIndices; - - uint32_t indexSize = 0; - switch (pState->indexBuffer.format) - { - case R32_UINT: - indexSize = sizeof(uint32_t); - break; - case R16_UINT: - indexSize = sizeof(uint16_t); - break; - case R8_UINT: - indexSize = sizeof(uint8_t); - break; - default: - SWR_INVALID("Invalid index buffer format: %d", pState->indexBuffer.format); - } - - int draw = 0; - gfxptr_t xpIB = pState->indexBuffer.xpIndices; - xpIB += (uint64_t)indexOffset * (uint64_t)indexSize; - - pState->topology = topology; - pState->forceFront = false; - - // disable culling for points/lines - uint32_t oldCullMode = pState->rastState.cullMode; - if (topology == TOP_POINT_LIST) - { - pState->rastState.cullMode = SWR_CULLMODE_NONE; - pState->forceFront = true; - } - else if (topology == TOP_RECT_LIST) - { - pState->rastState.cullMode = SWR_CULLMODE_NONE; - } - - while (remainingIndices) - { - uint32_t numIndicesForDraw = - (remainingIndices < maxIndicesPerDraw) ? remainingIndices : maxIndicesPerDraw; - - // When breaking up draw, we need to obtain new draw context for each iteration. - bool isSplitDraw = (draw > 0) ? 
!KNOB_DISABLE_SPLIT_DRAW : false; - - pDC = GetDrawContext(pContext, isSplitDraw); - InitDraw(pDC, isSplitDraw); - - pDC->FeWork.type = DRAW; - pDC->FeWork.pfnWork = GetProcessDrawFunc(true, // IsIndexed - pState->frontendState.bEnableCutIndex, - pState->tsState.tsEnable, - pState->gsState.gsEnable, - pState->soState.soEnable, - pDC->pState->pfnProcessPrims != nullptr); - pDC->FeWork.desc.draw.pDC = pDC; - pDC->FeWork.desc.draw.numIndices = numIndicesForDraw; - pDC->FeWork.desc.draw.xpIB = xpIB; - pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format; - - pDC->FeWork.desc.draw.numInstances = numInstances; - pDC->FeWork.desc.draw.startInstance = startInstance; - pDC->FeWork.desc.draw.baseVertex = baseVertex; - pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; - - pDC->cleanupState = (remainingIndices == numIndicesForDraw); - - // enqueue DC - QueueDraw(pContext); - - AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, - topology, - numIndicesForDraw, - indexOffset, - baseVertex, - numInstances, - startInstance, - pState->tsState.tsEnable, - pState->gsState.gsEnable, - pState->soState.soEnable, - pState->gsState.outputTopology, - draw)); - - xpIB += maxIndicesPerDraw * indexSize; - remainingIndices -= numIndicesForDraw; - draw++; - } - - // Restore culling state - pDC = GetDrawContext(pContext); - pDC->pState->state.rastState.cullMode = oldCullMode; - - RDTSC_END(pContext->pBucketMgr, APIDrawIndexed, numIndices * numInstances); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief DrawIndexed -/// @param hContext - Handle passed back from SwrCreateContext -/// @param topology - Specifies topology for draw. -/// @param numIndices - Number of indices to read sequentially from index buffer. -/// @param indexOffset - Starting index into index buffer. -/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 
-void SwrDrawIndexed(HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numIndices, - uint32_t indexOffset, - int32_t baseVertex) -{ - DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrDrawIndexedInstanced -/// @param hContext - Handle passed back from SwrCreateContext -/// @param topology - Specifies topology for draw. -/// @param numIndices - Number of indices to read sequentially from index buffer. -/// @param numInstances - Number of instances to render. -/// @param indexOffset - Starting index into index buffer. -/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. -/// @param startInstance - Which instance to start sequentially fetching from in each buffer -/// (instanced data) -void SwrDrawIndexedInstanced(HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numIndices, - uint32_t numInstances, - uint32_t indexOffset, - int32_t baseVertex, - uint32_t startInstance) -{ - DrawIndexedInstance( - hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrInvalidateTiles -/// @param hContext - Handle passed back from SwrCreateContext -/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to -/// invalidate. -/// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to -/// be hottile size-aligned. 
-void SWR_API SwrInvalidateTiles(HANDLE hContext, - uint32_t attachmentMask, - const SWR_RECT& invalidateRect) -{ - if (KNOB_TOSS_DRAW) - { - return; - } - - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - - pDC->FeWork.type = DISCARDINVALIDATETILES; - pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; - pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; - pDC->FeWork.desc.discardInvalidateTiles.rect = invalidateRect; - pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect; - pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID; - pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false; - pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false; - - // enqueue - QueueDraw(pContext); - - AR_API_EVENT(SwrInvalidateTilesEvent(pDC->drawId)); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrDiscardRect -/// @param hContext - Handle passed back from SwrCreateContext -/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard. -/// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be -/// discarded. 
-void SWR_API SwrDiscardRect(HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect) -{ - if (KNOB_TOSS_DRAW) - { - return; - } - - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - - // Queue a load to the hottile - pDC->FeWork.type = DISCARDINVALIDATETILES; - pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; - pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; - pDC->FeWork.desc.discardInvalidateTiles.rect = rect; - pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect; - pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED; - pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true; - pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true; - - // enqueue - QueueDraw(pContext); - - AR_API_EVENT(SwrDiscardRectEvent(pDC->drawId)); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrDispatch -/// @param hContext - Handle passed back from SwrCreateContext -/// @param threadGroupCountX - Number of thread groups dispatched in X direction -/// @param threadGroupCountY - Number of thread groups dispatched in Y direction -/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction -void SwrDispatch(HANDLE hContext, - uint32_t threadGroupCountX, - uint32_t threadGroupCountY, - uint32_t threadGroupCountZ - -) -{ - if (KNOB_TOSS_DRAW) - { - return; - } - - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - - RDTSC_BEGIN(pContext->pBucketMgr, APIDispatch, pDC->drawId); - AR_API_EVENT( - DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ)); - pDC->isCompute = true; // This is a compute context. 
- - COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64); - - pTaskData->threadGroupCountX = threadGroupCountX; - pTaskData->threadGroupCountY = threadGroupCountY; - pTaskData->threadGroupCountZ = threadGroupCountZ; - - pTaskData->enableThreadDispatch = false; - - uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; - uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT; - pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex]; - pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE); - - QueueDispatch(pContext); - RDTSC_END(pContext->pBucketMgr, - APIDispatch, - threadGroupCountX * threadGroupCountY * threadGroupCountZ); -} - -// Deswizzles, converts and stores current contents of the hot tiles to surface -// described by pState -void SWR_API SwrStoreTiles(HANDLE hContext, - uint32_t attachmentMask, - SWR_TILE_STATE postStoreTileState, - const SWR_RECT& storeRect) -{ - if (KNOB_TOSS_DRAW) - { - return; - } - - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - - RDTSC_BEGIN(pContext->pBucketMgr, APIStoreTiles, pDC->drawId); - - pDC->FeWork.type = STORETILES; - pDC->FeWork.pfnWork = ProcessStoreTiles; - pDC->FeWork.desc.storeTiles.attachmentMask = attachmentMask; - pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState; - pDC->FeWork.desc.storeTiles.rect = storeRect; - pDC->FeWork.desc.storeTiles.rect &= g_MaxScissorRect; - - // enqueue - QueueDraw(pContext); - - AR_API_EVENT(SwrStoreTilesEvent(pDC->drawId)); - - RDTSC_END(pContext->pBucketMgr, APIStoreTiles, 1); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil -/// @param hContext - Handle passed back from SwrCreateContext -/// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear -/// @param 
renderTargetArrayIndex - the RT array index to clear -/// @param clearColor - color use for clearing render targets -/// @param z - depth value use for clearing depth buffer -/// @param stencil - stencil value used for clearing stencil buffer -/// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers -void SWR_API SwrClearRenderTarget(HANDLE hContext, - uint32_t attachmentMask, - uint32_t renderTargetArrayIndex, - const float clearColor[4], - float z, - uint8_t stencil, - const SWR_RECT& clearRect) -{ - if (KNOB_TOSS_DRAW) - { - return; - } - - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - - RDTSC_BEGIN(pContext->pBucketMgr, APIClearRenderTarget, pDC->drawId); - - pDC->FeWork.type = CLEAR; - pDC->FeWork.pfnWork = ProcessClear; - pDC->FeWork.desc.clear.rect = clearRect; - pDC->FeWork.desc.clear.rect &= g_MaxScissorRect; - pDC->FeWork.desc.clear.attachmentMask = attachmentMask; - pDC->FeWork.desc.clear.renderTargetArrayIndex = renderTargetArrayIndex; - pDC->FeWork.desc.clear.clearDepth = z; - pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0]; - pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1]; - pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2]; - pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3]; - pDC->FeWork.desc.clear.clearStencil = stencil; - - // enqueue draw - QueueDraw(pContext); - - RDTSC_END(pContext->pBucketMgr, APIClearRenderTarget, 1); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Returns a pointer to the private context state for the current -/// draw operation. This is used for external componets such as the -/// sampler. -/// SWR is responsible for the allocation of the private context state. 
-/// @param hContext - Handle passed back from SwrCreateContext -VOID* SwrGetPrivateContextState(HANDLE hContext) -{ - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - DRAW_STATE* pState = pDC->pState; - - if (pState->pPrivateState == nullptr) - { - pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, - KNOB_SIMD_WIDTH * sizeof(float)); - } - - return pState->pPrivateState; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Clients can use this to allocate memory for draw/dispatch -/// operations. The memory will automatically be freed once operation -/// has completed. Client can use this to allocate binding tables, -/// etc. needed for shader execution. -/// @param hContext - Handle passed back from SwrCreateContext -/// @param size - Size of allocation -/// @param align - Alignment needed for allocation. -VOID* SwrAllocDrawContextMemory(HANDLE hContext, uint32_t size, uint32_t align) -{ - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - - return pDC->pState->pArena->AllocAligned(size, align); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Enables stats counting -/// @param hContext - Handle passed back from SwrCreateContext -/// @param enable - If true then counts are incremented. -void SwrEnableStatsFE(HANDLE hContext, bool enable) -{ - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - - pDC->pState->state.enableStatsFE = enable; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Enables stats counting -/// @param hContext - Handle passed back from SwrCreateContext -/// @param enable - If true then counts are incremented. 
-void SwrEnableStatsBE(HANDLE hContext, bool enable) -{ - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - - pDC->pState->state.enableStatsBE = enable; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Mark end of frame - used for performance profiling -/// @param hContext - Handle passed back from SwrCreateContext -void SWR_API SwrEndFrame(HANDLE hContext) -{ - SWR_CONTEXT* pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - (void)pDC; // var used - - RDTSC_ENDFRAME(pContext->pBucketMgr); - AR_API_EVENT(FrameEndEvent(pContext->frameCount, pDC->drawId)); - - pContext->frameCount++; -} - -void InitSimLoadTilesTable(); -void InitSimStoreTilesTable(); -void InitSimClearTilesTable(); - -void InitClearTilesTable(); -void InitBackendFuncTables(); - -////////////////////////////////////////////////////////////////////////// -/// @brief Initialize swr backend and memory internal tables -void SwrInit() -{ - InitClearTilesTable(); - InitBackendFuncTables(); - InitRasterizerFunctions(); -} - -void SwrGetInterface(SWR_INTERFACE& out_funcs) -{ - out_funcs.pfnSwrCreateContext = SwrCreateContext; - out_funcs.pfnSwrDestroyContext = SwrDestroyContext; - out_funcs.pfnSwrBindApiThread = SwrBindApiThread; - out_funcs.pfnSwrSaveState = SwrSaveState; - out_funcs.pfnSwrRestoreState = SwrRestoreState; - out_funcs.pfnSwrSync = SwrSync; - out_funcs.pfnSwrStallBE = SwrStallBE; - out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle; - out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE; - out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers; - out_funcs.pfnSwrSetIndexBuffer = SwrSetIndexBuffer; - out_funcs.pfnSwrSetFetchFunc = SwrSetFetchFunc; - out_funcs.pfnSwrSetSoFunc = SwrSetSoFunc; - out_funcs.pfnSwrSetSoState = SwrSetSoState; - out_funcs.pfnSwrSetSoBuffers = SwrSetSoBuffers; - out_funcs.pfnSwrSetVertexFunc = SwrSetVertexFunc; - out_funcs.pfnSwrSetFrontendState = 
SwrSetFrontendState; - out_funcs.pfnSwrSetGsState = SwrSetGsState; - out_funcs.pfnSwrSetGsFunc = SwrSetGsFunc; - out_funcs.pfnSwrSetCsFunc = SwrSetCsFunc; - out_funcs.pfnSwrSetTsState = SwrSetTsState; - out_funcs.pfnSwrSetHsFunc = SwrSetHsFunc; - out_funcs.pfnSwrSetDsFunc = SwrSetDsFunc; - out_funcs.pfnSwrSetDepthStencilState = SwrSetDepthStencilState; - out_funcs.pfnSwrSetBackendState = SwrSetBackendState; - out_funcs.pfnSwrSetDepthBoundsState = SwrSetDepthBoundsState; - out_funcs.pfnSwrSetPixelShaderState = SwrSetPixelShaderState; - out_funcs.pfnSwrSetBlendState = SwrSetBlendState; - out_funcs.pfnSwrSetBlendFunc = SwrSetBlendFunc; - out_funcs.pfnSwrDraw = SwrDraw; - out_funcs.pfnSwrDrawInstanced = SwrDrawInstanced; - out_funcs.pfnSwrDrawIndexed = SwrDrawIndexed; - out_funcs.pfnSwrDrawIndexedInstanced = SwrDrawIndexedInstanced; - out_funcs.pfnSwrInvalidateTiles = SwrInvalidateTiles; - out_funcs.pfnSwrDiscardRect = SwrDiscardRect; - out_funcs.pfnSwrDispatch = SwrDispatch; - out_funcs.pfnSwrStoreTiles = SwrStoreTiles; - out_funcs.pfnSwrClearRenderTarget = SwrClearRenderTarget; - out_funcs.pfnSwrSetRastState = SwrSetRastState; - out_funcs.pfnSwrSetViewports = SwrSetViewports; - out_funcs.pfnSwrSetScissorRects = SwrSetScissorRects; - out_funcs.pfnSwrGetPrivateContextState = SwrGetPrivateContextState; - out_funcs.pfnSwrAllocDrawContextMemory = SwrAllocDrawContextMemory; - out_funcs.pfnSwrEnableStatsFE = SwrEnableStatsFE; - out_funcs.pfnSwrEnableStatsBE = SwrEnableStatsBE; - out_funcs.pfnSwrEndFrame = SwrEndFrame; - out_funcs.pfnSwrInit = SwrInit; -} diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h deleted file mode 100644 index 79e33b01677..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ /dev/null @@ -1,772 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file api.h - * - * @brief API definitions - * - ******************************************************************************/ - -#ifndef __SWR_API_H__ -#define __SWR_API_H__ - -#include "common/os.h" - -#include <assert.h> -#include <algorithm> - -#include "common/intrin.h" -#include "common/formats.h" -#include "core/state.h" - -typedef void(SWR_API* PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t data3); - -////////////////////////////////////////////////////////////////////////// -/// @brief Rectangle structure -struct SWR_RECT -{ - int32_t xmin; ///< inclusive - int32_t ymin; ///< inclusive - int32_t xmax; ///< exclusive - int32_t ymax; ///< exclusive - - bool operator==(const SWR_RECT& rhs) - { - return (this->ymin == rhs.ymin && this->ymax == rhs.ymax && this->xmin == rhs.xmin && - this->xmax == rhs.xmax); - } - - bool operator!=(const SWR_RECT& rhs) { return !(*this == rhs); } - - SWR_RECT& Intersect(const SWR_RECT& other) - { - this->xmin = std::max(this->xmin, other.xmin); - this->ymin = std::max(this->ymin, other.ymin); - this->xmax = std::min(this->xmax, other.xmax); - this->ymax = std::min(this->ymax, other.ymax); - - if (xmax - xmin < 0 || ymax - ymin < 0) - { - // Zero area - ymin = ymax = xmin = xmax = 0; - } - - return *this; - } - SWR_RECT& operator&=(const SWR_RECT& other) { return Intersect(other); } - - SWR_RECT& Union(const SWR_RECT& other) - { - this->xmin = std::min(this->xmin, other.xmin); - this->ymin = std::min(this->ymin, other.ymin); - this->xmax = std::max(this->xmax, other.xmax); - this->ymax = std::max(this->ymax, other.ymax); - - return *this; - } - - SWR_RECT& operator|=(const SWR_RECT& other) { return Union(other); } - - void Translate(int32_t x, int32_t y) - { - xmin += x; - ymin += y; - xmax += x; - ymax += y; - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Function signature for load hot tiles -/// @param hDC - handle to DRAW_CONTEXT -/// @param dstFormat - 
format of the hot tile -/// @param renderTargetIndex - render target to store, can be color, depth or stencil -/// @param x - destination x coordinate -/// @param y - destination y coordinate -/// @param pDstHotTile - pointer to the hot tile surface -typedef void(SWR_API* PFN_LOAD_TILE)(HANDLE hDC, - HANDLE hWorkerPrivateData, - SWR_FORMAT dstFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, - uint32_t y, - uint32_t renderTargetArrayIndex, - uint8_t* pDstHotTile); - -////////////////////////////////////////////////////////////////////////// -/// @brief Function signature for store hot tiles -/// @param hDC - handle to DRAW_CONTEXT -/// @param srcFormat - format of the hot tile -/// @param renderTargetIndex - render target to store, can be color, depth or stencil -/// @param x - destination x coordinate -/// @param y - destination y coordinate -/// @param pSrcHotTile - pointer to the hot tile surface -typedef void(SWR_API* PFN_STORE_TILE)(HANDLE hDC, - HANDLE hWorkerPrivateData, - SWR_FORMAT srcFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, - uint32_t y, - uint32_t renderTargetArrayIndex, - uint8_t* pSrcHotTile); - -////////////////////////////////////////////////////////////////////////// -/// @brief Function signature for clearing from the hot tiles clear value -/// @param hPrivateContext - handle to private data -/// @param renderTargetIndex - render target to store, can be color, depth or stencil -/// @param x - destination x coordinate -/// @param y - destination y coordinate -/// @param renderTargetArrayIndex - render target array offset from arrayIndex -/// @param pClearColor - pointer to the hot tile's clear value -typedef void(SWR_API* PFN_CLEAR_TILE)(HANDLE hPrivateContext, - HANDLE hWorkerPrivateData, - SWR_RENDERTARGET_ATTACHMENT rtIndex, - uint32_t x, - uint32_t y, - uint32_t renderTargetArrayIndex, - const float* pClearColor); - -typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_READ)(HANDLE hPrivateContext, - 
gfxptr_t xpAddr, - bool* pbNullTileAccessed, - HANDLE hPrivateWorkerData); - -typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_WRITE)(HANDLE hPrivateContext, - gfxptr_t xpAddr, - bool* pbNullTileAccessed, - HANDLE hPrivateWorkerData); - -typedef gfxptr_t(SWR_API* PFN_MAKE_GFXPTR)(HANDLE hPrivateContext, void* sysAddr); - -typedef HANDLE(SWR_API* PFN_CREATE_MEMORY_CONTEXT)(HANDLE hExternalMemory); - -typedef void(SWR_API* PFN_DESTROY_MEMORY_CONTEXT)(HANDLE hExternalMemory, HANDLE hMemoryContext); - -////////////////////////////////////////////////////////////////////////// -/// @brief Callback to allow driver to update their copy of streamout write offset. -/// This is call is made for any draw operation that has streamout enabled -/// and has updated the write offset. -/// @param hPrivateContext - handle to private data -/// @param soBufferSlot - buffer slot for write offset -/// @param soWriteOffset - update value for so write offset. -typedef void(SWR_API* PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext, - uint32_t soBufferSlot, - uint32_t soWriteOffset); - -////////////////////////////////////////////////////////////////////////// -/// @brief Callback to allow driver to update their copy of stats. -/// @param hPrivateContext - handle to private data -/// @param pStats - pointer to draw stats -typedef void(SWR_API* PFN_UPDATE_STATS)(HANDLE hPrivateContext, const SWR_STATS* pStats); - -////////////////////////////////////////////////////////////////////////// -/// @brief Callback to allow driver to update their copy of FE stats. -/// @note Its optimal to have a separate callback for FE stats since -/// there is only one DC per FE thread. This means we do not have -/// to sum up the stats across all of the workers. 
-/// @param hPrivateContext - handle to private data -/// @param pStats - pointer to draw stats -typedef void(SWR_API* PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext, const SWR_STATS_FE* pStats); - -////////////////////////////////////////////////////////////////////////// -/// @brief Callback to allow driver to update StreamOut status -/// @param hPrivateContext - handle to private data -/// @param numPrims - number of primitives written to StreamOut buffer -typedef void(SWR_API* PFN_UPDATE_STREAMOUT)(HANDLE hPrivateContext, uint64_t numPrims); - -////////////////////////////////////////////////////////////////////////// -/// BucketManager -/// Forward Declaration (see rdtsc_buckets.h for full definition) -///////////////////////////////////////////////////////////////////////// -class BucketManager; - -////////////////////////////////////////////////////////////////////////// -/// SWR_THREADING_INFO -///////////////////////////////////////////////////////////////////////// -struct SWR_THREADING_INFO -{ - uint32_t BASE_NUMA_NODE; - uint32_t BASE_CORE; - uint32_t BASE_THREAD; - uint32_t MAX_WORKER_THREADS; - uint32_t MAX_NUMA_NODES; - uint32_t MAX_CORES_PER_NUMA_NODE; - uint32_t MAX_THREADS_PER_CORE; - bool SINGLE_THREADED; -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_API_THREADING_INFO -/// Data used to reserve HW threads for API use -/// API Threads are reserved from numa nodes / cores used for -/// SWR Worker threads. Specifying reserved threads here can reduce -/// the total number of SWR worker threads. 
-///////////////////////////////////////////////////////////////////////// -struct SWR_API_THREADING_INFO -{ - uint32_t numAPIReservedThreads; // Default is 1 if SWR_API_THREADING_INFO is not sent - uint32_t bindAPIThread0; // Default is true if numAPIReservedThreads is > 0, - // binds thread used in SwrCreateContext to API Reserved - // thread 0 - uint32_t numAPIThreadsPerCore; // 0 - means use all threads per core, else clamp to this number. - // Independent of KNOB_MAX_THREADS_PER_CORE. -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_CONTEXT -/// Forward Declaration (see context.h for full definition) -///////////////////////////////////////////////////////////////////////// -struct SWR_CONTEXT; - -////////////////////////////////////////////////////////////////////////// -/// SWR_WORKER_PRIVATE_STATE -/// Data used to allocate per-worker thread private data. A pointer -/// to this data will be passed in to each shader function. -/// The first field of this private data must be SWR_WORKER_DATA -/// perWorkerPrivateStateSize must be >= sizeof SWR_WORKER_DATA -///////////////////////////////////////////////////////////////////////// -struct SWR_WORKER_PRIVATE_STATE -{ - typedef void(SWR_API* PFN_WORKER_DATA)(SWR_CONTEXT* pContext, HANDLE hWorkerPrivateData, uint32_t iWorkerNum); - - size_t perWorkerPrivateStateSize; ///< Amount of data to allocate per-worker - PFN_WORKER_DATA pfnInitWorkerData; ///< Init function for worker data. If null - ///< worker data will be initialized to 0. - PFN_WORKER_DATA pfnFinishWorkerData; ///< Finish / destroy function for worker data. - ///< Can be null. -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_CREATECONTEXT_INFO -///////////////////////////////////////////////////////////////////////// -struct SWR_CREATECONTEXT_INFO -{ - // External functions (e.g. sampler) need per draw context state. 
- // Use SwrGetPrivateContextState() to access private state. - size_t privateStateSize; - - // Optional per-worker state, can be NULL for no worker-private data - SWR_WORKER_PRIVATE_STATE* pWorkerPrivateState; - - // Callback functions - PFN_LOAD_TILE pfnLoadTile; - PFN_STORE_TILE pfnStoreTile; - PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead; - PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite; - PFN_MAKE_GFXPTR pfnMakeGfxPtr; - PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext; - PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext; - PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; - PFN_UPDATE_STATS pfnUpdateStats; - PFN_UPDATE_STATS_FE pfnUpdateStatsFE; - PFN_UPDATE_STREAMOUT pfnUpdateStreamOut; - - - // Pointer to rdtsc buckets mgr returned to the caller. - // Only populated when KNOB_ENABLE_RDTSC is set - BucketManager* pBucketMgr; - - // Output: size required memory passed to for SwrSaveState / SwrRestoreState - size_t contextSaveSize; - - // ArchRast event manager. - HANDLE hArEventManager; - - // handle to external memory for worker data to create memory contexts - HANDLE hExternalMemory; - - // Input (optional): Threading info that overrides any set KNOB values. - SWR_THREADING_INFO* pThreadInfo; - - // Input (optional): Info for reserving API threads - SWR_API_THREADING_INFO* pApiThreadInfo; - - // Input: if set to non-zero value, overrides KNOB value for maximum - // number of draws in flight - uint32_t MAX_DRAWS_IN_FLIGHT; - - std::string contextName; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Create SWR Context. -/// @param pCreateInfo - pointer to creation info. -SWR_FUNC(HANDLE, SwrCreateContext, SWR_CREATECONTEXT_INFO* pCreateInfo); - -////////////////////////////////////////////////////////////////////////// -/// @brief Destroys SWR Context. 
-/// @param hContext - Handle passed back from SwrCreateContext -SWR_FUNC(void, SwrDestroyContext, HANDLE hContext); - -////////////////////////////////////////////////////////////////////////// -/// @brief Bind current thread to an API reserved HW thread -/// @param hContext - Handle passed back from SwrCreateContext -/// @param apiThreadId - index of reserved HW thread to bind to. -SWR_FUNC(void, SwrBindApiThread, HANDLE hContext, uint32_t apiThreadId); - -////////////////////////////////////////////////////////////////////////// -/// @brief Saves API state associated with hContext -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pOutputStateBlock - Memory block to receive API state data -/// @param memSize - Size of memory pointed to by pOutputStateBlock -SWR_FUNC(void, SwrSaveState, HANDLE hContext, void* pOutputStateBlock, size_t memSize); - -////////////////////////////////////////////////////////////////////////// -/// @brief Restores API state to hContext previously saved with SwrSaveState -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pStateBlock - Memory block to read API state data from -/// @param memSize - Size of memory pointed to by pStateBlock -SWR_FUNC(void, SwrRestoreState, HANDLE hContext, const void* pStateBlock, size_t memSize); - -////////////////////////////////////////////////////////////////////////// -/// @brief Sync cmd. Executes the callback func when all rendering up to this sync -/// has been completed -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pfnFunc - pointer to callback function, -/// @param userData - user data to pass back -SWR_FUNC(void, - SwrSync, - HANDLE hContext, - PFN_CALLBACK_FUNC pfnFunc, - uint64_t userData, - uint64_t userData2, - uint64_t userData3); - -////////////////////////////////////////////////////////////////////////// -/// @brief Stall cmd. Stalls the backend until all previous work has been completed. 
-/// Frontend work can continue to make progress -/// @param hContext - Handle passed back from SwrCreateContext -SWR_FUNC(void, SwrStallBE, HANDLE hContext); - -////////////////////////////////////////////////////////////////////////// -/// @brief Blocks until all rendering has been completed. -/// @param hContext - Handle passed back from SwrCreateContext -SWR_FUNC(void, SwrWaitForIdle, HANDLE hContext); - -////////////////////////////////////////////////////////////////////////// -/// @brief Blocks until all FE rendering has been completed. -/// @param hContext - Handle passed back from SwrCreateContext -SWR_FUNC(void, SwrWaitForIdleFE, HANDLE hContext); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set vertex buffer state. -/// @param hContext - Handle passed back from SwrCreateContext -/// @param numBuffers - Number of vertex buffer state descriptors. -/// @param pVertexBuffers - Array of vertex buffer state descriptors. -SWR_FUNC(void, - SwrSetVertexBuffers, - HANDLE hContext, - uint32_t numBuffers, - const SWR_VERTEX_BUFFER_STATE* pVertexBuffers); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set index buffer -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pIndexBuffer - Index buffer. -SWR_FUNC(void, SwrSetIndexBuffer, HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set fetch shader pointer. -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pfnFetchFunc - Pointer to shader. -SWR_FUNC(void, SwrSetFetchFunc, HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set streamout shader pointer. -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pfnSoFunc - Pointer to shader. 
-/// @param streamIndex - specifies stream -SWR_FUNC(void, SwrSetSoFunc, HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set streamout state -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pSoState - Pointer to streamout state. -SWR_FUNC(void, SwrSetSoState, HANDLE hContext, SWR_STREAMOUT_STATE* pSoState); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set streamout buffer state -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pSoBuffer - Pointer to streamout buffer. -/// @param slot - Slot to bind SO buffer to. -SWR_FUNC(void, SwrSetSoBuffers, HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set vertex shader pointer. -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pfnVertexFunc - Pointer to shader. -SWR_FUNC(void, SwrSetVertexFunc, HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set frontend state. -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pState - Pointer to state -SWR_FUNC(void, SwrSetFrontendState, HANDLE hContext, SWR_FRONTEND_STATE* pState); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set geometry shader state. 
-/// @param hContext - Handle passed back from SwrCreateContext -/// @param pState - Pointer to state -SWR_FUNC(void, SwrSetGsState, HANDLE hContext, SWR_GS_STATE* pState); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set geometry shader -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pState - Pointer to geometry shader function -SWR_FUNC(void, SwrSetGsFunc, HANDLE hContext, PFN_GS_FUNC pfnGsFunc); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set compute shader -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pfnCsFunc - Pointer to compute shader function -/// @param totalThreadsInGroup - product of thread group dimensions. -/// @param totalSpillFillSize - size in bytes needed for spill/fill. -/// @param scratchSpaceSizePerInstance - size of the scratch space needed per simd instance -/// @param numInstances - number of simd instances that are run per execution of the shader -SWR_FUNC(void, - SwrSetCsFunc, - HANDLE hContext, - PFN_CS_FUNC pfnCsFunc, - uint32_t totalThreadsInGroup, - uint32_t totalSpillFillSize, - uint32_t scratchSpaceSizePerInstance, - uint32_t numInstances); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set tessellation state. 
-/// @param hContext - Handle passed back from SwrCreateContext -/// @param pState - Pointer to state -SWR_FUNC(void, SwrSetTsState, HANDLE hContext, SWR_TS_STATE* pState); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set hull shader -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pfnFunc - Pointer to shader function -SWR_FUNC(void, SwrSetHsFunc, HANDLE hContext, PFN_HS_FUNC pfnFunc); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set domain shader -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pfnFunc - Pointer to shader function -SWR_FUNC(void, SwrSetDsFunc, HANDLE hContext, PFN_DS_FUNC pfnFunc); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set depth stencil state -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pState - Pointer to state. -SWR_FUNC(void, SwrSetDepthStencilState, HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pState); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set backend state -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pState - Pointer to state. -SWR_FUNC(void, SwrSetBackendState, HANDLE hContext, SWR_BACKEND_STATE* pState); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set depth bounds state -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pState - Pointer to state. -SWR_FUNC(void, SwrSetDepthBoundsState, HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pState); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set pixel shader state -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pState - Pointer to state. 
-SWR_FUNC(void, SwrSetPixelShaderState, HANDLE hContext, SWR_PS_STATE* pState); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set blend state -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pState - Pointer to state. -SWR_FUNC(void, SwrSetBlendState, HANDLE hContext, SWR_BLEND_STATE* pState); - -////////////////////////////////////////////////////////////////////////// -/// @brief Set blend function -/// @param hContext - Handle passed back from SwrCreateContext -/// @param renderTarget - render target index -/// @param pfnBlendFunc - function pointer -SWR_FUNC( - void, SwrSetBlendFunc, HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc); - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrDraw -/// @param hContext - Handle passed back from SwrCreateContext -/// @param topology - Specifies topology for draw. -/// @param startVertex - Specifies start vertex in vertex buffer for draw. -/// @param primCount - Number of vertices. -SWR_FUNC(void, - SwrDraw, - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t startVertex, - uint32_t primCount); - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrDrawInstanced -/// @param hContext - Handle passed back from SwrCreateContext -/// @param topology - Specifies topology for draw. -/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. -/// @param numInstances - How many instances to render. -/// @param startVertex - Specifies start vertex for draw. 
(vertex data) -/// @param startInstance - Which instance to start sequentially fetching from in each buffer -/// (instanced data) -SWR_FUNC(void, - SwrDrawInstanced, - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numVertsPerInstance, - uint32_t numInstances, - uint32_t startVertex, - uint32_t startInstance); - -////////////////////////////////////////////////////////////////////////// -/// @brief DrawIndexed -/// @param hContext - Handle passed back from SwrCreateContext -/// @param topology - Specifies topology for draw. -/// @param numIndices - Number of indices to read sequentially from index buffer. -/// @param indexOffset - Starting index into index buffer. -/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. -SWR_FUNC(void, - SwrDrawIndexed, - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numIndices, - uint32_t indexOffset, - int32_t baseVertex); - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrDrawIndexedInstanced -/// @param hContext - Handle passed back from SwrCreateContext -/// @param topology - Specifies topology for draw. -/// @param numIndices - Number of indices to read sequentially from index buffer. -/// @param numInstances - Number of instances to render. -/// @param indexOffset - Starting index into index buffer. -/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer -/// (instanced data) -SWR_FUNC(void, - SwrDrawIndexedInstanced, - HANDLE hContext, - PRIMITIVE_TOPOLOGY topology, - uint32_t numIndices, - uint32_t numInstances, - uint32_t indexOffset, - int32_t baseVertex, - uint32_t startInstance); - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrInvalidateTiles -/// @param hContext - Handle passed back from SwrCreateContext -/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to -/// invalidate. -/// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to -/// be hottile size-aligned. -SWR_FUNC(void, - SwrInvalidateTiles, - HANDLE hContext, - uint32_t attachmentMask, - const SWR_RECT& invalidateRect); - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrDiscardRect -/// @param hContext - Handle passed back from SwrCreateContext -/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard. -/// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be -/// discarded. -SWR_FUNC(void, SwrDiscardRect, HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect); - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrDispatch -/// @param hContext - Handle passed back from SwrCreateContext -/// @param threadGroupCountX - Number of thread groups dispatched in X direction -/// @param threadGroupCountY - Number of thread groups dispatched in Y direction -/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction -SWR_FUNC(void, - SwrDispatch, - HANDLE hContext, - uint32_t threadGroupCountX, - uint32_t threadGroupCountY, - uint32_t threadGroupCountZ); - -/// @note this enum needs to be kept in sync with HOTTILE_STATE! 
-enum SWR_TILE_STATE -{ - SWR_TILE_INVALID = 0, // tile is in uninitialized state and should be loaded with surface contents - // before rendering - SWR_TILE_DIRTY = 2, // tile contains newer data than surface it represents - SWR_TILE_RESOLVED = 3, // is in sync with surface it represents -}; - -/// @todo Add a good description for what attachments are and when and why you would use the -/// different SWR_TILE_STATEs. -SWR_FUNC(void, - SwrStoreTiles, - HANDLE hContext, - uint32_t attachmentMask, - SWR_TILE_STATE postStoreTileState, - const SWR_RECT& storeRect); - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil -/// @param hContext - Handle passed back from SwrCreateContext -/// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear -/// @param renderTargetArrayIndex - the RT array index to clear -/// @param clearColor - color use for clearing render targets -/// @param z - depth value use for clearing depth buffer -/// @param stencil - stencil value used for clearing stencil buffer -/// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers -SWR_FUNC(void, - SwrClearRenderTarget, - HANDLE hContext, - uint32_t attachmentMask, - uint32_t renderTargetArrayIndex, - const float clearColor[4], - float z, - uint8_t stencil, - const SWR_RECT& clearRect); - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrSetRastState -/// @param hContext - Handle passed back from SwrCreateContext -/// @param pRastState - New SWR_RASTSTATE used for SwrDraw* commands -SWR_FUNC(void, SwrSetRastState, HANDLE hContext, const SWR_RASTSTATE* pRastState); - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrSetViewports -/// @param hContext - Handle passed back from SwrCreateContext -/// @param numViewports - number of viewports passed in -/// @param 
pViewports - Specifies extents of viewport. -/// @param pMatrices - If not specified then SWR computes a default one. -SWR_FUNC(void, - SwrSetViewports, - HANDLE hContext, - uint32_t numViewports, - const SWR_VIEWPORT* pViewports, - const SWR_VIEWPORT_MATRICES* pMatrices); - -////////////////////////////////////////////////////////////////////////// -/// @brief SwrSetScissorRects -/// @param hContext - Handle passed back from SwrCreateContext -/// @param numScissors - number of scissors passed in -/// @param pScissors - array of scissors -SWR_FUNC( - void, SwrSetScissorRects, HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors); - -////////////////////////////////////////////////////////////////////////// -/// @brief Returns a pointer to the private context state for the current -/// draw operation. This is used for external componets such as the -/// sampler. -/// -/// @note Client needs to resend private state prior to each draw call. -/// Also, SWR is responsible for the private state memory. -/// @param hContext - Handle passed back from SwrCreateContext -SWR_FUNC(void*, SwrGetPrivateContextState, HANDLE hContext); - -////////////////////////////////////////////////////////////////////////// -/// @brief Clients can use this to allocate memory for draw/dispatch -/// operations. The memory will automatically be freed once operation -/// has completed. Client can use this to allocate binding tables, -/// etc. needed for shader execution. -/// @param hContext - Handle passed back from SwrCreateContext -/// @param size - Size of allocation -/// @param align - Alignment needed for allocation. -SWR_FUNC(void*, SwrAllocDrawContextMemory, HANDLE hContext, uint32_t size, uint32_t align); - -////////////////////////////////////////////////////////////////////////// -/// @brief Enables stats counting -/// @param hContext - Handle passed back from SwrCreateContext -/// @param enable - If true then counts are incremented. 
-SWR_FUNC(void, SwrEnableStatsFE, HANDLE hContext, bool enable); - -////////////////////////////////////////////////////////////////////////// -/// @brief Enables stats counting -/// @param hContext - Handle passed back from SwrCreateContext -/// @param enable - If true then counts are incremented. -SWR_FUNC(void, SwrEnableStatsBE, HANDLE hContext, bool enable); - -////////////////////////////////////////////////////////////////////////// -/// @brief Mark end of frame - used for performance profiling -/// @param hContext - Handle passed back from SwrCreateContext -SWR_FUNC(void, SwrEndFrame, HANDLE hContext); - -////////////////////////////////////////////////////////////////////////// -/// @brief Initialize swr backend and memory internal tables -SWR_FUNC(void, SwrInit); - - -struct SWR_INTERFACE -{ - PFNSwrCreateContext pfnSwrCreateContext; - PFNSwrDestroyContext pfnSwrDestroyContext; - PFNSwrBindApiThread pfnSwrBindApiThread; - PFNSwrSaveState pfnSwrSaveState; - PFNSwrRestoreState pfnSwrRestoreState; - PFNSwrSync pfnSwrSync; - PFNSwrStallBE pfnSwrStallBE; - PFNSwrWaitForIdle pfnSwrWaitForIdle; - PFNSwrWaitForIdleFE pfnSwrWaitForIdleFE; - PFNSwrSetVertexBuffers pfnSwrSetVertexBuffers; - PFNSwrSetIndexBuffer pfnSwrSetIndexBuffer; - PFNSwrSetFetchFunc pfnSwrSetFetchFunc; - PFNSwrSetSoFunc pfnSwrSetSoFunc; - PFNSwrSetSoState pfnSwrSetSoState; - PFNSwrSetSoBuffers pfnSwrSetSoBuffers; - PFNSwrSetVertexFunc pfnSwrSetVertexFunc; - PFNSwrSetFrontendState pfnSwrSetFrontendState; - PFNSwrSetGsState pfnSwrSetGsState; - PFNSwrSetGsFunc pfnSwrSetGsFunc; - PFNSwrSetCsFunc pfnSwrSetCsFunc; - PFNSwrSetTsState pfnSwrSetTsState; - PFNSwrSetHsFunc pfnSwrSetHsFunc; - PFNSwrSetDsFunc pfnSwrSetDsFunc; - PFNSwrSetDepthStencilState pfnSwrSetDepthStencilState; - PFNSwrSetBackendState pfnSwrSetBackendState; - PFNSwrSetDepthBoundsState pfnSwrSetDepthBoundsState; - PFNSwrSetPixelShaderState pfnSwrSetPixelShaderState; - PFNSwrSetBlendState pfnSwrSetBlendState; - PFNSwrSetBlendFunc 
pfnSwrSetBlendFunc; - PFNSwrDraw pfnSwrDraw; - PFNSwrDrawInstanced pfnSwrDrawInstanced; - PFNSwrDrawIndexed pfnSwrDrawIndexed; - PFNSwrDrawIndexedInstanced pfnSwrDrawIndexedInstanced; - PFNSwrInvalidateTiles pfnSwrInvalidateTiles; - PFNSwrDiscardRect pfnSwrDiscardRect; - PFNSwrDispatch pfnSwrDispatch; - PFNSwrStoreTiles pfnSwrStoreTiles; - PFNSwrClearRenderTarget pfnSwrClearRenderTarget; - PFNSwrSetRastState pfnSwrSetRastState; - PFNSwrSetViewports pfnSwrSetViewports; - PFNSwrSetScissorRects pfnSwrSetScissorRects; - PFNSwrGetPrivateContextState pfnSwrGetPrivateContextState; - PFNSwrAllocDrawContextMemory pfnSwrAllocDrawContextMemory; - PFNSwrEnableStatsFE pfnSwrEnableStatsFE; - PFNSwrEnableStatsBE pfnSwrEnableStatsBE; - PFNSwrEndFrame pfnSwrEndFrame; - PFNSwrInit pfnSwrInit; -}; - -extern "C" { -typedef void(SWR_API* PFNSwrGetInterface)(SWR_INTERFACE& out_funcs); -SWR_VISIBLE void SWR_API SwrGetInterface(SWR_INTERFACE& out_funcs); -} - -#endif diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h deleted file mode 100644 index 831617c213f..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/arena.h +++ /dev/null @@ -1,490 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file arena.h - * - * @brief Arena memory manager - * The arena is convenient and fast for managing allocations for any of - * our allocations that are associated with operations and can all be freed - * once when their operation has completed. Allocations are cheap since - * most of the time its simply an increment of an offset. Also, no need to - * free individual allocations. All of the arena memory can be freed at once. - * - ******************************************************************************/ -#pragma once - -#include <mutex> -#include <algorithm> -#include <atomic> -#include "core/utils.h" - -static const size_t ARENA_BLOCK_ALIGN = 64; - -struct ArenaBlock -{ - size_t blockSize = 0; - ArenaBlock* pNext = nullptr; -}; -static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, "Increase BLOCK_ALIGN size"); - -class DefaultAllocator -{ -public: - ArenaBlock* AllocateAligned(size_t size, size_t align) - { - SWR_ASSUME_ASSERT(size >= sizeof(ArenaBlock)); - - ArenaBlock* p = new (AlignedMalloc(size, align)) ArenaBlock(); - p->blockSize = size; - return p; - } - - void Free(ArenaBlock* pMem) - { - if (pMem) - { - SWR_ASSUME_ASSERT(pMem->blockSize < size_t(0xdddddddd)); - AlignedFree(pMem); - } - } -}; - -// Caching Allocator for Arena -template <uint32_t NumBucketsT = 8, uint32_t StartBucketBitT = 12> -struct CachingAllocatorT : DefaultAllocator -{ - ArenaBlock* AllocateAligned(size_t size, size_t align) - { - SWR_ASSUME_ASSERT(size >= sizeof(ArenaBlock)); - 
SWR_ASSUME_ASSERT(size <= uint32_t(-1)); - - uint32_t bucket = GetBucketId(size); - - { - // search cached blocks - std::lock_guard<std::mutex> l(m_mutex); - ArenaBlock* pPrevBlock = &m_cachedBlocks[bucket]; - ArenaBlock* pBlock = SearchBlocks(pPrevBlock, size, align); - - if (pBlock) - { - m_cachedSize -= pBlock->blockSize; - if (pBlock == m_pLastCachedBlocks[bucket]) - { - m_pLastCachedBlocks[bucket] = pPrevBlock; - } - } - else - { - pPrevBlock = &m_oldCachedBlocks[bucket]; - pBlock = SearchBlocks(pPrevBlock, size, align); - - if (pBlock) - { - m_oldCachedSize -= pBlock->blockSize; - if (pBlock == m_pOldLastCachedBlocks[bucket]) - { - m_pOldLastCachedBlocks[bucket] = pPrevBlock; - } - } - } - - if (pBlock) - { - assert(pPrevBlock && pPrevBlock->pNext == pBlock); - pPrevBlock->pNext = pBlock->pNext; - pBlock->pNext = nullptr; - - return pBlock; - } - - m_totalAllocated += size; - -#if 0 - { - static uint32_t count = 0; - char buf[128]; - sprintf_s(buf, "Arena Alloc %d 0x%llx bytes - 0x%llx total\n", ++count, uint64_t(size), uint64_t(m_totalAllocated)); - OutputDebugStringA(buf); - } -#endif - } - - if (bucket && bucket < (CACHE_NUM_BUCKETS - 1)) - { - // Make all blocks in this bucket the same size - size = size_t(1) << (bucket + 1 + CACHE_START_BUCKET_BIT); - } - - return this->DefaultAllocator::AllocateAligned(size, align); - } - - void Free(ArenaBlock* pMem) - { - if (pMem) - { - std::unique_lock<std::mutex> l(m_mutex); - InsertCachedBlock(GetBucketId(pMem->blockSize), pMem); - } - } - - void FreeOldBlocks() - { - if (!m_cachedSize) - { - return; - } - std::lock_guard<std::mutex> l(m_mutex); - - bool doFree = (m_oldCachedSize > MAX_UNUSED_SIZE); - - for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i) - { - if (doFree) - { - ArenaBlock* pBlock = m_oldCachedBlocks[i].pNext; - while (pBlock) - { - ArenaBlock* pNext = pBlock->pNext; - m_oldCachedSize -= pBlock->blockSize; - m_totalAllocated -= pBlock->blockSize; - this->DefaultAllocator::Free(pBlock); - pBlock = 
pNext; - } - m_oldCachedBlocks[i].pNext = nullptr; - m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i]; - } - - if (m_pLastCachedBlocks[i] != &m_cachedBlocks[i]) - { - if (i && i < (CACHE_NUM_BUCKETS - 1)) - { - // We know that all blocks are the same size. - // Just move the list over. - m_pLastCachedBlocks[i]->pNext = m_oldCachedBlocks[i].pNext; - m_oldCachedBlocks[i].pNext = m_cachedBlocks[i].pNext; - m_cachedBlocks[i].pNext = nullptr; - if (m_pOldLastCachedBlocks[i]->pNext) - { - m_pOldLastCachedBlocks[i] = m_pLastCachedBlocks[i]; - } - m_pLastCachedBlocks[i] = &m_cachedBlocks[i]; - } - else - { - // The end buckets can have variable sized lists. - // Insert each block based on size - ArenaBlock* pBlock = m_cachedBlocks[i].pNext; - while (pBlock) - { - ArenaBlock* pNext = pBlock->pNext; - pBlock->pNext = nullptr; - m_cachedSize -= pBlock->blockSize; - InsertCachedBlock<true>(i, pBlock); - pBlock = pNext; - } - - m_pLastCachedBlocks[i] = &m_cachedBlocks[i]; - m_cachedBlocks[i].pNext = nullptr; - } - } - } - - m_oldCachedSize += m_cachedSize; - m_cachedSize = 0; - } - - CachingAllocatorT() - { - for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i) - { - m_pLastCachedBlocks[i] = &m_cachedBlocks[i]; - m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i]; - } - } - - ~CachingAllocatorT() - { - // Free all cached blocks - for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i) - { - ArenaBlock* pBlock = m_cachedBlocks[i].pNext; - while (pBlock) - { - ArenaBlock* pNext = pBlock->pNext; - this->DefaultAllocator::Free(pBlock); - pBlock = pNext; - } - pBlock = m_oldCachedBlocks[i].pNext; - while (pBlock) - { - ArenaBlock* pNext = pBlock->pNext; - this->DefaultAllocator::Free(pBlock); - pBlock = pNext; - } - } - } - -private: - static uint32_t GetBucketId(size_t blockSize) - { - uint32_t bucketId = 0; - -#if defined(BitScanReverseSizeT) - BitScanReverseSizeT((unsigned long*)&bucketId, (blockSize - 1) >> CACHE_START_BUCKET_BIT); - bucketId = std::min<uint32_t>(bucketId, 
CACHE_NUM_BUCKETS - 1); -#endif - - return bucketId; - } - - template <bool OldBlockT = false> - void InsertCachedBlock(uint32_t bucketId, ArenaBlock* pNewBlock) - { - SWR_ASSUME_ASSERT(bucketId < CACHE_NUM_BUCKETS); - - ArenaBlock* pPrevBlock = - OldBlockT ? &m_oldCachedBlocks[bucketId] : &m_cachedBlocks[bucketId]; - ArenaBlock* pBlock = pPrevBlock->pNext; - - while (pBlock) - { - if (pNewBlock->blockSize >= pBlock->blockSize) - { - // Insert here - break; - } - pPrevBlock = pBlock; - pBlock = pBlock->pNext; - } - - // Insert into list - SWR_ASSUME_ASSERT(pPrevBlock); - pPrevBlock->pNext = pNewBlock; - pNewBlock->pNext = pBlock; - - if (OldBlockT) - { - if (m_pOldLastCachedBlocks[bucketId] == pPrevBlock) - { - m_pOldLastCachedBlocks[bucketId] = pNewBlock; - } - - m_oldCachedSize += pNewBlock->blockSize; - } - else - { - if (m_pLastCachedBlocks[bucketId] == pPrevBlock) - { - m_pLastCachedBlocks[bucketId] = pNewBlock; - } - - m_cachedSize += pNewBlock->blockSize; - } - } - - static ArenaBlock* SearchBlocks(ArenaBlock*& pPrevBlock, size_t blockSize, size_t align) - { - ArenaBlock* pBlock = pPrevBlock->pNext; - ArenaBlock* pPotentialBlock = nullptr; - ArenaBlock* pPotentialPrev = nullptr; - - while (pBlock) - { - if (pBlock->blockSize >= blockSize) - { - if (pBlock == AlignUp(pBlock, align)) - { - if (pBlock->blockSize == blockSize) - { - // Won't find a better match - break; - } - - // We could use this as it is larger than we wanted, but - // continue to search for a better match - pPotentialBlock = pBlock; - pPotentialPrev = pPrevBlock; - } - } - else - { - // Blocks are sorted by size (biggest first) - // So, if we get here, there are no blocks - // large enough, fall through to allocation. 
- pBlock = nullptr; - break; - } - - pPrevBlock = pBlock; - pBlock = pBlock->pNext; - } - - if (!pBlock) - { - // Couldn't find an exact match, use next biggest size - pBlock = pPotentialBlock; - pPrevBlock = pPotentialPrev; - } - - return pBlock; - } - - // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ... - static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT; - static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT; - static const size_t MAX_UNUSED_SIZE = sizeof(MEGABYTE); - - ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS]; - ArenaBlock* m_pLastCachedBlocks[CACHE_NUM_BUCKETS]; - ArenaBlock m_oldCachedBlocks[CACHE_NUM_BUCKETS]; - ArenaBlock* m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS]; - std::mutex m_mutex; - - size_t m_totalAllocated = 0; - - size_t m_cachedSize = 0; - size_t m_oldCachedSize = 0; -}; -typedef CachingAllocatorT<> CachingAllocator; - -template <typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)> -class TArena -{ -public: - TArena(T& in_allocator) : m_allocator(in_allocator) {} - TArena() : m_allocator(m_defAllocator) {} - ~TArena() { Reset(true); } - - void* AllocAligned(size_t size, size_t align) - { - if (0 == size) - { - return nullptr; - } - - SWR_ASSERT(align <= ARENA_BLOCK_ALIGN); - - if (m_pCurBlock) - { - ArenaBlock* pCurBlock = m_pCurBlock; - size_t offset = AlignUp(m_offset, align); - - if ((offset + size) <= pCurBlock->blockSize) - { - void* pMem = PtrAdd(pCurBlock, offset); - m_offset = offset + size; - return pMem; - } - - // Not enough memory in this block, fall through to allocate - // a new block - } - - static const size_t ArenaBlockSize = BlockSizeT; - size_t blockSize = std::max(size + ARENA_BLOCK_ALIGN, ArenaBlockSize); - - // Add in one BLOCK_ALIGN unit to store ArenaBlock in. - blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN); - - ArenaBlock* pNewBlock = m_allocator.AllocateAligned( - blockSize, ARENA_BLOCK_ALIGN); // Arena blocks are always simd byte aligned. 
- SWR_ASSERT(pNewBlock != nullptr); - - if (pNewBlock != nullptr) - { - m_offset = ARENA_BLOCK_ALIGN; - pNewBlock->pNext = m_pCurBlock; - - m_pCurBlock = pNewBlock; - } - - return AllocAligned(size, align); - } - - void* Alloc(size_t size) { return AllocAligned(size, 1); } - - void* AllocAlignedSync(size_t size, size_t align) - { - void* pAlloc = nullptr; - - m_mutex.lock(); - pAlloc = AllocAligned(size, align); - m_mutex.unlock(); - - return pAlloc; - } - - void* AllocSync(size_t size) - { - void* pAlloc = nullptr; - - m_mutex.lock(); - pAlloc = Alloc(size); - m_mutex.unlock(); - - return pAlloc; - } - - void Reset(bool removeAll = false) - { - m_offset = ARENA_BLOCK_ALIGN; - - if (m_pCurBlock) - { - ArenaBlock* pUsedBlocks = m_pCurBlock->pNext; - m_pCurBlock->pNext = nullptr; - while (pUsedBlocks) - { - ArenaBlock* pBlock = pUsedBlocks; - pUsedBlocks = pBlock->pNext; - - m_allocator.Free(pBlock); - } - - if (removeAll) - { - m_allocator.Free(m_pCurBlock); - m_pCurBlock = nullptr; - } - } - } - - bool IsEmpty() - { - return (m_pCurBlock == nullptr) || - (m_offset == ARENA_BLOCK_ALIGN && m_pCurBlock->pNext == nullptr); - } - -private: - ArenaBlock* m_pCurBlock = nullptr; - size_t m_offset = ARENA_BLOCK_ALIGN; - - /// @note Mutex is only used by sync allocation functions. - std::mutex m_mutex; - - DefaultAllocator m_defAllocator; - T& m_allocator; -}; - -using StdArena = TArena<DefaultAllocator>; -using CachingArena = TArena<CachingAllocator>; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp deleted file mode 100644 index bb9d6f7dc52..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ /dev/null @@ -1,420 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file backend.cpp - * - * @brief Backend handles rasterization, pixel shading and output merger - * operations. - * - ******************************************************************************/ - -#include <smmintrin.h> - -#include "backend.h" -#include "backend_impl.h" -#include "tilemgr.h" -#include "memory/tilingtraits.h" -#include "core/multisample.h" -#include "backends/gen_BackendPixelRate.hpp" - -#include <algorithm> - - -////////////////////////////////////////////////////////////////////////// -/// @brief Process compute work. -/// @param pDC - pointer to draw context (dispatch). -/// @param workerId - The unique worker ID that is assigned to this thread. -/// @param threadGroupId - the linear index for the thread group within the dispatch. 
-void ProcessComputeBE(DRAW_CONTEXT* pDC, - uint32_t workerId, - uint32_t threadGroupId, - void*& pSpillFillBuffer, - void*& pScratchSpace) -{ - SWR_CONTEXT* pContext = pDC->pContext; - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEDispatch, pDC->drawId); - - const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData(); - SWR_ASSERT(pTaskData != nullptr); - - // Ensure spill fill memory has been allocated. - size_t spillFillSize = pDC->pState->state.totalSpillFillSize; - if (spillFillSize && pSpillFillBuffer == nullptr) - { - pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD16_BYTES); - } - - size_t scratchSpaceSize = - pDC->pState->state.scratchSpaceSizePerWarp * pDC->pState->state.scratchSpaceNumWarps; - if (scratchSpaceSize && pScratchSpace == nullptr) - { - pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD16_BYTES); - } - - const API_STATE& state = GetApiState(pDC); - - SWR_CS_CONTEXT csContext{0}; - csContext.tileCounter = threadGroupId; - csContext.dispatchDims[0] = pTaskData->threadGroupCountX; - csContext.dispatchDims[1] = pTaskData->threadGroupCountY; - csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; - csContext.pTGSM = pContext->ppScratch[workerId]; - csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer; - csContext.pScratchSpace = (uint8_t*)pScratchSpace; - csContext.scratchSpacePerWarp = pDC->pState->state.scratchSpaceSizePerWarp; - - state.pfnCsFunc(GetPrivateState(pDC), - pContext->threadPool.pThreadData[workerId].pWorkerPrivateData, - &csContext); - - UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup); - AR_EVENT(CSStats((HANDLE)&csContext.stats)); - - RDTSC_END(pDC->pContext->pBucketMgr, BEDispatch, 1); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Process shutdown. -/// @param pDC - pointer to draw context (dispatch). -/// @param workerId - The unique worker ID that is assigned to this thread. 
/// @param macroTile - not used by this function.
/// @param pUserData - not used by this function.
void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
    // Dummy function
}

//////////////////////////////////////////////////////////////////////////
/// @brief Process a sync work item. Does no work beyond validating its input.
/// @param pDC - pointer to draw context; not used here.
/// @param workerId - worker ID; not used here.
/// @param macroTile - macrotile ID; asserted to decode to tile (0, 0).
/// @param pUserData - not used here.
void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
    uint32_t x, y;
    MacroTileMgr::getTileIndices(macroTile, x, y);
    SWR_ASSERT(x == 0 && y == 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Store one attachment of one macrotile: performs a pending clear first, calls
///        the context's store-tile callback when required, then updates the hot tile state.
/// @param pDC - pointer to draw context.
/// @param workerId - index into the thread pool's per-thread data.
/// @param macroTile - macrotile ID; decoded into x/y tile indices.
/// @param pDesc - store descriptor (supplies rect and postStoreTileState).
/// @param attachment - render target attachment to store.
void ProcessStoreTileBE(DRAW_CONTEXT*               pDC,
                        uint32_t                    workerId,
                        uint32_t                    macroTile,
                        STORE_TILES_DESC*           pDesc,
                        SWR_RENDERTARGET_ATTACHMENT attachment)
{
    SWR_CONTEXT* pContext           = pDC->pContext;
    HANDLE       hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEStoreTiles, pDC->drawId);

    // Pick the hot tile format that matches the attachment class.
    SWR_FORMAT srcFormat;
    switch (attachment)
    {
    case SWR_ATTACHMENT_COLOR0:
    case SWR_ATTACHMENT_COLOR1:
    case SWR_ATTACHMENT_COLOR2:
    case SWR_ATTACHMENT_COLOR3:
    case SWR_ATTACHMENT_COLOR4:
    case SWR_ATTACHMENT_COLOR5:
    case SWR_ATTACHMENT_COLOR6:
    case SWR_ATTACHMENT_COLOR7:
        srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
        break;
    case SWR_ATTACHMENT_DEPTH:
        srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT;
        break;
    case SWR_ATTACHMENT_STENCIL:
        srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT;
        break;
    default:
        // Unknown attachments are reported and then treated as color.
        SWR_INVALID("Unknown attachment: %d", attachment);
        srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
        break;
    }

    uint32_t x, y;
    MacroTileMgr::getTileIndices(macroTile, x, y);

    // Only need to store the hottile if it's been rendered to...
    HOTTILE* pHotTile =
        pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
    if (pHotTile)
    {
        // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
        if (pHotTile->state == HOTTILE_CLEAR)
        {
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC,
                          hWorkerPrivateData,
                          attachment,
                          macroTile,
                          pHotTile->renderTargetArrayIndex,
                          pHotTile->clearData,
                          pDesc->rect);
        }

        // Store when the tile is dirty, or when the requested post-store state is DIRTY.
        if (pHotTile->state == HOTTILE_DIRTY ||
            pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
        {
            // Destination position is the macrotile index scaled by the macrotile dimensions.
            int32_t destX = KNOB_MACROTILE_X_DIM * x;
            int32_t destY = KNOB_MACROTILE_Y_DIM * y;

            pContext->pfnStoreTile(pDC,
                                   hWorkerPrivateData,
                                   srcFormat,
                                   attachment,
                                   destX,
                                   destY,
                                   pHotTile->renderTargetArrayIndex,
                                   pHotTile->pBuffer);
        }

        // Move to the requested post-store state, except that a RESOLVED tile is never
        // moved back to DIRTY.
        if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
        {
            if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY &&
                  pHotTile->state == HOTTILE_RESOLVED))
            {
                pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
            }
        }
    }
    RDTSC_END(pDC->pContext->pBucketMgr, BEStoreTiles, 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Store every attachment selected in the descriptor's attachmentMask for one
///        macrotile by calling ProcessStoreTileBE once per set bit.
/// @param pDC - pointer to draw context.
/// @param workerId - worker ID, forwarded to ProcessStoreTileBE.
/// @param macroTile - macrotile ID, forwarded to ProcessStoreTileBE.
/// @param pData - points to a STORE_TILES_DESC.
void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
    STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pData;

    unsigned long rt   = 0;
    uint32_t      mask = pDesc->attachmentMask;
    while (_BitScanForward(&rt, mask))
    {
        // Clear the bit just found so the loop terminates.
        mask &= ~(1 << rt);
        ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Set the hot tile state of every attachment selected in the descriptor's
///        attachmentMask to the descriptor's newTileState.
/// @param pDC - pointer to draw context.
/// @param workerId - worker ID; not used here.
/// @param macroTile - macrotile ID whose hot tiles are updated.
/// @param pData - points to a DISCARD_INVALIDATE_TILES_DESC.
void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
                                     uint32_t      workerId,
                                     uint32_t      macroTile,
                                     void*         pData)
{
    DISCARD_INVALIDATE_TILES_DESC* pDesc    = (DISCARD_INVALIDATE_TILES_DESC*)pData;
    SWR_CONTEXT*                   pContext = pDC->pContext;

    const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);

    for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
    {
        if (pDesc->attachmentMask & (1 << i))
        {
            // pDesc->createNewTiles is passed through as the "create" argument.
            HOTTILE* pHotTile =
                pContext->pHotTileMgr->GetHotTileNoLoad(pContext,
                                                        pDC,
                                                        macroTile,
                                                        (SWR_RENDERTARGET_ATTACHMENT)i,
                                                        pDesc->createNewTiles,
                                                        numSamples);
            if (pHotTile)
            {
                HOTTILE_STATE newState = (HOTTILE_STATE)pDesc->newTileState;;
                if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_CLEAR)
                {
                    if (newState == HOTTILE_INVALID)
                    {
                        // This is OK for APIs that explicitly allow discards
                        // (for e.g. depth / stencil data)
                        //SWR_INVALID("Discarding valid data!");
                    }
                }
                pHotTile->state = newState;
            }
        }
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Backend used when no pixel shader runs: for each SIMD tile and each sample
///        enabled in the blend state's sample mask it interpolates depth, applies the
///        optional depth-bounds and user-clip tests, then runs the depth/stencil test
///        and write.
/// @tparam sampleCountT - not referenced in the body; the samples processed come from
///         state.blendState.sampleMask.
/// @param pDC - pointer to draw context.
/// @param workerId - worker ID. NOTE(review): not named in the body; presumably consumed
///        by the stat/event macros - confirm.
/// @param x - x position of the raster tile.
/// @param y - y position of the raster tile.
/// @param work - triangle descriptor; its per-sample coverageMask is shifted as it is consumed.
/// @param renderBuffers - source of the depth and stencil buffer pointers.
template <uint32_t sampleCountT>
void BackendNullPS(DRAW_CONTEXT*        pDC,
                   uint32_t             workerId,
                   uint32_t             x,
                   uint32_t             y,
                   SWR_TRIANGLE_DESC&   work,
                   RenderOutputBuffers& renderBuffers)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BENullBackend, pDC->drawId);
    ///@todo: handle center multisample pattern
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    // No color buffers are requested (NULL); only depth and stencil are used.
    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);

    SWR_PS_CONTEXT psContext;
    // skip SetupPixelShaderContext(&psContext, ...); // not needed here

    RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);

    simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    const simdscalar           dy        = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
            // iterate over active samples
            unsigned long sample     = 0;
            uint32_t      sampleMask = state.blendState.sampleMask;
            while (_BitScanForward(&sample, sampleMask))
            {
                sampleMask &= ~(1 << sample);

                // Coverage bits for the current SIMD tile of this sample.
                simdmask coverageMask = work.coverageMask[sample] & MASK;

                if (coverageMask)
                {
                    // offset depth/stencil buffers current sample
                    uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
                    uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);

                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                    {
                        // The stored depth is loaded as floats below, hence the format check.
                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
                                      "Unsupported depth hot tile format");

                        const simdscalar z =
                            _simd_load_ps(reinterpret_cast<const float*>(pDepthSample));

                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                    }

                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);

                    // calculate per sample positions
                    psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
                    psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));

                    CalcSampleBarycentrics(coeffs, psContext);

                    // interpolate and quantize z
                    psContext.vZ = vplaneps(coeffs.vZa,
                                            coeffs.vZb,
                                            coeffs.vZc,
                                            psContext.vI.sample,
                                            psContext.vJ.sample);
                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);

                    RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);

                    // interpolate user clip distance if available
                    if (state.backendState.clipDistanceMask)
                    {
                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
                                                             work.pUserClipBuffer,
                                                             psContext.vI.sample,
                                                             psContext.vJ.sample);
                    }

                    // Stencil pass mask starts as the coverage; DepthStencilTest updates it
                    // through the pointer.
                    simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
                    simdscalar stencilPassMask = vCoverageMask;

                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
                    simdscalar depthPassMask = DepthStencilTest(&state,
                                                                work.triFlags.frontFacing,
                                                                work.triFlags.viewportIndex,
                                                                psContext.vZ,
                                                                pDepthSample,
                                                                vCoverageMask,
                                                                pStencilSample,
                                                                &stencilPassMask);
                    AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask),
                                                         _simd_movemask_ps(stencilPassMask),
                                                         _simd_movemask_ps(vCoverageMask)));
                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                      &state.depthStencilState,
                                      work.triFlags.frontFacing,
                                      psContext.vZ,
                                      pDepthSample,
                                      depthPassMask,
                                      vCoverageMask,
                                      pStencilSample,
                                      stencilPassMask);
                    RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);

                    // Count the lanes that passed the depth test.
                    uint32_t statMask  = _simd_movemask_ps(depthPassMask);
                    uint32_t statCount = _mm_popcnt_u32(statMask);
                    UPDATE_STAT_BE(DepthPassCount, statCount);
                }

            Endtile:
                ATTR_UNUSED;
                // Drop the bits just consumed so the next SIMD tile's coverage is in the low bits.
                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }

            // Advance both buffers by one SIMD tile.
            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
            pStencilBuffer +=
                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;

            vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
        }

        vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
    }

    RDTSC_END(pDC->pContext->pBucketMgr, BENullBackend, 0);
}

// Function tables. gClearTilesTable is indexed by SWR_FORMAT; gBackendNullPs is filled by
// InitBackendFuncTables() below. The remaining tables are filled by the Init* helpers
// that InitBackendFuncTables() calls.
PFN_CLEAR_TILES  gClearTilesTable[NUM_SWR_FORMATS] = {};
PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid
                                     [2]                           // canEarlyZ
    = {};
PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
                                       [SWR_INPUT_COVERAGE_COUNT][2]   // centroid
                                       [2]                             // forcedSampleCount
                                       [2]                             // canEarlyZ
    = {};
PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT]
                                        [2] // centroid
                                        [2] // canEarlyZ
    = {};

//////////////////////////////////////////////////////////////////////////
/// @brief Populate the backend function tables: delegates the pixel-rate, single-sample
///        and sample-rate tables to their Init* helpers, then registers one BackendNullPS
///        instantiation per multisample count.
void InitBackendFuncTables()
{
    InitBackendPixelRate();
    InitBackendSingleFuncTable(gBackendSingleSample);
    InitBackendSampleFuncTable(gBackendSampleRateTable);

    gBackendNullPs[SWR_MULTISAMPLE_1X]  = &BackendNullPS<SWR_MULTISAMPLE_1X>;
    gBackendNullPs[SWR_MULTISAMPLE_2X]  = &BackendNullPS<SWR_MULTISAMPLE_2X>;
    gBackendNullPs[SWR_MULTISAMPLE_4X]  = &BackendNullPS<SWR_MULTISAMPLE_4X>;
    gBackendNullPs[SWR_MULTISAMPLE_8X]  = &BackendNullPS<SWR_MULTISAMPLE_8X>;
    gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS<SWR_MULTISAMPLE_16X>;
}
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
deleted file mode 100644
index c9eb6c259e3..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ /dev/null
@@ -1,70 +0,0 @@
/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file backend.h
 *
 * @brief Backend handles rasterization, pixel shading and output merger
 *        operations.
 *
 ******************************************************************************/
#pragma once

#include "common/os.h"
#include "core/context.h"
#include "core/multisample.h"
#include "depthstencil.h"
#include "rdtsc_core.h"

// Backend work-item entry points. Apart from ProcessComputeBE they share the signature
// (draw context, worker ID, macrotile ID, opaque per-item data pointer).

// Runs one compute thread group. The two buffer pointers are passed by reference.
void ProcessComputeBE(DRAW_CONTEXT* pDC,
                      uint32_t      workerId,
                      uint32_t      threadGroupId,
                      void*&        pSpillFillBuffer,
                      void*&        pScratchSpace);
void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
                                     uint32_t      workerId,
                                     uint32_t      macroTile,
                                     void*         pData);
void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);

// Signature of a per-format macrotile clear routine. The three unnamed parameters are, in
// order, the macrotile ID, the render target array index and the four 32-bit clear values
// (matching ClearMacroTile in backend_clear.cpp).
typedef void (*PFN_CLEAR_TILES)(DRAW_CONTEXT*,
                                HANDLE                      hWorkerData,
                                SWR_RENDERTARGET_ATTACHMENT rt,
                                uint32_t,
                                uint32_t,
                                uint32_t[4],
                                const SWR_RECT& rect);

// Function tables defined in backend.cpp. The trailing comments name what each array
// dimension is indexed by.
extern PFN_CLEAR_TILES  gClearTilesTable[NUM_SWR_FORMATS];
extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid
                                            [2];                          // canEarlyZ
extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
                                              [SWR_INPUT_COVERAGE_COUNT][2]   // centroid
                                              [2]                             // forcedSampleCount
                                              [2]                             // canEarlyZ
    ;
extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
                                               [SWR_INPUT_COVERAGE_COUNT][2] // centroid
                                               [2];                          // canEarlyZ
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
deleted file mode 100644
index e772306faec..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
+++ /dev/null
@@ -1,308 +0,0 @@
/****************************************************************************
 *
Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file backend_clear.cpp - * - * @brief Backend handles rasterization, pixel shading and output merger - * operations. 
- * - ******************************************************************************/ - -#include <smmintrin.h> - -#include "backend.h" -#include "backend_impl.h" -#include "tilemgr.h" -#include "memory/tilingtraits.h" -#include "core/multisample.h" - -#include <algorithm> - -template <SWR_FORMAT format> -void ClearRasterTile(uint8_t* pTileBuffer, simd16vector& value) -{ - auto lambda = [&](int32_t comp) - { - FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]); - - pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8); - }; - - const uint32_t numIter = - (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM); - - for (uint32_t i = 0; i < numIter; ++i) - { - UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda); - } -} - -template <SWR_FORMAT format> -INLINE void ClearMacroTile(DRAW_CONTEXT* pDC, - HANDLE hWorkerPrivateData, - SWR_RENDERTARGET_ATTACHMENT rt, - uint32_t macroTile, - uint32_t renderTargetArrayIndex, - uint32_t clear[4], - const SWR_RECT& rect) -{ - // convert clear color to hottile format - // clear color is in RGBA float/uint32 - - simd16vector vClear; - for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp) - { - simd16scalar vComp = _simd16_load1_ps((const float*)&clear[comp]); - - if (FormatTraits<format>::isNormalized(comp)) - { - vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp))); - vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp)); - } - vComp = FormatTraits<format>::pack(comp, vComp); - - vClear.v[FormatTraits<format>::swizzle(comp)] = vComp; - } - - uint32_t tileX, tileY; - MacroTileMgr::getTileIndices(macroTile, tileX, tileY); - - // Init to full macrotile - SWR_RECT clearTile = { - KNOB_MACROTILE_X_DIM * int32_t(tileX), - KNOB_MACROTILE_Y_DIM * int32_t(tileY), - KNOB_MACROTILE_X_DIM * int32_t(tileX + 1), - KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1), - }; - - // intersect with clear rect - clearTile &= rect; - - // 
translate to local hottile origin - clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, - -int32_t(tileY) * KNOB_MACROTILE_Y_DIM); - - // Make maximums inclusive (needed for convert to raster tiles) - clearTile.xmax -= 1; - clearTile.ymax -= 1; - - // convert to raster tiles - clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT); - clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT); - clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT); - clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT); - - const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); - // compute steps between raster tile samples / raster tiles / macro tile rows - const uint32_t rasterTileSampleStep = - KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8; - const uint32_t rasterTileStep = - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples; - const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep; - const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8); - - HOTTILE* pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, - pDC, - hWorkerPrivateData, - macroTile, - rt, - true, - numSamples, - renderTargetArrayIndex); - uint32_t rasterTileStartOffset = - (ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp>>( - pitch, clearTile.xmin, clearTile.ymin)) * - numSamples; - uint8_t* pRasterTileRow = - pHotTile->pBuffer + - rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, - // FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples; - - // loop over all raster tiles in the current hot tile - for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y) - { - uint8_t* pRasterTile = pRasterTileRow; - for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x) - { - for (int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++) - { - ClearRasterTile<format>(pRasterTile, vClear); - pRasterTile += rasterTileSampleStep; - } - } - pRasterTileRow += 
macroTileRowStep; - } - - pHotTile->state = HOTTILE_DIRTY; -} - -void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData) -{ - SWR_CONTEXT* pContext = pDC->pContext; - HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - - if (KNOB_FAST_CLEAR) - { - CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData; - SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount; - uint32_t numSamples = GetNumSamples(sampleCount); - - SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason. - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEClear, pDC->drawId); - - if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR) - { - unsigned long rt = 0; - uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; - while (_BitScanForward(&rt, mask)) - { - mask &= ~(1 << rt); - - HOTTILE* pHotTile = - pContext->pHotTileMgr->GetHotTile(pContext, - pDC, - hWorkerPrivateData, - macroTile, - (SWR_RENDERTARGET_ATTACHMENT)rt, - true, - numSamples, - pClear->renderTargetArrayIndex); - - // All we want to do here is to mark the hot tile as being in a "needs clear" state. 
- pHotTile->clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]); - pHotTile->clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]); - pHotTile->clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]); - pHotTile->clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]); - pHotTile->state = HOTTILE_CLEAR; - } - } - - if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT) - { - HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, - pDC, - hWorkerPrivateData, - macroTile, - SWR_ATTACHMENT_DEPTH, - true, - numSamples, - pClear->renderTargetArrayIndex); - pHotTile->clearData[0] = *(uint32_t*)&pClear->clearDepth; - pHotTile->state = HOTTILE_CLEAR; - } - - if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT) - { - HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, - pDC, - hWorkerPrivateData, - macroTile, - SWR_ATTACHMENT_STENCIL, - true, - numSamples, - pClear->renderTargetArrayIndex); - - pHotTile->clearData[0] = pClear->clearStencil; - pHotTile->state = HOTTILE_CLEAR; - } - - RDTSC_END(pDC->pContext->pBucketMgr, BEClear, 1); - } - else - { - // Legacy clear - CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData; - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEClear, pDC->drawId); - - if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR) - { - uint32_t clearData[4]; - clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]); - clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]); - clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]); - clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]); - - PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT]; - SWR_ASSERT(pfnClearTiles != nullptr); - - unsigned long rt = 0; - uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; - while (_BitScanForward(&rt, mask)) - { - mask &= ~(1 << rt); - - pfnClearTiles(pDC, - hWorkerPrivateData, - (SWR_RENDERTARGET_ATTACHMENT)rt, - macroTile, - pClear->renderTargetArrayIndex, - clearData, - pClear->rect); - } - } - - if (pClear->attachmentMask 
& SWR_ATTACHMENT_DEPTH_BIT) - { - uint32_t clearData[4]; - clearData[0] = *(uint32_t*)&pClear->clearDepth; - PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT]; - SWR_ASSERT(pfnClearTiles != nullptr); - - pfnClearTiles(pDC, - hWorkerPrivateData, - SWR_ATTACHMENT_DEPTH, - macroTile, - pClear->renderTargetArrayIndex, - clearData, - pClear->rect); - } - - if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT) - { - uint32_t clearData[4]; - clearData[0] = pClear->clearStencil; - PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT]; - - pfnClearTiles(pDC, - hWorkerPrivateData, - SWR_ATTACHMENT_STENCIL, - macroTile, - pClear->renderTargetArrayIndex, - clearData, - pClear->rect); - } - - RDTSC_END(pDC->pContext->pBucketMgr, BEClear, 1); - } -} - -void InitClearTilesTable() -{ - memset(gClearTilesTable, 0, sizeof(gClearTilesTable)); - - gClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>; - gClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>; - gClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>; - gClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>; - gClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>; -} diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h deleted file mode 100644 index 868419c3e4f..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h +++ /dev/null @@ -1,1300 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
 - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file backend_impl.h - * - * @brief Backend handles rasterization, pixel shading and output merger - * operations. 
- * - ******************************************************************************/ -#pragma once - -#include "tilemgr.h" -#include "state.h" -#include "context.h" - - -void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]); -void InitBackendSampleFuncTable( - PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]); - -static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, - SWR_PS_CONTEXT& psContext); - - -enum SWR_BACKEND_FUNCS -{ - SWR_BACKEND_SINGLE_SAMPLE, - SWR_BACKEND_MSAA_PIXEL_RATE, - SWR_BACKEND_MSAA_SAMPLE_RATE, - SWR_BACKEND_FUNCS_MAX, -}; - -#if KNOB_SIMD_WIDTH == 8 -static const __m256 vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5}; -static const __m256 vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5}; -static const __m256 vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; -static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; -#define MASK 0xff -#endif - -static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, - float* pUserClipBuffer, - simdscalar const& vI, - simdscalar const& vJ) -{ - simdscalar vClipMask = _simd_setzero_ps(); - uint32_t numClipDistance = _mm_popcnt_u32(clipMask); - - for (uint32_t i = 0; i < numClipDistance; ++i) - { - // pull triangle clip distance values from clip buffer - simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++); - simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++); - simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++); - - // interpolate - simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ); - - // clip if interpolated clip distance is < 0 || NAN - simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ); - - vClipMask = _simd_or_ps(vClipMask, vCull); - } - - return _simd_movemask_ps(vClipMask); -} - -INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) -{ - static const uint32_t RasterTileColorOffsets[16]{ - 0, - 
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * - 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * - 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * - 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * - 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * - 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * - 15, - }; - assert(sampleNum < 16); - return RasterTileColorOffsets[sampleNum]; -} - -INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) -{ - static const uint32_t RasterTileDepthOffsets[16]{ - 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * 
FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * - 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * - 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * - 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * - 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * - 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * - 15, - }; - assert(sampleNum < 16); - return RasterTileDepthOffsets[sampleNum]; -} - -INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) -{ - static const uint32_t RasterTileStencilOffsets[16]{ - 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * 
FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 15, - }; - assert(sampleNum < 16); - return RasterTileStencilOffsets[sampleNum]; -} - -template <typename T, uint32_t InputCoverage> -struct generateInputCoverage -{ - INLINE generateInputCoverage(const uint64_t* const coverageMask, - uint32_t (&inputMask)[KNOB_SIMD_WIDTH], - const uint32_t sampleMask) - { - // will need to update for avx512 - assert(KNOB_SIMD_WIDTH == 8); - - simdscalari mask[2]; - simdscalari sampleCoverage[2]; - - if (T::bIsCenterPattern) - { - // center coverage is the same for all samples; just broadcast to the sample slots - uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); - if (T::MultisampleT::numSamples == 1) - { - sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); - } - else if (T::MultisampleT::numSamples == 2) - { - sampleCoverage[0] = - _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); - } - else if (T::MultisampleT::numSamples == 4) - { - sampleCoverage[0] = _simd_set_epi32( - 0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); - } - else if (T::MultisampleT::numSamples == 8) - { - sampleCoverage[0] = 
_simd_set1_epi32(centerCoverage); - } - else if (T::MultisampleT::numSamples == 16) - { - sampleCoverage[0] = _simd_set1_epi32(centerCoverage); - sampleCoverage[1] = _simd_set1_epi32(centerCoverage); - } - } - else - { - simdscalari src = _simd_set1_epi32(0); - simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; - - if (T::MultisampleT::numSamples == 1) - { - mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); - } - else if (T::MultisampleT::numSamples == 2) - { - mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - } - else if (T::MultisampleT::numSamples == 4) - { - mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); - } - else if (T::MultisampleT::numSamples == 8) - { - mask[0] = _simd_set1_epi32(-1); - } - else if (T::MultisampleT::numSamples == 16) - { - mask[0] = _simd_set1_epi32(-1); - mask[1] = _simd_set1_epi32(-1); - index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); - } - - // gather coverage for samples 0-7 - sampleCoverage[0] = - _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), - (const float*)coverageMask, - index0, - _mm256_castsi256_ps(mask[0]), - 8)); - if (T::MultisampleT::numSamples > 8) - { - // gather coverage for samples 8-15 - sampleCoverage[1] = - _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), - (const float*)coverageMask, - index1, - _mm256_castsi256_ps(mask[1]), - 8)); - } - } - - mask[0] = _mm256_set_epi8(-1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - 0xC, - 0x8, - 0x4, - 0x0, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - 0xC, - 0x8, - 0x4, - 0x0); - // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane - simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); - - simdscalari packedCoverage1; - if (T::MultisampleT::numSamples > 8) - { - // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit - // lane - packedCoverage1 = 
_simd_shuffle_epi8(sampleCoverage[1], mask[0]); - } - -#if (KNOB_ARCH == KNOB_ARCH_AVX) - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); - simdscalar shufRes = _mm256_shuffle_ps( - _mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - packedCoverage0 = _mm256_castps_si256( - _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); - - simdscalari packedSampleCoverage; - if (T::MultisampleT::numSamples > 8) - { - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); - shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), - _mm256_castsi256_ps(hiToLow), - _MM_SHUFFLE(1, 1, 0, 1)); - shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); - packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps( - _mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); - packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps( - _mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); - } - else - { - packedSampleCoverage = packedCoverage0; - } -#else - simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); - - simdscalari packedSampleCoverage; - if (T::MultisampleT::numSamples > 8) - { - permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); - - // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane - packedSampleCoverage = 
_mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); - } - else - { - packedSampleCoverage = packedCoverage0; - } -#endif - - for (int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) - { - // convert packed sample coverage masks into single coverage masks for all samples for - // each pixel in the 4x2 - inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); - - if (!T::bForcedSampleCount) - { - // input coverage has to be anded with sample mask if MSAA isn't forced on - inputMask[i] &= sampleMask; - } - - // shift to the next pixel in the 4x2 - packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); - } - } - - INLINE generateInputCoverage(const uint64_t* const coverageMask, - simdscalar& inputCoverage, - const uint32_t sampleMask) - { - uint32_t inputMask[KNOB_SIMD_WIDTH]; - generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask); - inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], - inputMask[6], - inputMask[5], - inputMask[4], - inputMask[3], - inputMask[2], - inputMask[1], - inputMask[0])); - } -}; - -template <typename T> -struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE> -{ - INLINE generateInputCoverage(const uint64_t* const coverageMask, - simdscalar& inputCoverage, - const uint32_t sampleMask) - { - // will need to update for avx512 - assert(KNOB_SIMD_WIDTH == 8); - simdscalari vec = _simd_set1_epi32(coverageMask[0]); - const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - vec = _simd_and_si(vec, bit); - vec = _simd_cmplt_epi32(_simd_setzero_si(), vec); - vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec); - inputCoverage = _simd_castsi_ps(vec); - } - - INLINE generateInputCoverage(const uint64_t* const coverageMask, - uint32_t (&inputMask)[KNOB_SIMD_WIDTH], - const uint32_t sampleMask) - { - uint32_t simdCoverage = (coverageMask[0] & MASK); - static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1; - for 
(int i = 0; i < KNOB_SIMD_WIDTH; i++) - { - // set all samples to covered if conservative coverage mask is set for that pixel - inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0; - } - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Centroid behaves exactly as follows : -// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center -// (even if the sample pattern does not happen to -// have a sample location there). -// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample -// index, where sample coverage is after ANDing the -// coverage with the SampleMask Rasterizer State. -// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to -// fill out 2x2 pixel stamps, the attribute is -// evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the -// pixel, then the first sample covered by the SampleMask Rasterizer State is the evaluation -// point.Otherwise (full SampleMask), the pixel center is the evaluation point. -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template <typename T> -INLINE void CalcCentroidPos(SWR_PS_CONTEXT& psContext, - const SWR_MULTISAMPLE_POS& samplePos, - const uint64_t* const coverageMask, - const uint32_t sampleMask, - simdscalar const& vXSamplePosUL, - simdscalar const& vYSamplePosUL) -{ - uint32_t inputMask[KNOB_SIMD_WIDTH]; - generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask); - - // Case (2) - partially covered pixel - - // scan for first covered sample per pixel in the 4x2 span - unsigned long sampleNum[KNOB_SIMD_WIDTH]; - (inputMask[0] > 0) ? 
(_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0); - (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0); - (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0); - (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0); - (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0); - (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0); - (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0); - (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0); - - // look up and set the sample offsets from UL pixel corner for first covered sample - simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]), - samplePos.X(sampleNum[6]), - samplePos.X(sampleNum[5]), - samplePos.X(sampleNum[4]), - samplePos.X(sampleNum[3]), - samplePos.X(sampleNum[2]), - samplePos.X(sampleNum[1]), - samplePos.X(sampleNum[0])); - - simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]), - samplePos.Y(sampleNum[6]), - samplePos.Y(sampleNum[5]), - samplePos.Y(sampleNum[4]), - samplePos.Y(sampleNum[3]), - samplePos.Y(sampleNum[2]), - samplePos.Y(sampleNum[1]), - samplePos.Y(sampleNum[0])); - // add sample offset to UL pixel corner - vXSample = _simd_add_ps(vXSamplePosUL, vXSample); - vYSample = _simd_add_ps(vYSamplePosUL, vYSample); - - // Case (1) and case (3b) - All samples covered or not covered with full SampleMask - static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask(); - simdscalari vInputCoveragei = _simd_set_epi32(inputMask[7], - inputMask[6], - inputMask[5], - inputMask[4], - inputMask[3], - inputMask[2], - inputMask[1], - inputMask[0]); - simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask); - - static const simdscalari vZero = _simd_setzero_si(); - const simdscalari 
vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask); - simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero); - simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask); - simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask); - - simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b); - - // set the centroid position based on results from above - psContext.vX.centroid = - _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter)); - psContext.vY.centroid = - _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter)); - - // Case (3a) No samples covered and partial sample mask - simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask); - // sample mask should never be all 0's for this case, but handle it anyways - unsigned long firstCoveredSampleMaskSample = 0; - (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) - : (firstCoveredSampleMaskSample = 0); - - simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples); - - vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample)); - vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample)); - - // blend in case 3a pixel locations - psContext.vX.centroid = - _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a)); - psContext.vY.centroid = - _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a)); -} - -INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, - SWR_PS_CONTEXT& psContext, - const simdscalar& vXSamplePosUL, - const simdscalar& vYSamplePosUL) -{ - // evaluate I,J - psContext.vI.centroid = - vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid); - psContext.vJ.centroid = - vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid); - 
psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet); - psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet); - - // interpolate 1/w - psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, - coeffs.vBOneOverW, - coeffs.vCOneOverW, - psContext.vI.centroid, - psContext.vJ.centroid); -} - -INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const& z, float minz, float maxz) -{ - const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz)); - const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz)); - - return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask)); -} - -template <typename T> -INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount) -{ - // RT has to be single sample if we're in forcedMSAA mode - if (T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X)) - { - return 1; - } - // unless we're forced to single sample, in which case we run the OM at the sample count of the - // RT - else if (T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X)) - { - return GetNumSamples(blendSampleCount); - } - // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count - else - { - return T::MultisampleT::numSamples; - } -} - -inline void SetupBarycentricCoeffs(BarycentricCoeffs* coeffs, const SWR_TRIANGLE_DESC& work) -{ - // broadcast scalars - - coeffs->vIa = _simd_broadcast_ss(&work.I[0]); - coeffs->vIb = _simd_broadcast_ss(&work.I[1]); - coeffs->vIc = _simd_broadcast_ss(&work.I[2]); - - coeffs->vJa = _simd_broadcast_ss(&work.J[0]); - coeffs->vJb = _simd_broadcast_ss(&work.J[1]); - coeffs->vJc = _simd_broadcast_ss(&work.J[2]); - - coeffs->vZa = _simd_broadcast_ss(&work.Z[0]); - coeffs->vZb = _simd_broadcast_ss(&work.Z[1]); - coeffs->vZc = _simd_broadcast_ss(&work.Z[2]); - - coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet); - - coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); - 
coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); - coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); -} - -inline void SetupRenderBuffers(uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS], - uint8_t** pDepthBuffer, - uint8_t** pStencilBuffer, - uint32_t colorHotTileMask, - RenderOutputBuffers& renderBuffers) -{ - unsigned long index; - while (_BitScanForward(&index, colorHotTileMask)) - { - assert(index < SWR_NUM_RENDERTARGETS); - colorHotTileMask &= ~(1 << index); - pColorBuffer[index] = renderBuffers.pColor[index]; - } - - if (pDepthBuffer) - { - *pDepthBuffer = renderBuffers.pDepth; - } - - if (pStencilBuffer) - { - *pStencilBuffer = renderBuffers.pStencil; - ; - } -} - -INLINE void SetRenderHotTilesDirty(DRAW_CONTEXT* pDC, RenderOutputBuffers& renderBuffers) -{ - const API_STATE& state = GetApiState(pDC); - - unsigned long rtSlot = 0; - uint32_t colorHottileEnableMask = state.colorHottileEnable; - while (_BitScanForward(&rtSlot, colorHottileEnableMask)) - { - colorHottileEnableMask &= ~(1 << rtSlot); - renderBuffers.pColorHotTile[rtSlot]->state = HOTTILE_DIRTY; - } -} - -template <typename T> -void SetupPixelShaderContext(SWR_PS_CONTEXT* psContext, - const SWR_MULTISAMPLE_POS& samplePos, - SWR_TRIANGLE_DESC& work) -{ - psContext->pAttribs = work.pAttribs; - psContext->pPerspAttribs = work.pPerspAttribs; - psContext->frontFace = work.triFlags.frontFacing; - psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex; - psContext->viewportIndex = work.triFlags.viewportIndex; - - // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull - // attribs - psContext->I = work.I; - psContext->J = work.J; - - psContext->recipDet = work.recipDet; - psContext->pRecipW = work.pRecipW; - psContext->pSamplePosX = - samplePos.X(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosX); - psContext->pSamplePosY = - samplePos.Y(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosY); - 
psContext->rasterizerSampleCount = T::MultisampleT::numSamples; - psContext->sampleIndex = 0; -} - -template <typename T, bool IsSingleSample> -void CalcCentroid(SWR_PS_CONTEXT* psContext, - const SWR_MULTISAMPLE_POS& samplePos, - const BarycentricCoeffs& coeffs, - const uint64_t* const coverageMask, - uint32_t sampleMask) -{ - if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid - // positions are still different - { - // for 1x case, centroid is pixel center - psContext->vX.centroid = psContext->vX.center; - psContext->vY.centroid = psContext->vY.center; - psContext->vI.centroid = psContext->vI.center; - psContext->vJ.centroid = psContext->vJ.center; - psContext->vOneOverW.centroid = psContext->vOneOverW.center; - } - else - { - if (T::bCentroidPos) - { - ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid - if (T::bIsCenterPattern) - { - psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f)); - psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f)); - } - else - { - // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate - // coverage 2X'.. 
- CalcCentroidPos<T>(*psContext, - samplePos, - coverageMask, - sampleMask, - psContext->vX.UL, - psContext->vY.UL); - } - - CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL); - } - else - { - psContext->vX.centroid = psContext->vX.sample; - psContext->vY.centroid = psContext->vY.sample; - } - } -} - -template <typename T> -struct PixelRateZTestLoop -{ - PixelRateZTestLoop(DRAW_CONTEXT* DC, - uint32_t _workerId, - const SWR_TRIANGLE_DESC& Work, - const BarycentricCoeffs& Coeffs, - const API_STATE& apiState, - uint8_t*& depthBuffer, - uint8_t*& stencilBuffer, - const uint8_t ClipDistanceMask) : - pDC(DC), - workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState), - samplePos(state.rastState.samplePositions), clipDistanceMask(ClipDistanceMask), - pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){}; - - INLINE - uint32_t operator()(simdscalar& activeLanes, - SWR_PS_CONTEXT& psContext, - const CORE_BUCKETS BEDepthBucket, - uint32_t currentSimdIn8x8 = 0) - { - - uint32_t statCount = 0; - simdscalar anyDepthSamplePassed = _simd_setzero_ps(); - for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) - { - const uint8_t* pCoverageMask = (uint8_t*)&work.coverageMask[sample]; - vCoverageMask[sample] = - _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK)); - - if (!_simd_movemask_ps(vCoverageMask[sample])) - { - vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = - _simd_setzero_ps(); - continue; - } - - // offset depth/stencil buffers current sample - uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); - uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); - - if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) - { - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, - "Unsupported depth hot tile format"); - - const simdscalar z = 
_simd_load_ps(reinterpret_cast<const float*>(pDepthSample)); - - const float minz = state.depthBoundsState.depthBoundsTestMinValue; - const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; - - vCoverageMask[sample] = - _simd_and_ps(vCoverageMask[sample], - _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz))); - } - - RDTSC_BEGIN(psContext.pBucketManager, BEBarycentric, pDC->drawId); - - // calculate per sample positions - psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample)); - psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample)); - - // calc I & J per sample - CalcSampleBarycentrics(coeffs, psContext); - - if (psState.writesODepth) - { - { - // broadcast and test oDepth(psContext.vZ) written from the PS for each sample - vZ[sample] = psContext.vZ; - } - } - else - { - vZ[sample] = vplaneps( - coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); - vZ[sample] = state.pfnQuantizeDepth(vZ[sample]); - } - - RDTSC_END(psContext.pBucketManager, BEBarycentric, 0); - - ///@todo: perspective correct vs non-perspective correct clipping? - // if clip distances are enabled, we need to interpolate for each sample - if (clipDistanceMask) - { - uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, - work.pUserClipBuffer, - psContext.vI.sample, - psContext.vJ.sample); - - vCoverageMask[sample] = - _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask)); - } - - // ZTest for this sample - ///@todo Need to uncomment out this bucket. 
- // RDTSC_BEGIN(psContext.pBucketManager, BEDepthBucket, pDC->drawId); - depthPassMask[sample] = vCoverageMask[sample]; - stencilPassMask[sample] = vCoverageMask[sample]; - depthPassMask[sample] = DepthStencilTest(&state, - work.triFlags.frontFacing, - work.triFlags.viewportIndex, - vZ[sample], - pDepthSample, - vCoverageMask[sample], - pStencilSample, - &stencilPassMask[sample]); - // RDTSC_END(psContext.pBucketManager, BEDepthBucket, 0); - - // early-exit if no pixels passed depth or earlyZ is forced on - if (psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample])) - { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], - &state.depthStencilState, - work.triFlags.frontFacing, - vZ[sample], - pDepthSample, - depthPassMask[sample], - vCoverageMask[sample], - pStencilSample, - stencilPassMask[sample]); - - if (!_simd_movemask_ps(depthPassMask[sample])) - { - continue; - } - } - anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]); - uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]); - statCount += _mm_popcnt_u32(statMask); - } - - activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes); - // return number of samples that passed depth and coverage - return statCount; - } - - // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite - simdscalar vZ[T::MultisampleT::numCoverageSamples]; - simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples]; - simdscalar depthPassMask[T::MultisampleT::numCoverageSamples]; - simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples]; - -private: - // functor inputs - DRAW_CONTEXT* pDC; - uint32_t workerId; - - const SWR_TRIANGLE_DESC& work; - const BarycentricCoeffs& coeffs; - const API_STATE& state; - const SWR_PS_STATE& psState; - const SWR_MULTISAMPLE_POS& samplePos; - const uint8_t clipDistanceMask; - uint8_t*& pDepthBuffer; - uint8_t*& pStencilBuffer; -}; - -INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, 
SWR_PS_CONTEXT& psContext) -{ - // evaluate I,J - psContext.vI.center = - vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center); - psContext.vJ.center = - vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center); - psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet); - psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet); - - // interpolate 1/w - psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, - coeffs.vBOneOverW, - coeffs.vCOneOverW, - psContext.vI.center, - psContext.vJ.center); -} - -static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, - SWR_PS_CONTEXT& psContext) -{ - // evaluate I,J - psContext.vI.sample = - vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample); - psContext.vJ.sample = - vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample); - psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet); - psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet); - - // interpolate 1/w - psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, - coeffs.vBOneOverW, - coeffs.vCOneOverW, - psContext.vI.sample, - psContext.vJ.sample); -} - -// Merge Output to 8x2 SIMD16 Tile Format -INLINE void OutputMerger8x2(DRAW_CONTEXT* pDC, - SWR_PS_CONTEXT& psContext, - uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], - uint32_t sample, - const SWR_BLEND_STATE* pBlendState, - const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], - simdscalar& coverageMask, - simdscalar const& depthPassMask, - uint32_t renderTargetMask, - bool useAlternateOffset, - uint32_t workerId) -{ - // type safety guaranteed from template instantiation in BEChooser<>::GetFunc - uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); - - if (useAlternateOffset) - { - rasterTileColorOffset += sizeof(simdscalar); - } - - simdvector blendSrc; - simdvector 
blendOut; - - unsigned long rt; - while (_BitScanForward(&rt, renderTargetMask)) - { - renderTargetMask &= ~(1 << rt); - - const SWR_RENDER_TARGET_BLEND_STATE* pRTBlend = &pBlendState->renderTarget[rt]; - - simdscalar* pColorSample; - bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || - !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue; - if (hotTileEnable) - { - pColorSample = reinterpret_cast<simdscalar*>(pColorBase[rt] + rasterTileColorOffset); - blendSrc[0] = pColorSample[0]; - blendSrc[1] = pColorSample[2]; - blendSrc[2] = pColorSample[4]; - blendSrc[3] = pColorSample[6]; - } - else - { - pColorSample = nullptr; - } - - SWR_BLEND_CONTEXT blendContext = {0}; - { - // pfnBlendFunc may not update all channels. Initialize with PS output. - /// TODO: move this into the blend JIT. - blendOut = psContext.shaded[rt]; - - blendContext.pBlendState = pBlendState; - blendContext.src = &psContext.shaded[rt]; - blendContext.src1 = &psContext.shaded[1]; - blendContext.src0alpha = reinterpret_cast<simdvector*>(&psContext.shaded[0].w); - blendContext.sampleNum = sample; - blendContext.pDst = &blendSrc; - blendContext.result = &blendOut; - blendContext.oMask = &psContext.oMask; - blendContext.pMask = reinterpret_cast<simdscalari*>(&coverageMask); - - // Blend outputs and update coverage mask for alpha test - if (pfnBlendFunc[rt] != nullptr) - { - pfnBlendFunc[rt](&blendContext); - } - } - - // Track alpha events - AR_EVENT( - AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended)); - - // final write mask - simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask)); - - ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. 
- static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, - "Unsupported hot tile format"); - - // store with color mask - if (!pRTBlend->writeDisableRed) - { - _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[0]), outputMask, blendOut.x); - } - if (!pRTBlend->writeDisableGreen) - { - _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[2]), outputMask, blendOut.y); - } - if (!pRTBlend->writeDisableBlue) - { - _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[4]), outputMask, blendOut.z); - } - if (!pRTBlend->writeDisableAlpha) - { - _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[6]), outputMask, blendOut.w); - } - } -} - -template <typename T> -void BackendPixelRate(DRAW_CONTEXT* pDC, - uint32_t workerId, - uint32_t x, - uint32_t y, - SWR_TRIANGLE_DESC& work, - RenderOutputBuffers& renderBuffers) -{ - ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the - /// backend - - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelRateBackend, pDC->drawId); - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId); - - const API_STATE& state = GetApiState(pDC); - - BarycentricCoeffs coeffs; - SetupBarycentricCoeffs(&coeffs, work); - - SWR_CONTEXT* pContext = pDC->pContext; - void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - - SWR_PS_CONTEXT psContext; - const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; - SetupPixelShaderContext<T>(&psContext, samplePos, work); - - uint8_t *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(psContext.pColorBuffer, - &pDepthBuffer, - &pStencilBuffer, - state.colorHottileEnable, - renderBuffers); - - bool isTileDirty = false; - - RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0); - - PixelRateZTestLoop<T> PixelRateZTest(pDC, - workerId, - work, - coeffs, - state, - pDepthBuffer, - pStencilBuffer, - state.backendState.clipDistanceMask); - - psContext.vY.UL = _simd_add_ps(vULOffsetsY, 
_simd_set1_ps(static_cast<float>(y))); - psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); - - const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); - - for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) - { - psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); - psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); - - const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); - - for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) - { - const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); - - - simdscalar activeLanes; - if (!(work.anyCoveredSamples & MASK)) - { - goto Endtile; - }; - activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK); - - if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) - { - const uint64_t* pCoverageMask = - (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) - ? &work.innerCoverageMask - : &work.coverageMask[0]; - - generateInputCoverage<T, T::InputCoverage>( - pCoverageMask, psContext.inputMask, state.blendState.sampleMask); - } - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId); - - CalcPixelBarycentrics(coeffs, psContext); - - CalcCentroid<T, false>( - &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); - - RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0); - - if (T::bForcedSampleCount) - { - // candidate pixels (that passed coverage) will cause shader invocation if any bits - // in the samplemask are set - const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32( - _simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si())); - activeLanes = _simd_and_ps(activeLanes, vSampleMask); - } - - // Early-Z? 
- if (T::bCanEarlyZ && !T::bForcedSampleCount) - { - uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest); - UPDATE_STAT_BE(DepthPassCount, depthPassCount); - AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); - } - - // if we have no covered samples that passed depth at this point, go to next tile - if (!_simd_movemask_ps(activeLanes)) - { - goto Endtile; - }; - - if (state.psState.usesSourceDepth) - { - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId); - // interpolate and quantize z - psContext.vZ = vplaneps( - coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); - psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); - RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0); - } - - // pixels that are currently active - psContext.activeMask = _simd_castps_si(activeLanes); - psContext.oMask = T::MultisampleT::FullSampleMask(); - - // execute pixel shader - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId); - state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext); - RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0); - - // update stats - UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes))); - AR_EVENT(PSStats((HANDLE)&psContext.stats)); - - // update active lanes to remove any discarded or oMask'd pixels - activeLanes = _simd_castsi_ps(_simd_and_si( - psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si()))); - if (!_simd_movemask_ps(activeLanes)) - { - goto Endtile; - }; - - isTileDirty = true; - - // late-Z - if (!T::bCanEarlyZ && !T::bForcedSampleCount) - { - uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest); - UPDATE_STAT_BE(DepthPassCount, depthPassCount); - AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); - } - - // if we have no covered samples that passed depth at this point, skip OM and go to next - 
// tile - if (!_simd_movemask_ps(activeLanes)) - { - goto Endtile; - }; - - // output merger - // loop over all samples, broadcasting the results of the PS to all passing pixels - for (uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); - sample++) - { - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId); - // center pattern does a single coverage/depth/stencil test, standard pattern tests - // all samples - uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample; - simdscalar coverageMask, depthMask; - if (T::bForcedSampleCount) - { - coverageMask = depthMask = activeLanes; - } - else - { - coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum]; - depthMask = PixelRateZTest.depthPassMask[coverageSampleNum]; - if (!_simd_movemask_ps(depthMask)) - { - // stencil should already have been written in early/lateZ tests - RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0); - continue; - } - } - - // broadcast the results of the PS to all passing pixels - - OutputMerger8x2(pDC, - psContext, - psContext.pColorBuffer, - sample, - &state.blendState, - state.pfnBlendFunc, - coverageMask, - depthMask, - state.psState.renderTargetMask, - useAlternateOffset, - workerId); - - - if (!state.psState.forceEarlyZ && !T::bForcedSampleCount) - { - uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); - uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); - - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], - &state.depthStencilState, - work.triFlags.frontFacing, - PixelRateZTest.vZ[coverageSampleNum], - pDepthSample, - depthMask, - coverageMask, - pStencilSample, - PixelRateZTest.stencilPassMask[coverageSampleNum]); - } - RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0); - } - Endtile: - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId); - - for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) - { - work.coverageMask[sample] 
>>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - - if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) - { - work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - - if (useAlternateOffset) - { - unsigned long rt; - uint32_t rtMask = state.colorHottileEnable; - while (_BitScanForward(&rt, rtMask)) - { - rtMask &= ~(1 << rt); - psContext.pColorBuffer[rt] += - (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; - } - } - - pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; - pStencilBuffer += - (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; - - RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0); - - psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); - psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); - } - - psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); - psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); - } - - if (isTileDirty) - { - SetRenderHotTilesDirty(pDC, renderBuffers); - } - - RDTSC_END(pDC->pContext->pBucketMgr, BEPixelRateBackend, 0); -} - -template <uint32_t sampleCountT = SWR_MULTISAMPLE_1X, - uint32_t isCenter = 0, - uint32_t coverage = 0, - uint32_t centroid = 0, - uint32_t forced = 0, - uint32_t canEarlyZ = 0 - > -struct SwrBackendTraits -{ - static const bool bIsCenterPattern = (isCenter == 1); - static const uint32_t InputCoverage = coverage; - static const bool bCentroidPos = (centroid == 1); - static const bool bForcedSampleCount = (forced == 1); - static const bool bCanEarlyZ = (canEarlyZ == 1); - typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT; -}; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp deleted file mode 100644 index 7881d36ddb9..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp 
+++ /dev/null @@ -1,454 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file backend.cpp - * - * @brief Backend handles rasterization, pixel shading and output merger - * operations. 
- * - ******************************************************************************/ - -#include <smmintrin.h> - -#include "backend.h" -#include "backend_impl.h" -#include "tilemgr.h" -#include "memory/tilingtraits.h" -#include "core/multisample.h" - -#include <algorithm> - -template <typename T> -void BackendSampleRate(DRAW_CONTEXT* pDC, - uint32_t workerId, - uint32_t x, - uint32_t y, - SWR_TRIANGLE_DESC& work, - RenderOutputBuffers& renderBuffers) -{ - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESampleRateBackend, pDC->drawId); - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId); - - void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - const API_STATE& state = GetApiState(pDC); - - BarycentricCoeffs coeffs; - SetupBarycentricCoeffs(&coeffs, work); - - SWR_PS_CONTEXT psContext; - const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; - SetupPixelShaderContext<T>(&psContext, samplePos, work); - - uint8_t *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(psContext.pColorBuffer, - &pDepthBuffer, - &pStencilBuffer, - state.colorHottileEnable, - renderBuffers); - - bool isTileDirty = false; - - RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0); - - psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); - psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); - - const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); - - for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) - { - psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); - psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); - - const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); - - for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) - { - const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); - - - if (T::InputCoverage != 
SWR_INPUT_COVERAGE_NONE) - { - const uint64_t* pCoverageMask = - (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) - ? &work.innerCoverageMask - : &work.coverageMask[0]; - - generateInputCoverage<T, T::InputCoverage>( - pCoverageMask, psContext.inputMask, state.blendState.sampleMask); - } - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId); - - CalcPixelBarycentrics(coeffs, psContext); - - CalcCentroid<T, false>( - &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); - - RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0); - - for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++) - { - simdmask coverageMask = work.coverageMask[sample] & MASK; - - if (coverageMask) - { - // offset depth/stencil buffers current sample - uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); - uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); - - if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) - { - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, - "Unsupported depth hot tile format"); - - const simdscalar z = - _simd_load_ps(reinterpret_cast<const float*>(pDepthSample)); - - const float minz = state.depthBoundsState.depthBoundsTestMinValue; - const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; - - coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); - } - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId); - - // calculate per sample positions - psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample)); - psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample)); - - CalcSampleBarycentrics(coeffs, psContext); - - // interpolate and quantize z - psContext.vZ = vplaneps(coeffs.vZa, - coeffs.vZb, - coeffs.vZc, - psContext.vI.sample, - psContext.vJ.sample); - psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); - - RDTSC_END(pDC->pContext->pBucketMgr, 
BEBarycentric, 0); - - // interpolate user clip distance if available - if (state.backendState.clipDistanceMask) - { - coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, - work.pUserClipBuffer, - psContext.vI.sample, - psContext.vJ.sample); - } - - simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); - simdscalar depthPassMask = vCoverageMask; - simdscalar stencilPassMask = vCoverageMask; - - // Early-Z? - if (T::bCanEarlyZ) - { - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId); - depthPassMask = DepthStencilTest(&state, - work.triFlags.frontFacing, - work.triFlags.viewportIndex, - psContext.vZ, - pDepthSample, - vCoverageMask, - pStencilSample, - &stencilPassMask); - AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), - _simd_movemask_ps(stencilPassMask), - _simd_movemask_ps(vCoverageMask))); - RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0); - - // early-exit if no samples passed depth or earlyZ is forced on. 
- if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) - { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], - &state.depthStencilState, - work.triFlags.frontFacing, - psContext.vZ, - pDepthSample, - depthPassMask, - vCoverageMask, - pStencilSample, - stencilPassMask); - - if (!_simd_movemask_ps(depthPassMask)) - { - work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - continue; - } - } - } - - psContext.sampleIndex = sample; - psContext.activeMask = _simd_castps_si(vCoverageMask); - - // execute pixel shader - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId); - state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext); - RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0); - - // update stats - UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); - AR_EVENT(PSStats((HANDLE)&psContext.stats)); - - vCoverageMask = _simd_castsi_ps(psContext.activeMask); - - if (_simd_movemask_ps(vCoverageMask)) - { - isTileDirty = true; - } - - // late-Z - if (!T::bCanEarlyZ) - { - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId); - depthPassMask = DepthStencilTest(&state, - work.triFlags.frontFacing, - work.triFlags.viewportIndex, - psContext.vZ, - pDepthSample, - vCoverageMask, - pStencilSample, - &stencilPassMask); - AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), - _simd_movemask_ps(stencilPassMask), - _simd_movemask_ps(vCoverageMask))); - RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0); - - if (!_simd_movemask_ps(depthPassMask)) - { - // need to call depth/stencil write for stencil write - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], - &state.depthStencilState, - work.triFlags.frontFacing, - psContext.vZ, - pDepthSample, - depthPassMask, - vCoverageMask, - pStencilSample, - stencilPassMask); - - work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - continue; - } - } - - uint32_t statMask = 
_simd_movemask_ps(depthPassMask); - uint32_t statCount = _mm_popcnt_u32(statMask); - UPDATE_STAT_BE(DepthPassCount, statCount); - - // output merger - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId); - - OutputMerger8x2(pDC, - psContext, - psContext.pColorBuffer, - sample, - &state.blendState, - state.pfnBlendFunc, - vCoverageMask, - depthPassMask, - state.psState.renderTargetMask, - useAlternateOffset, - workerId); - - // do final depth write after all pixel kills - if (!state.psState.forceEarlyZ) - { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], - &state.depthStencilState, - work.triFlags.frontFacing, - psContext.vZ, - pDepthSample, - depthPassMask, - vCoverageMask, - pStencilSample, - stencilPassMask); - } - RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0); - } - work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - - Endtile: - ATTR_UNUSED; - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId); - - if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) - { - work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - - if (useAlternateOffset) - { - unsigned long rt; - uint32_t rtMask = state.colorHottileEnable; - while (_BitScanForward(&rt, rtMask)) - { - rtMask &= ~(1 << rt); - psContext.pColorBuffer[rt] += - (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; - } - } - - pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; - pStencilBuffer += - (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; - - RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0); - - psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); - psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); - } - - psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); - psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); - } - - if (isTileDirty) - { - SetRenderHotTilesDirty(pDC, renderBuffers); - } - - 
RDTSC_END(pDC->pContext->pBucketMgr, BESampleRateBackend, 0); -} - -// Recursive template used to auto-nest conditionals. Converts dynamic enum function -// arguments to static template arguments. -template <uint32_t... ArgsT> -struct BEChooserSampleRate -{ - // Last Arg Terminator - static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg) - { - switch (tArg) - { - case SWR_BACKEND_MSAA_SAMPLE_RATE: - return BackendSampleRate<SwrBackendTraits<ArgsT...>>; - break; - case SWR_BACKEND_SINGLE_SAMPLE: - case SWR_BACKEND_MSAA_PIXEL_RATE: - SWR_ASSERT(0 && "Invalid backend func\n"); - return nullptr; - break; - default: - SWR_ASSERT(0 && "Invalid backend func\n"); - return nullptr; - break; - } - } - - // Recursively parse args - template <typename... TArgsT> - static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs) - { - switch (tArg) - { - case SWR_INPUT_COVERAGE_NONE: - return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc( - remainingArgs...); - break; - case SWR_INPUT_COVERAGE_NORMAL: - return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc( - remainingArgs...); - break; - case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: - return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc( - remainingArgs...); - break; - default: - SWR_ASSERT(0 && "Invalid sample pattern\n"); - return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc( - remainingArgs...); - break; - } - } - - // Recursively parse args - template <typename... TArgsT> - static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... 
remainingArgs) - { - switch (tArg) - { - case SWR_MULTISAMPLE_1X: - return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); - break; - case SWR_MULTISAMPLE_2X: - return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); - break; - case SWR_MULTISAMPLE_4X: - return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); - break; - case SWR_MULTISAMPLE_8X: - return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); - break; - case SWR_MULTISAMPLE_16X: - return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); - break; - default: - SWR_ASSERT(0 && "Invalid sample count\n"); - return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); - break; - } - } - - // Recursively parse args - template <typename... TArgsT> - static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs) - { - if (tArg == true) - { - return BEChooserSampleRate<ArgsT..., 1>::GetFunc(remainingArgs...); - } - - return BEChooserSampleRate<ArgsT..., 0>::GetFunc(remainingArgs...); - } -}; - -void InitBackendSampleFuncTable( - PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]) -{ - for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; - sampleCount++) - { - for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) - { - for (uint32_t centroid = 0; centroid < 2; centroid++) - { - for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) - { - table[sampleCount][inputCoverage][centroid][canEarlyZ] = - BEChooserSampleRate<>::GetFunc( - (SWR_MULTISAMPLE_COUNT)sampleCount, - false, - (SWR_INPUT_COVERAGE)inputCoverage, - (centroid > 0), - false, - (canEarlyZ > 0), - (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); - } - } - } - } -} diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp deleted file mode 100644 index 06f78c4b88a..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp +++ /dev/null @@ -1,428 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file backend.cpp - * - * @brief Backend handles rasterization, pixel shading and output merger - * operations. 
- * - ******************************************************************************/ - -#include <smmintrin.h> - -#include "backend.h" -#include "backend_impl.h" -#include "tilemgr.h" -#include "memory/tilingtraits.h" -#include "core/multisample.h" - -#include <algorithm> - -template <typename T> -void BackendSingleSample(DRAW_CONTEXT* pDC, - uint32_t workerId, - uint32_t x, - uint32_t y, - SWR_TRIANGLE_DESC& work, - RenderOutputBuffers& renderBuffers) -{ - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESingleSampleBackend, pDC->drawId); - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId); - - void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - - const API_STATE& state = GetApiState(pDC); - - BarycentricCoeffs coeffs; - SetupBarycentricCoeffs(&coeffs, work); - - SWR_PS_CONTEXT psContext; - const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; - SetupPixelShaderContext<T>(&psContext, samplePos, work); - - uint8_t *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(psContext.pColorBuffer, - &pDepthBuffer, - &pStencilBuffer, - state.colorHottileEnable, - renderBuffers); - - // Indicates backend rendered something to the color buffer - bool isTileDirty = false; - - RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 1); - - psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); - psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); - - const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); - - for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) - { - psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); - psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); - - const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); - - for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) - { - const bool useAlternateOffset = 
((xx & SIMD_TILE_X_DIM) != 0); - - - simdmask coverageMask = work.coverageMask[0] & MASK; - - if (coverageMask) - { - if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) - { - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, - "Unsupported depth hot tile format"); - - const simdscalar z = - _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer)); - - const float minz = state.depthBoundsState.depthBoundsTestMinValue; - const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; - - coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); - } - - if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) - { - const uint64_t* pCoverageMask = - (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) - ? &work.innerCoverageMask - : &work.coverageMask[0]; - - generateInputCoverage<T, T::InputCoverage>( - pCoverageMask, psContext.inputMask, state.blendState.sampleMask); - } - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId); - - CalcPixelBarycentrics(coeffs, psContext); - - CalcCentroid<T, true>( - &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); - - // interpolate and quantize z - psContext.vZ = vplaneps( - coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); - psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); - - RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 1); - - // interpolate user clip distance if available - if (state.backendState.clipDistanceMask) - { - coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, - work.pUserClipBuffer, - psContext.vI.center, - psContext.vJ.center); - } - - simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); - simdscalar depthPassMask = vCoverageMask; - simdscalar stencilPassMask = vCoverageMask; - - // Early-Z? 
- if (T::bCanEarlyZ) - { - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId); - depthPassMask = DepthStencilTest(&state, - work.triFlags.frontFacing, - work.triFlags.viewportIndex, - psContext.vZ, - pDepthBuffer, - vCoverageMask, - pStencilBuffer, - &stencilPassMask); - AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), - _simd_movemask_ps(stencilPassMask), - _simd_movemask_ps(vCoverageMask))); - RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0); - - // early-exit if no pixels passed depth or earlyZ is forced on - if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) - { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], - &state.depthStencilState, - work.triFlags.frontFacing, - psContext.vZ, - pDepthBuffer, - depthPassMask, - vCoverageMask, - pStencilBuffer, - stencilPassMask); - - if (!_simd_movemask_ps(depthPassMask)) - { - goto Endtile; - } - } - } - - psContext.sampleIndex = 0; - psContext.activeMask = _simd_castps_si(vCoverageMask); - - // execute pixel shader - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId); - state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext); - RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0); - - // update stats - UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); - AR_EVENT(PSStats((HANDLE)&psContext.stats)); - - vCoverageMask = _simd_castsi_ps(psContext.activeMask); - - if (_simd_movemask_ps(vCoverageMask)) - { - isTileDirty = true; - } - - // late-Z - if (!T::bCanEarlyZ) - { - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId); - depthPassMask = DepthStencilTest(&state, - work.triFlags.frontFacing, - work.triFlags.viewportIndex, - psContext.vZ, - pDepthBuffer, - vCoverageMask, - pStencilBuffer, - &stencilPassMask); - AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), - _simd_movemask_ps(stencilPassMask), - 
_simd_movemask_ps(vCoverageMask))); - RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0); - - if (!_simd_movemask_ps(depthPassMask)) - { - // need to call depth/stencil write for stencil write - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], - &state.depthStencilState, - work.triFlags.frontFacing, - psContext.vZ, - pDepthBuffer, - depthPassMask, - vCoverageMask, - pStencilBuffer, - stencilPassMask); - goto Endtile; - } - } - else - { - // for early z, consolidate discards from shader - // into depthPassMask - depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask); - } - - uint32_t statMask = _simd_movemask_ps(depthPassMask); - uint32_t statCount = _mm_popcnt_u32(statMask); - UPDATE_STAT_BE(DepthPassCount, statCount); - - // output merger - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId); - - OutputMerger8x2(pDC, - psContext, - psContext.pColorBuffer, - 0, - &state.blendState, - state.pfnBlendFunc, - vCoverageMask, - depthPassMask, - state.psState.renderTargetMask, - useAlternateOffset, - workerId); - - // do final depth write after all pixel kills - if (!state.psState.forceEarlyZ) - { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], - &state.depthStencilState, - work.triFlags.frontFacing, - psContext.vZ, - pDepthBuffer, - depthPassMask, - vCoverageMask, - pStencilBuffer, - stencilPassMask); - } - RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0); - } - - Endtile: - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId); - - work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) - { - work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - - if (useAlternateOffset) - { - unsigned long rt; - uint32_t rtMask = state.colorHottileEnable; - while (_BitScanForward(&rt, rtMask)) - { - rtMask &= ~(1 << rt); - psContext.pColorBuffer[rt] += - (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; - } - } - 
- pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; - pStencilBuffer += - (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; - - RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0); - - psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); - psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); - } - - psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); - psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); - } - - if (isTileDirty) - { - SetRenderHotTilesDirty(pDC, renderBuffers); - } - - RDTSC_END(pDC->pContext->pBucketMgr, BESingleSampleBackend, 0); -} - -// Recursive template used to auto-nest conditionals. Converts dynamic enum function -// arguments to static template arguments. -template <uint32_t... ArgsT> -struct BEChooserSingleSample -{ - // Last Arg Terminator - static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg) - { - switch (tArg) - { - case SWR_BACKEND_SINGLE_SAMPLE: - return BackendSingleSample<SwrBackendTraits<ArgsT...>>; - break; - case SWR_BACKEND_MSAA_PIXEL_RATE: - case SWR_BACKEND_MSAA_SAMPLE_RATE: - default: - SWR_ASSERT(0 && "Invalid backend func\n"); - return nullptr; - break; - } - } - - // Recursively parse args - template <typename... TArgsT> - static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... 
remainingArgs) - { - switch (tArg) - { - case SWR_INPUT_COVERAGE_NONE: - return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc( - remainingArgs...); - break; - case SWR_INPUT_COVERAGE_NORMAL: - return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc( - remainingArgs...); - break; - case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: - return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc( - remainingArgs...); - break; - default: - SWR_ASSERT(0 && "Invalid sample pattern\n"); - return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc( - remainingArgs...); - break; - } - } - - // Recursively parse args - template <typename... TArgsT> - static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs) - { - switch (tArg) - { - case SWR_MULTISAMPLE_1X: - return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); - break; - case SWR_MULTISAMPLE_2X: - return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); - break; - case SWR_MULTISAMPLE_4X: - return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); - break; - case SWR_MULTISAMPLE_8X: - return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); - break; - case SWR_MULTISAMPLE_16X: - return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); - break; - default: - SWR_ASSERT(0 && "Invalid sample count\n"); - return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); - break; - } - } - - // Recursively parse args - template <typename... TArgsT> - static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... 
remainingArgs) - { - if (tArg == true) - { - return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...); - } - - return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...); - } -}; - -void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]) -{ - for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) - { - for (uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) - { - for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) - { - table[inputCoverage][isCentroid][canEarlyZ] = - BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X, - false, - (SWR_INPUT_COVERAGE)inputCoverage, - (isCentroid > 0), - false, - (canEarlyZ > 0), - SWR_BACKEND_SINGLE_SAMPLE); - } - } - } -} diff --git a/src/gallium/drivers/swr/rasterizer/core/backends/meson.build b/src/gallium/drivers/swr/rasterizer/core/backends/meson.build deleted file mode 100644 index d64715dc8be..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/backends/meson.build +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright © 2017-2018 Intel Corporation - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - - -files_swr_common += custom_target( - 'gen_backend_pixel', - input : swr_gen_backends_py, - output : [ - 'gen_BackendPixelRate0.cpp', 'gen_BackendPixelRate1.cpp', - 'gen_BackendPixelRate2.cpp', 'gen_BackendPixelRate3.cpp', - 'gen_BackendPixelRate.hpp', - ], - command : [ - prog_python, '@INPUT@', - '--outdir', '@OUTDIR@', - '--dim', '5', '2', '3', '2', '2', '2', - '--numfiles', '4', - '--cpp', '--hpp', - ], - depend_files : [ swr_gen_backend_files, swr_gen_header_init_files ], -) - -files_swr_common += custom_target( - 'gen_backend_raster', - input : swr_gen_backends_py, - output : [ - 'gen_rasterizer0.cpp', 'gen_rasterizer1.cpp', - 'gen_rasterizer2.cpp', 'gen_rasterizer3.cpp', - 'gen_rasterizer.hpp', - ], - command : [ - prog_python, '@INPUT@', - '--outdir', '@OUTDIR@', - '--rast', - '--dim', '5', '2', '2', '3', '5', '2', - '--numfiles', '4', - '--cpp', '--hpp', - ], - depend_files : [ swr_gen_rasterizer_files, swr_gen_header_init_files ], -) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp deleted file mode 100644 index 36732289d76..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ /dev/null @@ -1,1976 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file binner.cpp - * - * @brief Implementation for the macrotile binner - * - ******************************************************************************/ - -#include "binner.h" -#include "context.h" -#include "frontend.h" -#include "conservativeRast.h" -#include "pa.h" -#include "rasterizer.h" -#include "rdtsc_core.h" -#include "tilemgr.h" - -// Function Prototype -template <typename SIMD_T, uint32_t SIMD_WIDTH> -void BinPostSetupLinesImpl(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - Vec4<SIMD_T> prim[], - Float<SIMD_T> recipW[], - uint32_t primMask, - Integer<SIMD_T> const& primID, - Integer<SIMD_T> const& viewportIdx, - Integer<SIMD_T> const& rtIdx); - -template <typename SIMD_T, uint32_t SIMD_WIDTH> -void BinPostSetupPointsImpl(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - Vec4<SIMD_T> prim[], - uint32_t primMask, - Integer<SIMD_T> const& primID, - Integer<SIMD_T> const& viewportIdx, - Integer<SIMD_T> const& rtIdx); - -////////////////////////////////////////////////////////////////////////// -/// @brief Processes attributes for the backend based on linkage mask and -/// linkage map. Essentially just doing an SOA->AOS conversion and pack. -/// @param pDC - Draw context -/// @param pa - Primitive Assembly state -/// @param linkageMask - Specifies which VS outputs are routed to PS. 
-/// @param pLinkageMap - maps VS attribute slot to PS slot -/// @param triIndex - Triangle to process attributes for -/// @param pBuffer - Output result -template <typename NumVertsT, - typename IsSwizzledT, - typename HasConstantInterpT, - typename IsDegenerate> -INLINE void ProcessAttributes( - DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t triIndex, uint32_t primId, float* pBuffer) -{ - static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT"); - const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; - // Conservative Rasterization requires degenerate tris to have constant attribute interpolation - uint32_t constantInterpMask = - IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask; - const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex; - const PRIMITIVE_TOPOLOGY topo = pa.binTopology; - - static const float constTable[3][4] = { - {0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 1.0f}, {1.0f, 1.0f, 1.0f, 1.0f}}; - - for (uint32_t i = 0; i < backendState.numAttributes; ++i) - { - uint32_t inputSlot; - if (IsSwizzledT::value) - { - SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i]; - inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib; - } - else - { - inputSlot = backendState.vertexAttribOffset + i; - } - - simd4scalar attrib[3]; // triangle attribs (always 4 wide) - float* pAttribStart = pBuffer; - - if (HasConstantInterpT::value || IsDegenerate::value) - { - if (CheckBit(constantInterpMask, i)) - { - uint32_t vid; - uint32_t adjustedTriIndex; - static const uint32_t tristripProvokingVertex[] = {0, 2, 1}; - static const int32_t quadProvokingTri[2][4] = {{0, 0, 0, 1}, {0, -1, 0, 0}}; - static const uint32_t quadProvokingVertex[2][4] = {{0, 1, 2, 2}, {0, 1, 1, 2}}; - static const int32_t qstripProvokingTri[2][4] = {{0, 0, 0, 1}, {-1, 0, 0, 0}}; - static const uint32_t qstripProvokingVertex[2][4] = {{0, 1, 2, 1}, {0, 0, 2, 1}}; - - 
switch (topo) - { - case TOP_QUAD_LIST: - adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex]; - vid = quadProvokingVertex[triIndex & 1][provokingVertex]; - break; - case TOP_QUAD_STRIP: - adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex]; - vid = qstripProvokingVertex[triIndex & 1][provokingVertex]; - break; - case TOP_TRIANGLE_STRIP: - adjustedTriIndex = triIndex; - vid = - (triIndex & 1) ? tristripProvokingVertex[provokingVertex] : provokingVertex; - break; - default: - adjustedTriIndex = triIndex; - vid = provokingVertex; - break; - } - - pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib); - - for (uint32_t i = 0; i < NumVertsT::value; ++i) - { - SIMD128::store_ps(pBuffer, attrib[vid]); - pBuffer += 4; - } - } - else - { - pa.AssembleSingle(inputSlot, triIndex, attrib); - - for (uint32_t i = 0; i < NumVertsT::value; ++i) - { - SIMD128::store_ps(pBuffer, attrib[i]); - pBuffer += 4; - } - } - } - else - { - pa.AssembleSingle(inputSlot, triIndex, attrib); - - for (uint32_t i = 0; i < NumVertsT::value; ++i) - { - SIMD128::store_ps(pBuffer, attrib[i]); - pBuffer += 4; - } - } - - // pad out the attrib buffer to 3 verts to ensure the triangle - // interpolation code in the pixel shader works correctly for the - // 3 topologies - point, line, tri. This effectively zeros out the - // effect of the missing vertices in the triangle interpolation. 
- for (uint32_t v = NumVertsT::value; v < 3; ++v) - { - SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]); - pBuffer += 4; - } - - // check for constant source overrides - if (IsSwizzledT::value) - { - uint32_t mask = backendState.swizzleMap[i].componentOverrideMask; - if (mask) - { - unsigned long comp; - while (_BitScanForward(&comp, mask)) - { - mask &= ~(1 << comp); - - float constantValue = 0.0f; - switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource) - { - case SWR_CONSTANT_SOURCE_CONST_0000: - case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT: - case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT: - constantValue = constTable[backendState.swizzleMap[i].constantSource][comp]; - break; - case SWR_CONSTANT_SOURCE_PRIM_ID: - constantValue = *(float*)&primId; - break; - } - - // apply constant value to all 3 vertices - for (uint32_t v = 0; v < 3; ++v) - { - pAttribStart[comp + v * 4] = constantValue; - } - } - } - } - } -} - -typedef void (*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*); - -struct ProcessAttributesChooser -{ - typedef PFN_PROCESS_ATTRIBUTES FuncType; - - template <typename... ArgsB> - static FuncType GetFunc() - { - return ProcessAttributes<ArgsB...>; - } -}; - -PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, - bool IsSwizzled, - bool HasConstantInterp, - bool IsDegenerate = false) -{ - return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc( - IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Processes enabled user clip distances. 
Loads the active clip -/// distances from the PA, sets up barycentric equations, and -/// stores the results to the output buffer -/// @param pa - Primitive Assembly state -/// @param primIndex - primitive index to process -/// @param clipDistMask - mask of enabled clip distances -/// @param pUserClipBuffer - buffer to store results -template <uint32_t NumVerts> -void ProcessUserClipDist(const SWR_BACKEND_STATE& state, - PA_STATE& pa, - uint32_t primIndex, - float* pRecipW, - float* pUserClipBuffer) -{ - unsigned long clipDist; - uint32_t clipDistMask = state.clipDistanceMask; - while (_BitScanForward(&clipDist, clipDistMask)) - { - clipDistMask &= ~(1 << clipDist); - uint32_t clipSlot = clipDist >> 2; - uint32_t clipComp = clipDist & 0x3; - uint32_t clipAttribSlot = - clipSlot == 0 ? state.vertexClipCullOffset : state.vertexClipCullOffset + 1; - - simd4scalar primClipDist[3]; - pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist); - - float vertClipDist[NumVerts]; - for (uint32_t e = 0; e < NumVerts; ++e) - { - OSALIGNSIMD(float) aVertClipDist[4]; - SIMD128::store_ps(aVertClipDist, primClipDist[e]); - vertClipDist[e] = aVertClipDist[clipComp]; - }; - - // setup plane equations for barycentric interpolation in the backend - float baryCoeff[NumVerts]; - float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1]; - for (uint32_t e = 0; e < NumVerts - 1; ++e) - { - baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last; - } - baryCoeff[NumVerts - 1] = last; - - for (uint32_t e = 0; e < NumVerts; ++e) - { - *(pUserClipBuffer++) = baryCoeff[e]; - } - } -} - -INLINE -void TransposeVertices(simd4scalar (&dst)[8], - const simdscalar& src0, - const simdscalar& src1, - const simdscalar& src2) -{ - vTranspose3x8(dst, src0, src1, src2); -} - -INLINE -void TransposeVertices(simd4scalar (&dst)[16], - const simd16scalar& src0, - const simd16scalar& src1, - const simd16scalar& src2) -{ - vTranspose4x16( - reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, 
_simd16_setzero_ps()); -} - -#if KNOB_ENABLE_EARLY_RAST - -#define ER_SIMD_TILE_X_DIM (1 << ER_SIMD_TILE_X_SHIFT) -#define ER_SIMD_TILE_Y_DIM (1 << ER_SIMD_TILE_Y_SHIFT) - -template <typename SIMD_T> -struct EarlyRastHelper -{ -}; - -template <> -struct EarlyRastHelper<SIMD256> -{ - static SIMD256::Integer InitShiftCntrl() - { - return SIMD256::set_epi32(24, 25, 26, 27, 28, 29, 30, 31); - } -}; - -#if USE_SIMD16_FRONTEND -template <> -struct EarlyRastHelper<SIMD512> -{ - static SIMD512::Integer InitShiftCntrl() - { - return SIMD512::set_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - } -}; - -#endif -////////////////////////////////////////////////////////////////////////// -/// @brief Early Rasterizer (ER); triangles that fit small (e.g. 4x4) tile -/// (ER tile) can be rasterized as early as in binner to check if -/// they cover any pixels. If not - the triangles can be -/// culled in binner. -/// -/// @param er_bbox - coordinates of ER tile for each triangle -/// @param vAi - A coefficients of triangle edges -/// @param vBi - B coefficients of triangle edges -/// @param vXi - X coordinates of triangle vertices -/// @param vYi - Y coordinates of triangle vertices -/// @param frontWindingTris - mask indicating CCW/CW triangles -/// @param triMask - mask for valid SIMD lanes (triangles) -/// @param oneTileMask - defines triangles for ER to work on -/// (tris that fit into ER tile) -template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT> -uint32_t SIMDCALL EarlyRasterizer(DRAW_CONTEXT* pDC, - SIMDBBOX_T<SIMD_T>& er_bbox, - Integer<SIMD_T> (&vAi)[3], - Integer<SIMD_T> (&vBi)[3], - Integer<SIMD_T> (&vXi)[3], - Integer<SIMD_T> (&vYi)[3], - uint32_t cwTrisMask, - uint32_t triMask, - uint32_t oneTileMask) -{ - // step to pixel center of top-left pixel of the triangle bbox - Integer<SIMD_T> vTopLeftX = - SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin); - vTopLeftX = SIMD_T::add_epi32(vTopLeftX, 
SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2)); - - Integer<SIMD_T> vTopLeftY = - SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin); - vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2)); - - // negate A and B for CW tris - Integer<SIMD_T> vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1)); - Integer<SIMD_T> vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1)); - Integer<SIMD_T> vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1)); - Integer<SIMD_T> vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1)); - Integer<SIMD_T> vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1)); - Integer<SIMD_T> vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1)); - - RDTSC_EVENT(pDC->pContext->pBucketMgr, - FEEarlyRastEnter, - _mm_popcnt_u32(oneTileMask & triMask), - 0); - - Integer<SIMD_T> vShiftCntrl = EarlyRastHelper<SIMD_T>::InitShiftCntrl(); - Integer<SIMD_T> vCwTris = SIMD_T::set1_epi32(cwTrisMask); - Integer<SIMD_T> vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl); - - vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps( - SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask))); - vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps( - SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask))); - vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps( - SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask))); - vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps( - SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask))); - vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps( - SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask))); - vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps( - SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask))); - - // evaluate edge equations at top-left pixel - Integer<SIMD_T> vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]); - 
Integer<SIMD_T> vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]); - Integer<SIMD_T> vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]); - - Integer<SIMD_T> vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]); - Integer<SIMD_T> vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]); - Integer<SIMD_T> vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]); - - Integer<SIMD_T> vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0); - Integer<SIMD_T> vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1); - Integer<SIMD_T> vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2); - - Integer<SIMD_T> vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0); - Integer<SIMD_T> vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1); - Integer<SIMD_T> vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2); - - Integer<SIMD_T> vEdge0 = SIMD_T::add_epi32(vAX0, vBY0); - Integer<SIMD_T> vEdge1 = SIMD_T::add_epi32(vAX1, vBY1); - Integer<SIMD_T> vEdge2 = SIMD_T::add_epi32(vAX2, vBY2); - - vEdge0 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge0); - vEdge1 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge1); - vEdge2 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge2); - - // top left rule - Integer<SIMD_T> vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1)); - Integer<SIMD_T> vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1)); - Integer<SIMD_T> vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1)); - - // vA < 0 - vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps( - SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0]))); - vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps( - SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1]))); - vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps( - SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2]))); - - // vA == 0 && vB < 0 - Integer<SIMD_T> vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si()); - Integer<SIMD_T> vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si()); - Integer<SIMD_T> 
vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si()); - - vCmp0 = SIMD_T::and_si(vCmp0, vBi[0]); - vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]); - vCmp2 = SIMD_T::and_si(vCmp2, vBi[2]); - - vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps( - SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0))); - vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps( - SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1))); - vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps( - SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2))); - -#if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4 - // Go down - // coverage pixel 0 - Integer<SIMD_T> vMask0 = SIMD_T::and_si(vEdge0, vEdge1); - vMask0 = SIMD_T::and_si(vMask0, vEdge2); - - // coverage pixel 1 - Integer<SIMD_T> vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]); - Integer<SIMD_T> vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]); - Integer<SIMD_T> vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]); - Integer<SIMD_T> vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask1 = SIMD_T::and_si(vMask1, vEdge2N); - - // coverage pixel 2 - vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); - Integer<SIMD_T> vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask2 = SIMD_T::and_si(vMask2, vEdge2N); - - // coverage pixel 3 - vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); - Integer<SIMD_T> vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask3 = SIMD_T::and_si(vMask3, vEdge2N); - - // One step to the right and then up - - // coverage pixel 4 - vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]); - vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]); - vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]); - Integer<SIMD_T> vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask4 = SIMD_T::and_si(vMask4, vEdge2N); - - // coverage pixel 5 - 
vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); - Integer<SIMD_T> vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask5 = SIMD_T::and_si(vMask5, vEdge2N); - - // coverage pixel 6 - vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); - Integer<SIMD_T> vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask6 = SIMD_T::and_si(vMask6, vEdge2N); - - // coverage pixel 7 - vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); - Integer<SIMD_T> vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask7 = SIMD_T::and_si(vMask7, vEdge2N); - - Integer<SIMD_T> vLit1 = SIMD_T::or_si(vMask0, vMask1); - vLit1 = SIMD_T::or_si(vLit1, vMask2); - vLit1 = SIMD_T::or_si(vLit1, vMask3); - vLit1 = SIMD_T::or_si(vLit1, vMask4); - vLit1 = SIMD_T::or_si(vLit1, vMask5); - vLit1 = SIMD_T::or_si(vLit1, vMask6); - vLit1 = SIMD_T::or_si(vLit1, vMask7); - - // Step to the right and go down again - - // coverage pixel 0 - vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]); - vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]); - vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]); - vMask0 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask0 = SIMD_T::and_si(vMask0, vEdge2N); - - // coverage pixel 1 - vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); - vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask1 = SIMD_T::and_si(vMask1, vEdge2N); - - // coverage pixel 2 - vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); - vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask2 = SIMD_T::and_si(vMask2, vEdge2N); - - // coverage pixel 3 - vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); - vEdge1N = 
SIMD_T::add_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); - vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask3 = SIMD_T::and_si(vMask3, vEdge2N); - - // And for the last time - to the right and up - - // coverage pixel 4 - vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]); - vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]); - vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]); - vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask4 = SIMD_T::and_si(vMask4, vEdge2N); - - // coverage pixel 5 - vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); - vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask5 = SIMD_T::and_si(vMask5, vEdge2N); - - // coverage pixel 6 - vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); - vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask6 = SIMD_T::and_si(vMask6, vEdge2N); - - // coverage pixel 7 - vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); - vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N); - vMask7 = SIMD_T::and_si(vMask7, vEdge2N); - - Integer<SIMD_T> vLit2 = SIMD_T::or_si(vMask0, vMask1); - vLit2 = SIMD_T::or_si(vLit2, vMask2); - vLit2 = SIMD_T::or_si(vLit2, vMask3); - vLit2 = SIMD_T::or_si(vLit2, vMask4); - vLit2 = SIMD_T::or_si(vLit2, vMask5); - vLit2 = SIMD_T::or_si(vLit2, vMask6); - vLit2 = SIMD_T::or_si(vLit2, vMask7); - - Integer<SIMD_T> vLit = SIMD_T::or_si(vLit1, vLit2); - -#else - // Generic algorithm sweeping in row by row order - Integer<SIMD_T> vRowMask[ER_SIMD_TILE_Y_DIM]; - - Integer<SIMD_T> vEdge0N = vEdge0; - Integer<SIMD_T> vEdge1N = vEdge1; - Integer<SIMD_T> vEdge2N = vEdge2; - - for (uint32_t row = 0; row < ER_SIMD_TILE_Y_DIM; row++) - { - // Store edge values at the beginning of the row - Integer<SIMD_T> vRowEdge0 = vEdge0N; - Integer<SIMD_T> 
vRowEdge1 = vEdge1N; - Integer<SIMD_T> vRowEdge2 = vEdge2N; - - Integer<SIMD_T> vColMask[ER_SIMD_TILE_X_DIM]; - - for (uint32_t col = 0; col < ER_SIMD_TILE_X_DIM; col++) - { - vColMask[col] = SIMD_T::and_si(vEdge0N, vEdge1N); - vColMask[col] = SIMD_T::and_si(vColMask[col], vEdge2N); - - vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]); - vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]); - vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]); - } - vRowMask[row] = vColMask[0]; - for (uint32_t col = 1; col < ER_SIMD_TILE_X_DIM; col++) - { - vRowMask[row] = SIMD_T::or_si(vRowMask[row], vColMask[col]); - } - // Restore values and go to the next row - vEdge0N = vRowEdge0; - vEdge1N = vRowEdge1; - vEdge2N = vRowEdge2; - - vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); - vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); - vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); - } - - // compress all masks - Integer<SIMD_T> vLit = vRowMask[0]; - for (uint32_t row = 1; row < ER_SIMD_TILE_Y_DIM; row++) - { - vLit = SIMD_T::or_si(vLit, vRowMask[row]); - } - -#endif - // Check which triangles has any pixel lit - uint32_t maskLit = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit)); - uint32_t maskUnlit = ~maskLit & oneTileMask; - - uint32_t oldTriMask = triMask; - triMask &= ~maskUnlit; - - if (triMask ^ oldTriMask) - { - RDTSC_EVENT(pDC->pContext->pBucketMgr, - FEEarlyRastExit, - _mm_popcnt_u32(triMask & oneTileMask), - 0); - } - return triMask; -} - -#endif // Early rasterizer - -////////////////////////////////////////////////////////////////////////// -/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping -/// culling, viewport transform, etc. -/// @param pDC - pointer to draw context. -/// @param pa - The primitive assembly object. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param tri - Contains triangle position data for SIMDs worth of triangles. -/// @param primID - Primitive ID for each triangle. 
-/// @param viewportIdx - viewport array index for each triangle. -/// @tparam CT - ConservativeRastFETraits -template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT> -void SIMDCALL BinTrianglesImpl(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - Vec4<SIMD_T> tri[3], - uint32_t triMask, - Integer<SIMD_T> const& primID, - Integer<SIMD_T> const& viewportIdx, - Integer<SIMD_T> const& rtIdx) -{ - const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx); - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEBinTriangles, pDC->drawId); - - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - const SWR_FRONTEND_STATE& feState = state.frontendState; - - MacroTileMgr* pTileMgr = pDC->pTileMgr; - - Float<SIMD_T> vRecipW0 = SIMD_T::set1_ps(1.0f); - Float<SIMD_T> vRecipW1 = SIMD_T::set1_ps(1.0f); - Float<SIMD_T> vRecipW2 = SIMD_T::set1_ps(1.0f); - - if (feState.vpTransformDisable) - { - // RHW is passed in directly when VP transform is disabled - vRecipW0 = tri[0].v[3]; - vRecipW1 = tri[1].v[3]; - vRecipW2 = tri[2].v[3]; - } - else - { - // Perspective divide - vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w); - vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w); - vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w); - - tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0); - tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1); - tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2); - - tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0); - tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1); - tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2); - - tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0); - tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1); - tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2); - - // Viewport transform to screen space coords - if (pa.viewportArrayActive) - { - viewportTransform<3>(tri, state.vpMatrices, viewportIdx); - } - else - { - viewportTransform<3>(tri, 
state.vpMatrices); - } - } - - // Adjust for pixel center location - Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation); - - tri[0].x = SIMD_T::add_ps(tri[0].x, offset); - tri[0].y = SIMD_T::add_ps(tri[0].y, offset); - - tri[1].x = SIMD_T::add_ps(tri[1].x, offset); - tri[1].y = SIMD_T::add_ps(tri[1].y, offset); - - tri[2].x = SIMD_T::add_ps(tri[2].x, offset); - tri[2].y = SIMD_T::add_ps(tri[2].y, offset); - - // Set vXi, vYi to required fixed point precision - Integer<SIMD_T> vXi[3], vYi[3]; - FPToFixedPoint<SIMD_T>(tri, vXi, vYi); - - // triangle setup - Integer<SIMD_T> vAi[3], vBi[3]; - triangleSetupABIntVertical(vXi, vYi, vAi, vBi); - - // determinant - Integer<SIMD_T> vDet[2]; - calcDeterminantIntVertical(vAi, vBi, vDet); - - // cull zero area - uint32_t maskLo = - SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si()))); - uint32_t maskHi = - SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si()))); - - uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2)); - - // don't cull degenerate triangles if we're conservatively rasterizing - uint32_t origTriMask = triMask; - if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value) - { - triMask &= ~cullZeroAreaMask; - } - - // determine front winding tris - // CW +det - // CCW det < 0; - // 0 area triangles are marked as backfacing regardless of winding order, - // which is required behavior for conservative rast and wireframe rendering - uint32_t frontWindingTris; - if (rastState.frontWinding == SWR_FRONTWINDING_CW) - { - maskLo = SIMD_T::movemask_pd( - SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si()))); - maskHi = SIMD_T::movemask_pd( - SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si()))); - } - else - { - maskLo = SIMD_T::movemask_pd( - SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0]))); - maskHi = SIMD_T::movemask_pd( - 
SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1]))); - } - frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2)); - - // cull - uint32_t cullTris; - switch ((SWR_CULLMODE)rastState.cullMode) - { - case SWR_CULLMODE_BOTH: - cullTris = 0xffffffff; - break; - case SWR_CULLMODE_NONE: - cullTris = 0x0; - break; - case SWR_CULLMODE_FRONT: - cullTris = frontWindingTris; - break; - // 0 area triangles are marked as backfacing, which is required behavior for conservative - // rast - case SWR_CULLMODE_BACK: - cullTris = ~frontWindingTris; - break; - default: - SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); - cullTris = 0x0; - break; - } - - triMask &= ~cullTris; - - if (origTriMask ^ triMask) - { - RDTSC_EVENT(pDC->pContext->pBucketMgr, - FECullZeroAreaAndBackface, - _mm_popcnt_u32(origTriMask ^ triMask), - 0); - } - - AR_EVENT(CullInfoEvent(pDC->drawId, cullZeroAreaMask, cullTris, origTriMask)); - - /// Note: these variable initializations must stay above any 'goto endBenTriangles' - // compute per tri backface - uint32_t frontFaceMask = frontWindingTris; - uint32_t* pPrimID = (uint32_t*)&primID; - const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx; - uint32_t triIndex = 0; - - uint32_t edgeEnable; - PFN_WORK_FUNC pfnWork; - if (CT::IsConservativeT::value) - { - // determine which edges of the degenerate tri, if any, are valid to rasterize. 
- // used to call the appropriate templated rasterizer function - if (cullZeroAreaMask > 0) - { - // e0 = v1-v0 - const Integer<SIMD_T> x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]); - const Integer<SIMD_T> y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]); - - uint32_t e0Mask = - SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask))); - - // e1 = v2-v1 - const Integer<SIMD_T> x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]); - const Integer<SIMD_T> y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]); - - uint32_t e1Mask = - SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask))); - - // e2 = v0-v2 - // if v0 == v1 & v1 == v2, v0 == v2 - uint32_t e2Mask = e0Mask & e1Mask; - SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512"); - - // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2 - // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001 - e0Mask = pdep_u32(e0Mask, 0x00249249); - - // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010 - e1Mask = pdep_u32(e1Mask, 0x00492492); - - // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100 - e2Mask = pdep_u32(e2Mask, 0x00924924); - - edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask))); - } - else - { - edgeEnable = 0x00FFFFFF; - } - } - else - { - // degenerate triangles won't be sent to rasterizer; just enable all edges - pfnWork = GetRasterizerFunc(rastState.sampleCount, - rastState.bIsCenterPattern, - (rastState.conservativeRast > 0), - (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, - EdgeValToEdgeState(ALL_EDGES_VALID), - (state.scissorsTileAligned == false)); - } - - SIMDBBOX_T<SIMD_T> bbox; - - if (!triMask) - { - goto endBinTriangles; - } - - // Calc bounding box of triangles - calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox); - - // determine if triangle falls between pixel centers and discard - // only discard for non-MSAA case and when conservative rast is disabled - // (xmin + 127) & ~255 - // (xmax + 128) & ~255 - if 
((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) && - (!CT::IsConservativeT::value)) - { - origTriMask = triMask; - - int cullCenterMask; - - { - Integer<SIMD_T> xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127)); - xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255)); - Integer<SIMD_T> xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128)); - xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255)); - - Integer<SIMD_T> vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax); - - Integer<SIMD_T> ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127)); - ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255)); - Integer<SIMD_T> ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128)); - ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255)); - - Integer<SIMD_T> vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax); - - vMaskV = SIMD_T::or_si(vMaskH, vMaskV); - cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV)); - } - - triMask &= ~cullCenterMask; - - if (origTriMask ^ triMask) - { - RDTSC_EVENT(pDC->pContext->pBucketMgr, - FECullBetweenCenters, - _mm_popcnt_u32(origTriMask ^ triMask), - 0); - } - } - - // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is - // exclusive. Gather the AOS effective scissor rects based on the per-prim VP index. - /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. - { - Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax; - if (pa.viewportArrayActive) - - { - GatherScissors(&state.scissorsInFixedPoint[0], - pViewportIndex, - scisXmin, - scisYmin, - scisXmax, - scisYmax); - } - else // broadcast fast path for non-VPAI case. 
- { - scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax); - } - - // Make triangle bbox inclusive - bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)); - bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)); - - bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin); - bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin); - bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax); - bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax); - } - - if (CT::IsConservativeT::value) - { - // in the case where a degenerate triangle is on a scissor edge, we need to make sure the - // primitive bbox has some area. Bump the xmax/ymax edges out - - Integer<SIMD_T> topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax); - bbox.ymax = SIMD_T::blendv_epi32( - bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom); - - Integer<SIMD_T> leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax); - bbox.xmax = SIMD_T::blendv_epi32( - bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight); - } - - // Cull tris completely outside scissor - { - Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); - Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); - Integer<SIMD_T> maskOutsideScissorXY = - SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); - uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY)); - triMask = triMask & ~maskOutsideScissor; - } - -#if KNOB_ENABLE_EARLY_RAST - if (rastState.sampleCount == SWR_MULTISAMPLE_1X && !CT::IsConservativeT::value) - { - // Try early rasterization - culling small triangles which do not cover any pixels - - // convert to ER tiles - SIMDBBOX_T<SIMD_T> er_bbox; - - 
er_bbox.xmin = - SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmin); - er_bbox.xmax = - SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmax); - er_bbox.ymin = - SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin); - er_bbox.ymax = - SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax); - - Integer<SIMD_T> vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax); - Integer<SIMD_T> vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax); - - // Take only triangles that fit into ER tile - uint32_t oneTileMask = - triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY))); - - if (oneTileMask) - { - // determine CW tris (det > 0) - uint32_t maskCwLo = SIMD_T::movemask_pd( - SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si()))); - uint32_t maskCwHi = SIMD_T::movemask_pd( - SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si()))); - uint32_t cwTrisMask = maskCwLo | (maskCwHi << (SIMD_WIDTH / 2)); - - // Try early rasterization - triMask = EarlyRasterizer<SIMD_T, SIMD_WIDTH, CT>( - pDC, er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask); - - if (!triMask) - { - RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1); - return; - } - } - } -#endif - -endBinTriangles: - - - if (!triMask) - { - RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1); - return; - } - - // Send surviving triangles to the line or point binner based on fill mode - if (rastState.fillMode == SWR_FILLMODE_WIREFRAME) - { - // Simple non-conformant wireframe mode, useful for debugging - // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD - Vec4<SIMD_T> line[2]; - Float<SIMD_T> recipW[2]; - - line[0] = tri[0]; - line[1] = tri[1]; - recipW[0] = vRecipW0; - recipW[1] = vRecipW1; - - BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>( - pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, 
rtIdx); - - line[0] = tri[1]; - line[1] = tri[2]; - recipW[0] = vRecipW1; - recipW[1] = vRecipW2; - - BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>( - pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx); - - line[0] = tri[2]; - line[1] = tri[0]; - recipW[0] = vRecipW2; - recipW[1] = vRecipW0; - - BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>( - pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx); - - RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1); - return; - } - else if (rastState.fillMode == SWR_FILLMODE_POINT) - { - // Bin 3 points - BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>( - pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx, rtIdx); - BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>( - pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx); - BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>( - pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx); - - RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1); - return; - } - - // Convert triangle bbox to macrotile units. 
- bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin); - bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin); - bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax); - bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax); - - OSALIGNSIMD16(uint32_t) - aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; - - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax); - - // transpose verts needed for backend - /// @todo modify BE to take non-transformed verts - OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH]; - OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH]; - OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH]; - OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH]; - - TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x); - TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y); - TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z); - TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2); - - // scan remaining valid triangles and bin each separately - while (_BitScanForward((unsigned long*)&triIndex, triMask)) - { - uint32_t linkageCount = state.backendState.numAttributes; - uint32_t numScalarAttribs = linkageCount * 4; - - BE_WORK work; - work.type = DRAW; - - bool isDegenerate; - if (CT::IsConservativeT::value) - { - // only rasterize valid edges if we have a degenerate primitive - int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID; - work.pfnWork = - GetRasterizerFunc(rastState.sampleCount, - rastState.bIsCenterPattern, - (rastState.conservativeRast > 0), - (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, - 
EdgeValToEdgeState(triEdgeEnable), - (state.scissorsTileAligned == false)); - - // Degenerate triangles are required to be constant interpolated - isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false; - } - else - { - isDegenerate = false; - work.pfnWork = pfnWork; - } - - // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = - GetProcessAttributesFunc(3, - state.backendState.swizzleEnable, - state.backendState.constantInterpolationMask, - isDegenerate); - - TRIANGLE_WORK_DESC& desc = work.desc.tri; - - desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1); - desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex]; - desc.triFlags.viewportIndex = pViewportIndex[triIndex]; - - auto pArena = pDC->pArena; - SWR_ASSERT(pArena != nullptr); - - // store active attribs - float* pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); - desc.pAttribs = pAttribs; - desc.numAttribs = linkageCount; - pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs); - - // store triangle vertex data - desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); - - SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]); - SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]); - SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]); - SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]); - - // store user clip distances - if (state.backendState.clipDistanceMask) - { - uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask); - desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); - ProcessUserClipDist<3>( - state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer); - } - - for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y) - { - for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x) - { -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_SETUP_TRIS) -#endif - { - 
pTileMgr->enqueue(x, y, &work); - } - } - } - - triMask &= ~(1 << triIndex); - } - - RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1); -} - -template <typename CT> -void BinTriangles(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector tri[3], - uint32_t triMask, - simdscalari const& primID, - simdscalari const& viewportIdx, - simdscalari const& rtIdx) -{ - BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>( - pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx); -} - -#if USE_SIMD16_FRONTEND -template <typename CT> -void SIMDCALL BinTriangles_simd16(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector tri[3], - uint32_t triMask, - simd16scalari const& primID, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx) -{ - BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>( - pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx); -} - -#endif -struct FEBinTrianglesChooser -{ - typedef PFN_PROCESS_PRIMS FuncType; - - template <typename... ArgsB> - static FuncType GetFunc() - { - return BinTriangles<ConservativeRastFETraits<ArgsB...>>; - } -}; - -// Selector for correct templated BinTrinagles function -PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative) -{ - return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative); -} - -#if USE_SIMD16_FRONTEND -struct FEBinTrianglesChooser_simd16 -{ - typedef PFN_PROCESS_PRIMS_SIMD16 FuncType; - - template <typename... 
ArgsB> - static FuncType GetFunc() - { - return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>; - } -}; - -// Selector for correct templated BinTrinagles function -PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative) -{ - return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative); -} - -#endif - -template <typename SIMD_T, uint32_t SIMD_WIDTH> -void BinPostSetupPointsImpl(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - Vec4<SIMD_T> prim[], - uint32_t primMask, - Integer<SIMD_T> const& primID, - Integer<SIMD_T> const& viewportIdx, - Integer<SIMD_T> const& rtIdx) -{ - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEBinPoints, pDC->drawId); - - Vec4<SIMD_T>& primVerts = prim[0]; - - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx; - - // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc( - 1, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); - - // convert to fixed point - Integer<SIMD_T> vXi, vYi; - - vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x); - vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y); - - if (CanUseSimplePoints(pDC)) - { - // adjust for ymin-xmin rule - vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1)); - vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1)); - - // cull points off the ymin-xmin edge of the viewport - primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi)); - primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi)); - - // compute macro tile coordinates - Integer<SIMD_T> macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi); - Integer<SIMD_T> macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi); - - OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH]; - - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMacroX), macroX); - 
SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMacroY), macroY); - - // compute raster tile coordinates - Integer<SIMD_T> rasterX = - SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi); - Integer<SIMD_T> rasterY = - SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi); - - // compute raster tile relative x,y for coverage mask - Integer<SIMD_T> tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX); - Integer<SIMD_T> tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY); - - Integer<SIMD_T> tileRelativeX = - SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX); - Integer<SIMD_T> tileRelativeY = - SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY); - - OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH]; - OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH]; - - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileRelativeX), tileRelativeX); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileRelativeY), tileRelativeY); - - OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH]; - OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH]; - - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileAlignedX), tileAlignedX); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileAlignedY), tileAlignedY); - - OSALIGNSIMD16(float) aZ[SIMD_WIDTH]; - SIMD_T::store_ps(reinterpret_cast<float*>(aZ), primVerts.z); - - // store render target array index - const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx); - - uint32_t* pPrimID = (uint32_t*)&primID; - uint32_t primIndex = 0; - - const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; - - // scan remaining valid triangles and bin each separately - while (_BitScanForward((unsigned long*)&primIndex, primMask)) - { - uint32_t linkageCount = backendState.numAttributes; - uint32_t numScalarAttribs = linkageCount * 4; - - BE_WORK work; - 
work.type = DRAW; - - TRIANGLE_WORK_DESC& desc = work.desc.tri; - - // points are always front facing - desc.triFlags.frontFacing = 1; - desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; - desc.triFlags.viewportIndex = pViewportIndex[primIndex]; - - work.pfnWork = RasterizeSimplePoint; - - auto pArena = pDC->pArena; - SWR_ASSERT(pArena != nullptr); - - // store attributes - float* pAttribs = - (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16); - desc.pAttribs = pAttribs; - desc.numAttribs = linkageCount; - - pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs); - - // store raster tile aligned x, y, perspective correct z - float* pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); - desc.pTriBuffer = pTriBuffer; - *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex]; - *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex]; - *pTriBuffer = aZ[primIndex]; - - uint32_t tX = aTileRelativeX[primIndex]; - uint32_t tY = aTileRelativeY[primIndex]; - - // pack the relative x,y into the coverageMask, the rasterizer will - // generate the true coverage mask from it - work.desc.tri.triFlags.coverageMask = tX | (tY << 4); - - // bin it - MacroTileMgr* pTileMgr = pDC->pTileMgr; -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_SETUP_TRIS) -#endif - { - pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work); - } - - primMask &= ~(1 << primIndex); - } - } - else - { - // non simple points need to be potentially binned to multiple macro tiles - Float<SIMD_T> vPointSize; - - if (rastState.pointParam) - { - Vec4<SIMD_T> size[3]; - pa.Assemble(VERTEX_SGV_SLOT, size); - vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP]; - } - else - { - vPointSize = SIMD_T::set1_ps(rastState.pointSize); - } - - // bloat point to bbox - SIMDBBOX_T<SIMD_T> bbox; - - bbox.xmin = bbox.xmax = vXi; - bbox.ymin = bbox.ymax = vYi; - - Float<SIMD_T> vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f)); - Integer<SIMD_T> vHalfWidthi = 
fpToFixedPointVertical<SIMD_T>(vHalfWidth); - - bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi); - bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi); - bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi); - bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi); - - // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge - // is exclusive. Gather the AOS effective scissor rects based on the per-prim VP index. - /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. - { - Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax; - - if (pa.viewportArrayActive) - { - GatherScissors(&state.scissorsInFixedPoint[0], - pViewportIndex, - scisXmin, - scisYmin, - scisXmax, - scisYmax); - } - else // broadcast fast path for non-VPAI case. - { - scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax); - } - - bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin); - bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin); - bbox.xmax = - SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax); - bbox.ymax = - SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax); - } - - // Cull bloated points completely outside scissor - Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); - Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); - Integer<SIMD_T> maskOutsideScissorXY = - SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); - uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY)); - primMask = primMask & ~maskOutsideScissor; - - // Convert bbox to macrotile units. 
- bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin); - bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin); - bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax); - bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax); - - OSALIGNSIMD16(uint32_t) - aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; - - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax); - - // store render target array index - const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx); - - OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH]; - SIMD_T::store_ps(reinterpret_cast<float*>(aPointSize), vPointSize); - - uint32_t* pPrimID = (uint32_t*)&primID; - - OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH]; - OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH]; - OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH]; - - SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsX), primVerts.x); - SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsY), primVerts.y); - SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsZ), primVerts.z); - - // scan remaining valid prims and bin each separately - const SWR_BACKEND_STATE& backendState = state.backendState; - uint32_t primIndex; - while (_BitScanForward((unsigned long*)&primIndex, primMask)) - { - uint32_t linkageCount = backendState.numAttributes; - uint32_t numScalarAttribs = linkageCount * 4; - - BE_WORK work; - work.type = DRAW; - - TRIANGLE_WORK_DESC& desc = work.desc.tri; - - desc.triFlags.frontFacing = 1; - desc.triFlags.pointSize = aPointSize[primIndex]; - desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; - desc.triFlags.viewportIndex = 
pViewportIndex[primIndex]; - - work.pfnWork = RasterizeTriPoint; - - auto pArena = pDC->pArena; - SWR_ASSERT(pArena != nullptr); - - // store active attribs - desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); - desc.numAttribs = linkageCount; - pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); - - // store point vertex data - float* pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); - desc.pTriBuffer = pTriBuffer; - *pTriBuffer++ = aPrimVertsX[primIndex]; - *pTriBuffer++ = aPrimVertsY[primIndex]; - *pTriBuffer = aPrimVertsZ[primIndex]; - - // store user clip distances - if (backendState.clipDistanceMask) - { - uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask); - desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); - float dists[8]; - float one = 1.0f; - ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists); - for (uint32_t i = 0; i < numClipDist; i++) - { - desc.pUserClipBuffer[3 * i + 0] = 0.0f; - desc.pUserClipBuffer[3 * i + 1] = 0.0f; - desc.pUserClipBuffer[3 * i + 2] = dists[i]; - } - } - - MacroTileMgr* pTileMgr = pDC->pTileMgr; - for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) - { - for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) - { -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_SETUP_TRIS) -#endif - { - pTileMgr->enqueue(x, y, &work); - } - } - } - - primMask &= ~(1 << primIndex); - } - } - - RDTSC_END(pDC->pContext->pBucketMgr, FEBinPoints, 1); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Bin SIMD points to the backend. Only supports point size of 1 -/// @param pDC - pointer to draw context. -/// @param pa - The primitive assembly object. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param tri - Contains point position data for SIMDs worth of points. -/// @param primID - Primitive ID for each point. 
-template <typename SIMD_T, uint32_t SIMD_WIDTH> -void BinPointsImpl(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - Vec4<SIMD_T> prim[3], - uint32_t primMask, - Integer<SIMD_T> const& primID, - Integer<SIMD_T> const& viewportIdx, - Integer<SIMD_T> const& rtIdx) -{ - const API_STATE& state = GetApiState(pDC); - const SWR_FRONTEND_STATE& feState = state.frontendState; - const SWR_RASTSTATE& rastState = state.rastState; - - if (!feState.vpTransformDisable) - { - // perspective divide - Float<SIMD_T> vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w); - - prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0); - prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0); - prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0); - - // viewport transform to screen coords - if (pa.viewportArrayActive) - { - viewportTransform<1>(prim, state.vpMatrices, viewportIdx); - } - else - { - viewportTransform<1>(prim, state.vpMatrices); - } - } - - Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation); - - prim[0].x = SIMD_T::add_ps(prim[0].x, offset); - prim[0].y = SIMD_T::add_ps(prim[0].y, offset); - - BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>( - pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx); -} - -void BinPoints(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prim[3], - uint32_t primMask, - simdscalari const& primID, - simdscalari const& viewportIdx, - simdscalari const& rtIdx) -{ - BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>( - pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx); -} - -#if USE_SIMD16_FRONTEND -void SIMDCALL BinPoints_simd16(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector prim[3], - uint32_t primMask, - simd16scalari const& primID, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx) -{ - BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>( - pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx); -} - -#endif 
-////////////////////////////////////////////////////////////////////////// -/// @brief Bin SIMD lines to the backend. -/// @param pDC - pointer to draw context. -/// @param pa - The primitive assembly object. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param tri - Contains line position data for SIMDs worth of points. -/// @param primID - Primitive ID for each line. -/// @param viewportIdx - Viewport Array Index for each line. -template <typename SIMD_T, uint32_t SIMD_WIDTH> -void BinPostSetupLinesImpl(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - Vec4<SIMD_T> prim[], - Float<SIMD_T> recipW[], - uint32_t primMask, - Integer<SIMD_T> const& primID, - Integer<SIMD_T> const& viewportIdx, - Integer<SIMD_T> const& rtIdx) -{ - const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx); - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEBinLines, pDC->drawId); - - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - - // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc( - 2, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); - - Float<SIMD_T>& vRecipW0 = recipW[0]; - Float<SIMD_T>& vRecipW1 = recipW[1]; - - // convert to fixed point - Integer<SIMD_T> vXi[2], vYi[2]; - - vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x); - vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y); - vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x); - vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y); - - // compute x-major vs y-major mask - Integer<SIMD_T> xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1])); - Integer<SIMD_T> yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1])); - Float<SIMD_T> vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength)); - uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask); - - // cull zero-length lines - Integer<SIMD_T> vZeroLengthMask = 
SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si()); - vZeroLengthMask = - SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si())); - - primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask)); - - uint32_t* pPrimID = (uint32_t*)&primID; - const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx; - - // Calc bounding box of lines - SIMDBBOX_T<SIMD_T> bbox; - bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]); - bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]); - bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]); - bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]); - - // bloat bbox by line width along minor axis - Float<SIMD_T> vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f); - Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth); - - SIMDBBOX_T<SIMD_T> bloatBox; - - bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi); - bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi); - bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi); - bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi); - - bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask); - bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask); - bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask); - bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); - - // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is - // exclusive. - { - Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax; - - if (pa.viewportArrayActive) - { - GatherScissors(&state.scissorsInFixedPoint[0], - pViewportIndex, - scisXmin, - scisYmin, - scisXmax, - scisYmax); - } - else // broadcast fast path for non-VPAI case. 
- { - scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax); - } - - bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin); - bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin); - bbox.xmax = - SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax); - bbox.ymax = - SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax); - } - - // Cull prims completely outside scissor - { - Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); - Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); - Integer<SIMD_T> maskOutsideScissorXY = - SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); - uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY)); - primMask = primMask & ~maskOutsideScissor; - } - - // transpose verts needed for backend - /// @todo modify BE to take non-transformed verts - OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH]; - OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH]; - OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH]; - OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH]; - - if (!primMask) - { - goto endBinLines; - } - - // Convert triangle bbox to macrotile units. 
- bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin); - bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin); - bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax); - bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax); - - OSALIGNSIMD16(uint32_t) - aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; - - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin); - SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax); - - TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps()); - TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps()); - TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps()); - TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps()); - - // scan remaining valid prims and bin each separately - unsigned long primIndex; - while (_BitScanForward(&primIndex, primMask)) - { - uint32_t linkageCount = state.backendState.numAttributes; - uint32_t numScalarAttribs = linkageCount * 4; - - BE_WORK work; - work.type = DRAW; - - TRIANGLE_WORK_DESC& desc = work.desc.tri; - - desc.triFlags.frontFacing = 1; - desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1; - desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; - desc.triFlags.viewportIndex = pViewportIndex[primIndex]; - - work.pfnWork = RasterizeLine; - - auto pArena = pDC->pArena; - SWR_ASSERT(pArena != nullptr); - - // store active attribs - desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); - desc.numAttribs = linkageCount; - pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); - - // store line vertex data - desc.pTriBuffer = 
(float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); - - _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); - _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); - _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); - _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); - - // store user clip distances - if (state.backendState.clipDistanceMask) - { - uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask); - desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); - ProcessUserClipDist<2>( - state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer); - } - - MacroTileMgr* pTileMgr = pDC->pTileMgr; - for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) - { - for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) - { -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_SETUP_TRIS) -#endif - { - pTileMgr->enqueue(x, y, &work); - } - } - } - - primMask &= ~(1 << primIndex); - } - -endBinLines: - - RDTSC_END(pDC->pContext->pBucketMgr, FEBinLines, 1); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Bin SIMD lines to the backend. -/// @param pDC - pointer to draw context. -/// @param pa - The primitive assembly object. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param tri - Contains line position data for SIMDs worth of points. -/// @param primID - Primitive ID for each line. -/// @param viewportIdx - Viewport Array Index for each line. 
-template <typename SIMD_T, uint32_t SIMD_WIDTH> -void SIMDCALL BinLinesImpl(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - Vec4<SIMD_T> prim[3], - uint32_t primMask, - Integer<SIMD_T> const& primID, - Integer<SIMD_T> const& viewportIdx, - Integer<SIMD_T> const& rtIdx) -{ - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - const SWR_FRONTEND_STATE& feState = state.frontendState; - - Float<SIMD_T> vRecipW[2] = {SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f)}; - - if (!feState.vpTransformDisable) - { - // perspective divide - vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w); - vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w); - - prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]); - prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]); - - prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]); - prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]); - - prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]); - prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]); - - // viewport transform to screen coords - if (pa.viewportArrayActive) - { - viewportTransform<2>(prim, state.vpMatrices, viewportIdx); - } - else - { - viewportTransform<2>(prim, state.vpMatrices); - } - } - - // adjust for pixel center location - Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation); - - prim[0].x = SIMD_T::add_ps(prim[0].x, offset); - prim[0].y = SIMD_T::add_ps(prim[0].y, offset); - - prim[1].x = SIMD_T::add_ps(prim[1].x, offset); - prim[1].y = SIMD_T::add_ps(prim[1].y, offset); - - BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>( - pDC, pa, workerId, prim, vRecipW, primMask, primID, viewportIdx, rtIdx); -} - -void BinLines(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prim[], - uint32_t primMask, - simdscalari const& primID, - simdscalari const& viewportIdx, - simdscalari const& rtIdx) -{ - BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>( - pDC, pa, 
workerId, prim, primMask, primID, viewportIdx, rtIdx); -} - -#if USE_SIMD16_FRONTEND -void SIMDCALL BinLines_simd16(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector prim[3], - uint32_t primMask, - simd16scalari const& primID, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx) -{ - BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>( - pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx); -} - -#endif diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.h b/src/gallium/drivers/swr/rasterizer/core/binner.h deleted file mode 100644 index 63be8f67cbf..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/binner.h +++ /dev/null @@ -1,254 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file binner.h - * - * @brief Declaration for the macrotile binner - * - ******************************************************************************/ -#include "state.h" -#include "conservativeRast.h" -#include "utils.h" -////////////////////////////////////////////////////////////////////////// -/// @brief Offsets added to post-viewport vertex positions based on -/// raster state. -/// -/// Can't use templated variable because we must stick with C++11 features. -/// Template variables were introduced with C++14 -template <typename SIMD_T> -struct SwrPixelOffsets -{ -public: - INLINE static Float<SIMD_T> GetOffset(uint32_t loc) - { - SWR_ASSERT(loc <= 1); - - return SIMD_T::set1_ps(loc ? 0.5f : 0.0f); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Convert the X,Y coords of a triangle to the requested Fixed -/// Point precision from FP32. -template <typename SIMD_T, typename PT = FixedPointTraits<Fixed_16_8>> -INLINE Integer<SIMD_T> fpToFixedPointVertical(const Float<SIMD_T>& vIn) -{ - return SIMD_T::cvtps_epi32(SIMD_T::mul_ps(vIn, SIMD_T::set1_ps(PT::ScaleT::value))); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Helper function to set the X,Y coords of a triangle to the -/// requested Fixed Point precision from FP32. 
-/// @param tri: simdvector[3] of FP triangle verts -/// @param vXi: fixed point X coords of tri verts -/// @param vYi: fixed point Y coords of tri verts -template <typename SIMD_T> -INLINE static void -FPToFixedPoint(const Vec4<SIMD_T>* const tri, Integer<SIMD_T> (&vXi)[3], Integer<SIMD_T> (&vYi)[3]) -{ - vXi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].x); - vYi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].y); - vXi[1] = fpToFixedPointVertical<SIMD_T>(tri[1].x); - vYi[1] = fpToFixedPointVertical<SIMD_T>(tri[1].y); - vXi[2] = fpToFixedPointVertical<SIMD_T>(tri[2].x); - vYi[2] = fpToFixedPointVertical<SIMD_T>(tri[2].y); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Calculate bounding box for current triangle -/// @tparam CT: ConservativeRastFETraits type -/// @param vX: fixed point X position for triangle verts -/// @param vY: fixed point Y position for triangle verts -/// @param bbox: fixed point bbox -/// *Note*: expects vX, vY to be in the correct precision for the type -/// of rasterization. This avoids unnecessary FP->fixed conversions. 
-template <typename SIMD_T, typename CT> -INLINE void calcBoundingBoxIntVertical(const Integer<SIMD_T> (&vX)[3], - const Integer<SIMD_T> (&vY)[3], - SIMDBBOX_T<SIMD_T>& bbox) -{ - Integer<SIMD_T> vMinX = vX[0]; - - vMinX = SIMD_T::min_epi32(vMinX, vX[1]); - vMinX = SIMD_T::min_epi32(vMinX, vX[2]); - - Integer<SIMD_T> vMaxX = vX[0]; - - vMaxX = SIMD_T::max_epi32(vMaxX, vX[1]); - vMaxX = SIMD_T::max_epi32(vMaxX, vX[2]); - - Integer<SIMD_T> vMinY = vY[0]; - - vMinY = SIMD_T::min_epi32(vMinY, vY[1]); - vMinY = SIMD_T::min_epi32(vMinY, vY[2]); - - Integer<SIMD_T> vMaxY = vY[0]; - - vMaxY = SIMD_T::max_epi32(vMaxY, vY[1]); - vMaxY = SIMD_T::max_epi32(vMaxY, vY[2]); - - if (CT::BoundingBoxOffsetT::value != 0) - { - /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative - /// rasterization expand bbox by 1/256; coverage will be correctly handled in the - /// rasterizer. - - const Integer<SIMD_T> value = SIMD_T::set1_epi32(CT::BoundingBoxOffsetT::value); - - vMinX = SIMD_T::sub_epi32(vMinX, value); - vMaxX = SIMD_T::add_epi32(vMaxX, value); - vMinY = SIMD_T::sub_epi32(vMinY, value); - vMaxY = SIMD_T::add_epi32(vMaxY, value); - } - - bbox.xmin = vMinX; - bbox.xmax = vMaxX; - bbox.ymin = vMinY; - bbox.ymax = vMaxY; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Gather scissor rect data based on per-prim viewport indices. -/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point. -/// @param pViewportIndex - array of per-primitive viewport indexes. -/// @param scisXmin - output vector of per-primitive scissor rect Xmin data. -/// @param scisYmin - output vector of per-primitive scissor rect Ymin data. -/// @param scisXmax - output vector of per-primitive scissor rect Xmax data. -/// @param scisYmax - output vector of per-primitive scissor rect Ymax data. -// -/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. 
-static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint, - const uint32_t* pViewportIndex, - simdscalari& scisXmin, - simdscalari& scisYmin, - simdscalari& scisXmax, - simdscalari& scisYmax) -{ - scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmin, - pScissorsInFixedPoint[pViewportIndex[6]].xmin, - pScissorsInFixedPoint[pViewportIndex[5]].xmin, - pScissorsInFixedPoint[pViewportIndex[4]].xmin, - pScissorsInFixedPoint[pViewportIndex[3]].xmin, - pScissorsInFixedPoint[pViewportIndex[2]].xmin, - pScissorsInFixedPoint[pViewportIndex[1]].xmin, - pScissorsInFixedPoint[pViewportIndex[0]].xmin); - scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymin, - pScissorsInFixedPoint[pViewportIndex[6]].ymin, - pScissorsInFixedPoint[pViewportIndex[5]].ymin, - pScissorsInFixedPoint[pViewportIndex[4]].ymin, - pScissorsInFixedPoint[pViewportIndex[3]].ymin, - pScissorsInFixedPoint[pViewportIndex[2]].ymin, - pScissorsInFixedPoint[pViewportIndex[1]].ymin, - pScissorsInFixedPoint[pViewportIndex[0]].ymin); - scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmax, - pScissorsInFixedPoint[pViewportIndex[6]].xmax, - pScissorsInFixedPoint[pViewportIndex[5]].xmax, - pScissorsInFixedPoint[pViewportIndex[4]].xmax, - pScissorsInFixedPoint[pViewportIndex[3]].xmax, - pScissorsInFixedPoint[pViewportIndex[2]].xmax, - pScissorsInFixedPoint[pViewportIndex[1]].xmax, - pScissorsInFixedPoint[pViewportIndex[0]].xmax); - scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymax, - pScissorsInFixedPoint[pViewportIndex[6]].ymax, - pScissorsInFixedPoint[pViewportIndex[5]].ymax, - pScissorsInFixedPoint[pViewportIndex[4]].ymax, - pScissorsInFixedPoint[pViewportIndex[3]].ymax, - pScissorsInFixedPoint[pViewportIndex[2]].ymax, - pScissorsInFixedPoint[pViewportIndex[1]].ymax, - pScissorsInFixedPoint[pViewportIndex[0]].ymax); -} - -static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint, - const uint32_t* pViewportIndex, - 
simd16scalari& scisXmin, - simd16scalari& scisYmin, - simd16scalari& scisXmax, - simd16scalari& scisYmax) -{ - scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmin, - pScissorsInFixedPoint[pViewportIndex[14]].xmin, - pScissorsInFixedPoint[pViewportIndex[13]].xmin, - pScissorsInFixedPoint[pViewportIndex[12]].xmin, - pScissorsInFixedPoint[pViewportIndex[11]].xmin, - pScissorsInFixedPoint[pViewportIndex[10]].xmin, - pScissorsInFixedPoint[pViewportIndex[9]].xmin, - pScissorsInFixedPoint[pViewportIndex[8]].xmin, - pScissorsInFixedPoint[pViewportIndex[7]].xmin, - pScissorsInFixedPoint[pViewportIndex[6]].xmin, - pScissorsInFixedPoint[pViewportIndex[5]].xmin, - pScissorsInFixedPoint[pViewportIndex[4]].xmin, - pScissorsInFixedPoint[pViewportIndex[3]].xmin, - pScissorsInFixedPoint[pViewportIndex[2]].xmin, - pScissorsInFixedPoint[pViewportIndex[1]].xmin, - pScissorsInFixedPoint[pViewportIndex[0]].xmin); - - scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymin, - pScissorsInFixedPoint[pViewportIndex[14]].ymin, - pScissorsInFixedPoint[pViewportIndex[13]].ymin, - pScissorsInFixedPoint[pViewportIndex[12]].ymin, - pScissorsInFixedPoint[pViewportIndex[11]].ymin, - pScissorsInFixedPoint[pViewportIndex[10]].ymin, - pScissorsInFixedPoint[pViewportIndex[9]].ymin, - pScissorsInFixedPoint[pViewportIndex[8]].ymin, - pScissorsInFixedPoint[pViewportIndex[7]].ymin, - pScissorsInFixedPoint[pViewportIndex[6]].ymin, - pScissorsInFixedPoint[pViewportIndex[5]].ymin, - pScissorsInFixedPoint[pViewportIndex[4]].ymin, - pScissorsInFixedPoint[pViewportIndex[3]].ymin, - pScissorsInFixedPoint[pViewportIndex[2]].ymin, - pScissorsInFixedPoint[pViewportIndex[1]].ymin, - pScissorsInFixedPoint[pViewportIndex[0]].ymin); - - scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmax, - pScissorsInFixedPoint[pViewportIndex[14]].xmax, - pScissorsInFixedPoint[pViewportIndex[13]].xmax, - pScissorsInFixedPoint[pViewportIndex[12]].xmax, - 
pScissorsInFixedPoint[pViewportIndex[11]].xmax, - pScissorsInFixedPoint[pViewportIndex[10]].xmax, - pScissorsInFixedPoint[pViewportIndex[9]].xmax, - pScissorsInFixedPoint[pViewportIndex[8]].xmax, - pScissorsInFixedPoint[pViewportIndex[7]].xmax, - pScissorsInFixedPoint[pViewportIndex[6]].xmax, - pScissorsInFixedPoint[pViewportIndex[5]].xmax, - pScissorsInFixedPoint[pViewportIndex[4]].xmax, - pScissorsInFixedPoint[pViewportIndex[3]].xmax, - pScissorsInFixedPoint[pViewportIndex[2]].xmax, - pScissorsInFixedPoint[pViewportIndex[1]].xmax, - pScissorsInFixedPoint[pViewportIndex[0]].xmax); - - scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymax, - pScissorsInFixedPoint[pViewportIndex[14]].ymax, - pScissorsInFixedPoint[pViewportIndex[13]].ymax, - pScissorsInFixedPoint[pViewportIndex[12]].ymax, - pScissorsInFixedPoint[pViewportIndex[11]].ymax, - pScissorsInFixedPoint[pViewportIndex[10]].ymax, - pScissorsInFixedPoint[pViewportIndex[9]].ymax, - pScissorsInFixedPoint[pViewportIndex[8]].ymax, - pScissorsInFixedPoint[pViewportIndex[7]].ymax, - pScissorsInFixedPoint[pViewportIndex[6]].ymax, - pScissorsInFixedPoint[pViewportIndex[5]].ymax, - pScissorsInFixedPoint[pViewportIndex[4]].ymax, - pScissorsInFixedPoint[pViewportIndex[3]].ymax, - pScissorsInFixedPoint[pViewportIndex[2]].ymax, - pScissorsInFixedPoint[pViewportIndex[1]].ymax, - pScissorsInFixedPoint[pViewportIndex[0]].ymax); -}
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/core/blend.h b/src/gallium/drivers/swr/rasterizer/core/blend.h deleted file mode 100644 index 7b2f77985f8..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/blend.h +++ /dev/null @@ -1,348 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file blend.cpp - * - * @brief Implementation for blending operations. 
- * - ******************************************************************************/ -#include "state.h" - -template <bool Color, bool Alpha> -INLINE void GenerateBlendFactor(SWR_BLEND_FACTOR func, - simdvector& constantColor, - simdvector& src, - simdvector& src1, - simdvector& dst, - simdvector& out) -{ - simdvector result; - - switch (func) - { - case BLENDFACTOR_ZERO: - result.x = _simd_setzero_ps(); - result.y = _simd_setzero_ps(); - result.z = _simd_setzero_ps(); - result.w = _simd_setzero_ps(); - break; - - case BLENDFACTOR_ONE: - result.x = _simd_set1_ps(1.0); - result.y = _simd_set1_ps(1.0); - result.z = _simd_set1_ps(1.0); - result.w = _simd_set1_ps(1.0); - break; - - case BLENDFACTOR_SRC_COLOR: - result = src; - break; - - case BLENDFACTOR_DST_COLOR: - result = dst; - break; - - case BLENDFACTOR_INV_SRC_COLOR: - result.x = _simd_sub_ps(_simd_set1_ps(1.0), src.x); - result.y = _simd_sub_ps(_simd_set1_ps(1.0), src.y); - result.z = _simd_sub_ps(_simd_set1_ps(1.0), src.z); - result.w = _simd_sub_ps(_simd_set1_ps(1.0), src.w); - break; - - case BLENDFACTOR_INV_DST_COLOR: - result.x = _simd_sub_ps(_simd_set1_ps(1.0), dst.x); - result.y = _simd_sub_ps(_simd_set1_ps(1.0), dst.y); - result.z = _simd_sub_ps(_simd_set1_ps(1.0), dst.z); - result.w = _simd_sub_ps(_simd_set1_ps(1.0), dst.w); - break; - - case BLENDFACTOR_SRC_ALPHA: - result.x = src.w; - result.y = src.w; - result.z = src.w; - result.w = src.w; - break; - - case BLENDFACTOR_INV_SRC_ALPHA: - { - simdscalar oneMinusSrcA = _simd_sub_ps(_simd_set1_ps(1.0), src.w); - result.x = oneMinusSrcA; - result.y = oneMinusSrcA; - result.z = oneMinusSrcA; - result.w = oneMinusSrcA; - break; - } - - case BLENDFACTOR_DST_ALPHA: - result.x = dst.w; - result.y = dst.w; - result.z = dst.w; - result.w = dst.w; - break; - - case BLENDFACTOR_INV_DST_ALPHA: - { - simdscalar oneMinusDstA = _simd_sub_ps(_simd_set1_ps(1.0), dst.w); - result.x = oneMinusDstA; - result.y = oneMinusDstA; - result.z = oneMinusDstA; - result.w = 
oneMinusDstA; - break; - } - - case BLENDFACTOR_SRC_ALPHA_SATURATE: - { - simdscalar sat = _simd_min_ps(src.w, _simd_sub_ps(_simd_set1_ps(1.0), dst.w)); - result.x = sat; - result.y = sat; - result.z = sat; - result.w = _simd_set1_ps(1.0); - break; - } - - case BLENDFACTOR_CONST_COLOR: - result.x = constantColor[0]; - result.y = constantColor[1]; - result.z = constantColor[2]; - result.w = constantColor[3]; - break; - - case BLENDFACTOR_CONST_ALPHA: - result.x = result.y = result.z = result.w = constantColor[3]; - break; - - case BLENDFACTOR_INV_CONST_COLOR: - { - result.x = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[0]); - result.y = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[1]); - result.z = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[2]); - result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]); - break; - } - - case BLENDFACTOR_INV_CONST_ALPHA: - { - result.x = result.y = result.z = result.w = - _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]); - break; - } - - case BLENDFACTOR_SRC1_COLOR: - result.x = src1.x; - result.y = src1.y; - result.z = src1.z; - result.w = src1.w; - break; - - case BLENDFACTOR_SRC1_ALPHA: - result.x = result.y = result.z = result.w = src1.w; - break; - - case BLENDFACTOR_INV_SRC1_COLOR: - result.x = _simd_sub_ps(_simd_set1_ps(1.0f), src1.x); - result.y = _simd_sub_ps(_simd_set1_ps(1.0f), src1.y); - result.z = _simd_sub_ps(_simd_set1_ps(1.0f), src1.z); - result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w); - break; - - case BLENDFACTOR_INV_SRC1_ALPHA: - result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w); - break; - - default: - SWR_INVALID("Unimplemented blend factor: %d", func); - } - - if (Color) - { - out.x = result.x; - out.y = result.y; - out.z = result.z; - } - if (Alpha) - { - out.w = result.w; - } -} - -template <bool Color, bool Alpha> -INLINE void BlendFunc(SWR_BLEND_OP blendOp, - simdvector& src, - simdvector& srcFactor, - simdvector& dst, - simdvector& dstFactor, - 
simdvector& out) -{ - simdvector result; - - switch (blendOp) - { - case BLENDOP_ADD: - result.x = _simd_fmadd_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x)); - result.y = _simd_fmadd_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y)); - result.z = _simd_fmadd_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z)); - result.w = _simd_fmadd_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w)); - break; - - case BLENDOP_SUBTRACT: - result.x = _simd_fmsub_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x)); - result.y = _simd_fmsub_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y)); - result.z = _simd_fmsub_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z)); - result.w = _simd_fmsub_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w)); - break; - - case BLENDOP_REVSUBTRACT: - result.x = _simd_fmsub_ps(dstFactor.x, dst.x, _simd_mul_ps(srcFactor.x, src.x)); - result.y = _simd_fmsub_ps(dstFactor.y, dst.y, _simd_mul_ps(srcFactor.y, src.y)); - result.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z)); - result.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w)); - break; - - case BLENDOP_MIN: - result.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x)); - result.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y)); - result.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z)); - result.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w)); - break; - - case BLENDOP_MAX: - result.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x)); - result.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y)); - result.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z)); - result.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w)); - break; - - 
default: - SWR_INVALID("Unimplemented blend function: %d", blendOp); - } - - if (Color) - { - out.x = result.x; - out.y = result.y; - out.z = result.z; - } - if (Alpha) - { - out.w = result.w; - } -} - -template <SWR_TYPE type> -INLINE void Clamp(simdvector& src) -{ - switch (type) - { - case SWR_TYPE_FLOAT: - break; - - case SWR_TYPE_UNORM: - src.x = _simd_max_ps(src.x, _simd_setzero_ps()); - src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f)); - - src.y = _simd_max_ps(src.y, _simd_setzero_ps()); - src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f)); - - src.z = _simd_max_ps(src.z, _simd_setzero_ps()); - src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f)); - - src.w = _simd_max_ps(src.w, _simd_setzero_ps()); - src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f)); - break; - - case SWR_TYPE_SNORM: - src.x = _simd_max_ps(src.x, _simd_set1_ps(-1.0f)); - src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f)); - - src.y = _simd_max_ps(src.y, _simd_set1_ps(-1.0f)); - src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f)); - - src.z = _simd_max_ps(src.z, _simd_set1_ps(-1.0f)); - src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f)); - - src.w = _simd_max_ps(src.w, _simd_set1_ps(-1.0f)); - src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f)); - break; - - default: - SWR_INVALID("Unimplemented clamp: %d", type); - break; - } -} - -template <SWR_TYPE type> -void Blend(const SWR_BLEND_STATE* pBlendState, - const SWR_RENDER_TARGET_BLEND_STATE* pState, - simdvector& src, - simdvector& src1, - uint8_t* pDst, - simdvector& result) -{ - // load render target - simdvector dst; - LoadSOA<KNOB_COLOR_HOT_TILE_FORMAT>(pDst, dst); - - simdvector constColor; - constColor.x = _simd_broadcast_ss(&pBlendState->constantColor[0]); - constColor.y = _simd_broadcast_ss(&pBlendState->constantColor[1]); - constColor.z = _simd_broadcast_ss(&pBlendState->constantColor[2]); - constColor.w = _simd_broadcast_ss(&pBlendState->constantColor[3]); - - // clamp src/dst/constant - Clamp<type>(src); - Clamp<type>(src1); - Clamp<type>(dst); - 
Clamp<type>(constColor); - - simdvector srcFactor, dstFactor; - if (pBlendState->independentAlphaBlendEnable) - { - GenerateBlendFactor<true, false>( - (SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); - GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor, - constColor, - src, - src1, - dst, - srcFactor); - - GenerateBlendFactor<true, false>( - (SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); - GenerateBlendFactor<false, true>( - (SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor); - - BlendFunc<true, false>( - (SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); - BlendFunc<false, true>( - (SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result); - } - else - { - GenerateBlendFactor<true, true>( - (SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); - GenerateBlendFactor<true, true>( - (SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); - - BlendFunc<true, true>( - (SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); - } -} diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp deleted file mode 100644 index c399caf239b..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp +++ /dev/null @@ -1,336 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file clip.cpp - * - * @brief Implementation for clipping - * - ******************************************************************************/ - -#include <assert.h> - -#include "common/os.h" -#include "core/clip.h" - -float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1) -{ - return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1)); -} - -template <SWR_CLIPCODES ClippingPlane> -inline void intersect( - int s, // index to first edge vertex v0 in pInPts. - int p, // index to second edge vertex v1 in pInPts. - const float* pInPts, // array of all the input positions. - const float* pInAttribs, // array of all attributes for all vertex. All the attributes for each - // vertex is contiguous. - int numInAttribs, // number of attributes per vertex. - int i, // output index. - float* pOutPts, // array of output positions. 
We'll write our new intersection point at i*4. - float* pOutAttribs) // array of output attributes. We'll write our new attributes at - // i*numInAttribs. -{ - float t; - - // Find the parameter of the intersection. - // t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc. - const float* v1 = &pInPts[s * 4]; - const float* v2 = &pInPts[p * 4]; - - switch (ClippingPlane) - { - case FRUSTUM_LEFT: - t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]); - break; - case FRUSTUM_RIGHT: - t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]); - break; - case FRUSTUM_TOP: - t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]); - break; - case FRUSTUM_BOTTOM: - t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]); - break; - case FRUSTUM_NEAR: - t = ComputeInterpFactor(v1[2], v2[2]); - break; - case FRUSTUM_FAR: - t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]); - break; - default: - SWR_INVALID("invalid clipping plane: %d", ClippingPlane); - }; - - const float* a1 = &pInAttribs[s * numInAttribs]; - const float* a2 = &pInAttribs[p * numInAttribs]; - - float* pOutP = &pOutPts[i * 4]; - float* pOutA = &pOutAttribs[i * numInAttribs]; - - // Interpolate new position. 
- for (int j = 0; j < 4; ++j) - { - pOutP[j] = v1[j] + (v2[j] - v1[j]) * t; - } - - // Interpolate Attributes - for (int attr = 0; attr < numInAttribs; ++attr) - { - pOutA[attr] = a1[attr] + (a2[attr] - a1[attr]) * t; - } -} - -// Checks whether vertex v lies inside clipping plane -// in homogenous coords check -w < {x,y,z} < w; -// -template <SWR_CLIPCODES ClippingPlane> -inline int inside(const float v[4]) -{ - switch (ClippingPlane) - { - case FRUSTUM_LEFT: - return (v[0] >= -v[3]); - case FRUSTUM_RIGHT: - return (v[0] <= v[3]); - case FRUSTUM_TOP: - return (v[1] >= -v[3]); - case FRUSTUM_BOTTOM: - return (v[1] <= v[3]); - case FRUSTUM_NEAR: - return (v[2] >= 0.0f); - case FRUSTUM_FAR: - return (v[2] <= v[3]); - default: - SWR_INVALID("invalid clipping plane: %d", ClippingPlane); - return 0; - } -} - -// Clips a polygon in homogenous coordinates to a particular clipping plane. -// Takes in vertices of the polygon (InPts) and the clipping plane -// Puts the vertices of the clipped polygon in OutPts -// Returns number of points in clipped polygon -// -template <SWR_CLIPCODES ClippingPlane> -int ClipTriToPlane(const float* pInPts, - int numInPts, - const float* pInAttribs, - int numInAttribs, - float* pOutPts, - float* pOutAttribs) -{ - int i = 0; // index number of OutPts, # of vertices in OutPts = i div 4; - - for (int j = 0; j < numInPts; ++j) - { - int s = j; - int p = (j + 1) % numInPts; - - int s_in = inside<ClippingPlane>(&pInPts[s * 4]); - int p_in = inside<ClippingPlane>(&pInPts[p * 4]); - - // test if vertex is to be added to output vertices - if (s_in != p_in) // edge crosses clipping plane - { - // find point of intersection - intersect<ClippingPlane>( - s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs); - i++; - } - if (p_in) // 2nd vertex is inside clipping volume, add it to output - { - // Copy 2nd vertex position of edge over to output. 
- for (int k = 0; k < 4; ++k) - { - pOutPts[i * 4 + k] = pInPts[p * 4 + k]; - } - // Copy 2nd vertex attributes of edge over to output. - for (int attr = 0; attr < numInAttribs; ++attr) - { - pOutAttribs[i * numInAttribs + attr] = pInAttribs[p * numInAttribs + attr]; - } - i++; - } - // edge does not cross clipping plane and vertex outside clipping volume - // => do not add vertex - } - return i; -} - -void ClipRectangles(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prims[], - uint32_t primMask, - simdscalari const& primId, - simdscalari const& viewportIdx, - simdscalari const& rtIdx) -{ - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipRectangles, pDC->drawId); - Clipper<SIMD256, 3> clipper(workerId, pDC); - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx); - RDTSC_END(pDC->pContext->pBucketMgr, FEClipRectangles, 1); -} - -void ClipTriangles(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prims[], - uint32_t primMask, - simdscalari const& primId, - simdscalari const& viewportIdx, - simdscalari const& rtIdx) -{ - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipTriangles, pDC->drawId); - Clipper<SIMD256, 3> clipper(workerId, pDC); - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx); - RDTSC_END(pDC->pContext->pBucketMgr, FEClipTriangles, 1); -} - -void ClipLines(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prims[], - uint32_t primMask, - simdscalari const& primId, - simdscalari const& viewportIdx, - simdscalari const& rtIdx) -{ - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipLines, pDC->drawId); - Clipper<SIMD256, 2> clipper(workerId, pDC); - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx); - RDTSC_END(pDC->pContext->pBucketMgr, FEClipLines, 1); -} - -void ClipPoints(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prims[], - uint32_t primMask, - simdscalari const& primId, - simdscalari const& viewportIdx, - simdscalari 
const& rtIdx) -{ - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipPoints, pDC->drawId); - Clipper<SIMD256, 1> clipper(workerId, pDC); - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx); - RDTSC_END(pDC->pContext->pBucketMgr, FEClipPoints, 1); -} - -#if USE_SIMD16_FRONTEND -void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector prims[], - uint32_t primMask, - simd16scalari const& primId, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx) -{ - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipRectangles, pDC->drawId); - - enum - { - VERTS_PER_PRIM = 3 - }; - - Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC); - - pa.useAlternateOffset = false; - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx); - - RDTSC_END(pDC->pContext->pBucketMgr, FEClipRectangles, 1); -} - -void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector prims[], - uint32_t primMask, - simd16scalari const& primId, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx) -{ - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipTriangles, pDC->drawId); - - enum - { - VERTS_PER_PRIM = 3 - }; - - Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC); - - pa.useAlternateOffset = false; - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx); - - RDTSC_END(pDC->pContext->pBucketMgr, FEClipTriangles, 1); -} - -void SIMDCALL ClipLines_simd16(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector prims[], - uint32_t primMask, - simd16scalari const& primId, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx) -{ - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipLines, pDC->drawId); - - enum - { - VERTS_PER_PRIM = 2 - }; - - Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC); - - pa.useAlternateOffset = false; - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx); - - 
RDTSC_END(pDC->pContext->pBucketMgr, FEClipLines, 1); -} - -void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector prims[], - uint32_t primMask, - simd16scalari const& primId, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx) -{ - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipPoints, pDC->drawId); - - enum - { - VERTS_PER_PRIM = 1 - }; - - Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC); - - pa.useAlternateOffset = false; - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx); - - RDTSC_END(pDC->pContext->pBucketMgr, FEClipPoints, 1); -} - -#endif diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h deleted file mode 100644 index d7186ca10b1..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ /dev/null @@ -1,1361 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file clip.h - * - * @brief Definitions for clipping - * - ******************************************************************************/ -#pragma once - -#include "common/simdintrin.h" -#include "core/context.h" -#include "core/pa.h" -#include "rdtsc_core.h" - -enum SWR_CLIPCODES -{ -// Shift clip codes out of the mantissa to prevent denormalized values when used in float compare. -// Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, -// rather than intersection, of clipcodes. -#define CLIPCODE_SHIFT 23 - FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT), - FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT), - FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT), - FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT), - - FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT), - FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT), - - NEGW = (0x40 << CLIPCODE_SHIFT), - - GUARDBAND_LEFT = (0x80 << CLIPCODE_SHIFT | 0x1), - GUARDBAND_TOP = (0x80 << CLIPCODE_SHIFT | 0x2), - GUARDBAND_RIGHT = (0x80 << CLIPCODE_SHIFT | 0x4), - GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8) -}; - -#define GUARDBAND_CLIP_MASK \ - (FRUSTUM_NEAR | FRUSTUM_FAR | GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | \ - GUARDBAND_BOTTOM | NEGW) -#define FRUSTUM_CLIP_MASK \ - (FRUSTUM_NEAR | FRUSTUM_FAR | FRUSTUM_LEFT | FRUSTUM_RIGHT | FRUSTUM_TOP | FRUSTUM_BOTTOM) - -template <typename SIMD_T> -void ComputeClipCodes(const API_STATE& state, - const Vec4<SIMD_T>& vertex, - Float<SIMD_T>& clipCodes, - Integer<SIMD_T> const& viewportIndexes) -{ - clipCodes = SIMD_T::setzero_ps(); - - // -w - Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w, SIMD_T::set1_ps(-1.0f)); - - // FRUSTUM_LEFT - Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW); - 
clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT))); - - // FRUSTUM_TOP - vRes = SIMD_T::cmplt_ps(vertex.y, vNegW); - clipCodes = SIMD_T::or_ps( - clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP)))); - - // FRUSTUM_RIGHT - vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w); - clipCodes = SIMD_T::or_ps( - clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT)))); - - // FRUSTUM_BOTTOM - vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w); - clipCodes = SIMD_T::or_ps( - clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM)))); - - if (state.rastState.depthClipEnable) - { - // FRUSTUM_NEAR - // DX clips depth [0..w], GL clips [-w..w] - if (state.rastState.clipHalfZ) - { - vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps()); - } - else - { - vRes = SIMD_T::cmplt_ps(vertex.z, vNegW); - } - clipCodes = SIMD_T::or_ps( - clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR)))); - - // FRUSTUM_FAR - vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w); - clipCodes = SIMD_T::or_ps( - clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR)))); - } - - // NEGW - vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps()); - clipCodes = - SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW)))); - - // GUARDBAND_LEFT - Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW, - SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>( - &state.gbState.left[0], viewportIndexes)); - vRes = SIMD_T::cmplt_ps(vertex.x, gbMult); - clipCodes = SIMD_T::or_ps( - clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT)))); - - // GUARDBAND_TOP - gbMult = SIMD_T::mul_ps(vNegW, - SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>( - &state.gbState.top[0], viewportIndexes)); - vRes = SIMD_T::cmplt_ps(vertex.y, gbMult); - clipCodes = SIMD_T::or_ps( - clipCodes, SIMD_T::and_ps(vRes, 
SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP)))); - - // GUARDBAND_RIGHT - gbMult = SIMD_T::mul_ps(vertex.w, - SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>( - &state.gbState.right[0], viewportIndexes)); - vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult); - clipCodes = SIMD_T::or_ps( - clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT)))); - - // GUARDBAND_BOTTOM - gbMult = SIMD_T::mul_ps(vertex.w, - SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>( - &state.gbState.bottom[0], viewportIndexes)); - vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult); - clipCodes = SIMD_T::or_ps( - clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM)))); -} - -template <typename SIMD_T> -struct BinnerChooser -{ -}; - -template <> -struct BinnerChooser<SIMD256> -{ - PFN_PROCESS_PRIMS pfnBinFunc; - - BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast) - : - pfnBinFunc(nullptr) - { - if (numVertsPerPrim == 3) - { - pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0); - - } - else if (numVertsPerPrim == 2) - { - pfnBinFunc = BinLines; - } - else - { - SWR_ASSERT(0 && "Unexpected points in clipper."); - } - } - - BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast) - : - pfnBinFunc(nullptr) - { - switch (topology) - { - case TOP_POINT_LIST: - pfnBinFunc = BinPoints; - break; - case TOP_LINE_LIST: - case TOP_LINE_STRIP: - case TOP_LINE_LOOP: - case TOP_LINE_LIST_ADJ: - case TOP_LISTSTRIP_ADJ: - pfnBinFunc = BinLines; - break; - default: - pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0); - break; - }; - } - - void BinFunc(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - SIMD256::Vec4 prims[], - uint32_t primMask, - SIMD256::Integer const& primID, - SIMD256::Integer& viewportIdx, - SIMD256::Integer& rtIdx) - { - SWR_ASSERT(pfnBinFunc != nullptr); - - pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx); - } -}; - -#if USE_SIMD16_FRONTEND -template <> 
-struct BinnerChooser<SIMD512> -{ - PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc; - - BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast) - : - pfnBinFunc(nullptr) - { - if (numVertsPerPrim == 3) - { - pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0); - - } - else if (numVertsPerPrim == 2) - { - pfnBinFunc = BinLines_simd16; - } - else - { - SWR_ASSERT(0 && "Unexpected points in clipper."); - } - } - - BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast) - : - pfnBinFunc(nullptr) - { - switch (topology) - { - case TOP_POINT_LIST: - pfnBinFunc = BinPoints_simd16; - break; - case TOP_LINE_LIST: - case TOP_LINE_STRIP: - case TOP_LINE_LOOP: - case TOP_LINE_LIST_ADJ: - case TOP_LISTSTRIP_ADJ: - pfnBinFunc = BinLines_simd16; - break; - default: - pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0); - break; - }; - } - - void BinFunc(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - SIMD512::Vec4 prims[], - uint32_t primMask, - SIMD512::Integer const& primID, - SIMD512::Integer& viewportIdx, - SIMD512::Integer& rtIdx) - { - SWR_ASSERT(pfnBinFunc != nullptr); - - pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx); - } -}; - -#endif -template <typename SIMD_T> -struct SimdHelper -{ -}; - -template <> -struct SimdHelper<SIMD256> -{ - static SIMD256::Float insert_lo_ps(SIMD256::Float a) { return a; } - - static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b) - { - return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b)); - } -}; - -#if USE_SIMD16_FRONTEND -template <> -struct SimdHelper<SIMD512> -{ - static SIMD512::Float insert_lo_ps(SIMD256::Float a) - { - return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a); - } - - static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b) - { - return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b); - } -}; -#endif - -template <typename SIMD_T, uint32_t NumVertsPerPrimT> -class Clipper -{ -public: - INLINE Clipper(uint32_t 
in_workerId, DRAW_CONTEXT* in_pDC) : - workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC)) - { - static_assert(NumVertsPerPrimT >= 1 && NumVertsPerPrimT <= 3, "Invalid NumVertsPerPrim"); - THREAD_DATA &thread_data = in_pDC->pContext->threadPool.pThreadData[workerId]; - - if (thread_data.clipperData == nullptr) - { - // 7 vertex temp data - // 7 post-clipped vertices - // 2 transposed verts for binning - size_t alloc_size = sizeof(SIMDVERTEX_T<SIMD_T>) * (7 + 7 + 2); - thread_data.clipperData = AlignedMalloc(alloc_size, KNOB_SIMD16_BYTES); - } - SWR_ASSERT(thread_data.clipperData); - - this->clippedVerts = (SIMDVERTEX_T<SIMD_T>*)thread_data.clipperData; - this->tmpVerts = this->clippedVerts + 7; - this->transposedVerts = this->tmpVerts + 7; - } - - void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T>& viewportIndexes) - { - for (uint32_t i = 0; i < NumVertsPerPrimT; ++i) - { - ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes); - } - } - - Float<SIMD_T> ComputeClipCodeIntersection() - { - Float<SIMD_T> result = clipCodes[0]; - - for (uint32_t i = 1; i < NumVertsPerPrimT; ++i) - { - result = SIMD_T::and_ps(result, clipCodes[i]); - } - - return result; - } - - Float<SIMD_T> ComputeClipCodeUnion() - { - Float<SIMD_T> result = clipCodes[0]; - - for (uint32_t i = 1; i < NumVertsPerPrimT; ++i) - { - result = SIMD_T::or_ps(result, clipCodes[i]); - } - - return result; - } - - int ComputeClipMask() - { - Float<SIMD_T> clipUnion = ComputeClipCodeUnion(); - - clipUnion = - SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK))); - - return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps())); - } - - // clipper is responsible for culling any prims with NAN coordinates - int ComputeNaNMask(Vec4<SIMD_T> prim[]) - { - Float<SIMD_T> vNanMask = SIMD_T::setzero_ps(); - - for (uint32_t e = 0; e < NumVertsPerPrimT; ++e) - { - Float<SIMD_T> vNan01 = - SIMD_T::template 
cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]); - vNanMask = SIMD_T::or_ps(vNanMask, vNan01); - - Float<SIMD_T> vNan23 = - SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]); - vNanMask = SIMD_T::or_ps(vNanMask, vNan23); - } - - return SIMD_T::movemask_ps(vNanMask); - } - - int ComputeUserClipCullMask(PA_STATE& pa, Vec4<SIMD_T> prim[]) - { - uint8_t cullMask = state.backendState.cullDistanceMask; - uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset; - - Float<SIMD_T> vClipCullMask = SIMD_T::setzero_ps(); - - Vec4<SIMD_T> vClipCullDistLo[3]; - Vec4<SIMD_T> vClipCullDistHi[3]; - - pa.Assemble(vertexClipCullOffset, vClipCullDistLo); - pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi); - - unsigned long index; - while (_BitScanForward(&index, cullMask)) - { - cullMask &= ~(1 << index); - uint32_t slot = index >> 2; - uint32_t component = index & 0x3; - - Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f); - for (uint32_t e = 0; e < NumVertsPerPrimT; ++e) - { - Float<SIMD_T> vCullComp; - if (slot == 0) - { - vCullComp = vClipCullDistLo[e][component]; - } - else - { - vCullComp = vClipCullDistHi[e][component]; - } - - // cull if cull distance < 0 || NAN - Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>( - SIMD_T::setzero_ps(), vCullComp); - vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull); - } - vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem); - } - - // clipper should also discard any primitive with NAN clip distance - uint8_t clipMask = state.backendState.clipDistanceMask; - while (_BitScanForward(&index, clipMask)) - { - clipMask &= ~(1 << index); - uint32_t slot = index >> 2; - uint32_t component = index & 0x3; - - Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f); - for (uint32_t e = 0; e < NumVertsPerPrimT; ++e) - { - Float<SIMD_T> vClipComp; - if (slot == 0) - { - vClipComp = vClipCullDistLo[e][component]; - } - else - { - vClipComp = 
vClipCullDistHi[e][component]; - } - - Float<SIMD_T> vClip = - SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp); - Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>( - SIMD_T::setzero_ps(), vClipComp); - vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull); - vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip); - } - vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem); - } - - return SIMD_T::movemask_ps(vClipCullMask); - } - - void ClipSimd(const Vec4<SIMD_T> prim[], - const Float<SIMD_T>& vPrimMask, - const Float<SIMD_T>& vClipMask, - PA_STATE& pa, - const Integer<SIMD_T>& vPrimId, - const Integer<SIMD_T>& vViewportIdx, - const Integer<SIMD_T>& vRtIdx) - { - // input/output vertex store for clipper - SIMDVERTEX_T<SIMD_T>* vertices = this->clippedVerts; - - uint32_t constantInterpMask = state.backendState.constantInterpolationMask; - uint32_t provokingVertex = 0; - if (pa.binTopology == TOP_TRIANGLE_FAN) - { - provokingVertex = state.frontendState.provokingVertex.triFan; - } - ///@todo: line topology for wireframe? - - // assemble pos - Vec4<SIMD_T> tmpVector[NumVertsPerPrimT]; - for (uint32_t i = 0; i < NumVertsPerPrimT; ++i) - { - vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i]; - } - - // assemble attribs - const SWR_BACKEND_STATE& backendState = state.backendState; - - int32_t maxSlot = -1; - for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot) - { - // Compute absolute attrib slot in vertex array - uint32_t mapSlot = - backendState.swizzleEnable ? 
backendState.swizzleMap[slot].sourceAttrib : slot; - maxSlot = std::max<int32_t>(maxSlot, mapSlot); - uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot; - - pa.Assemble(inputSlot, tmpVector); - - // if constant interpolation enabled for this attribute, assign the provoking - // vertex values to all edges - if (CheckBit(constantInterpMask, slot)) - { - for (uint32_t i = 0; i < NumVertsPerPrimT; ++i) - { - vertices[i].attrib[inputSlot] = tmpVector[provokingVertex]; - } - } - else - { - for (uint32_t i = 0; i < NumVertsPerPrimT; ++i) - { - vertices[i].attrib[inputSlot] = tmpVector[i]; - } - } - } - - // assemble user clip distances if enabled - uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset; - if (state.backendState.clipDistanceMask & 0xf) - { - pa.Assemble(vertexClipCullSlot, tmpVector); - for (uint32_t i = 0; i < NumVertsPerPrimT; ++i) - { - vertices[i].attrib[vertexClipCullSlot] = tmpVector[i]; - } - } - - if (state.backendState.clipDistanceMask & 0xf0) - { - pa.Assemble(vertexClipCullSlot + 1, tmpVector); - for (uint32_t i = 0; i < NumVertsPerPrimT; ++i) - { - vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i]; - } - } - - uint32_t numAttribs = maxSlot + 1; - - Integer<SIMD_T> vNumClippedVerts = - ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); - - BinnerChooser<SIMD_T> binner(NumVertsPerPrimT, - pa.pDC->pState->state.rastState.conservativeRast); - - // set up new PA for binning clipped primitives - PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN; - if (NumVertsPerPrimT == 3) - { - clipTopology = TOP_TRIANGLE_FAN; - - // so that the binner knows to bloat wide points later - if (pa.binTopology == TOP_POINT_LIST) - { - clipTopology = TOP_POINT_LIST; - } - else if (pa.binTopology == TOP_RECT_LIST) - { - clipTopology = TOP_RECT_LIST; - } - } - else if (NumVertsPerPrimT == 2) - { - clipTopology = TOP_LINE_LIST; - } - else - { - SWR_ASSERT(0 && "Unexpected points in clipper."); - } - - const uint32_t* 
pVertexCount = reinterpret_cast<const uint32_t*>(&vNumClippedVerts); - const uint32_t* pPrimitiveId = reinterpret_cast<const uint32_t*>(&vPrimId); - const uint32_t* pViewportIdx = reinterpret_cast<const uint32_t*>(&vViewportIdx); - const uint32_t* pRtIdx = reinterpret_cast<const uint32_t*>(&vRtIdx); - - const SIMD256::Integer vOffsets = - SIMD256::set_epi32(0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane - 6 * sizeof(SIMDVERTEX_T<SIMD_T>), - 5 * sizeof(SIMDVERTEX_T<SIMD_T>), - 4 * sizeof(SIMDVERTEX_T<SIMD_T>), - 3 * sizeof(SIMDVERTEX_T<SIMD_T>), - 2 * sizeof(SIMDVERTEX_T<SIMD_T>), - 1 * sizeof(SIMDVERTEX_T<SIMD_T>), - 0 * sizeof(SIMDVERTEX_T<SIMD_T>)); - - // only need to gather 7 verts - // @todo dynamic mask based on actual # of verts generated per lane - const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1); - - uint32_t numClippedPrims = 0; - - // transpose clipper output so that each lane's vertices are in SIMD order - // set aside space for 2 vertices, as the PA will try to read up to 16 verts - // for triangle fan - SIMDVERTEX_T<SIMD_T>* transposedPrims = this->transposedVerts; - - uint32_t numInputPrims = pa.NumPrims(); - for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim) - { - uint32_t numEmittedVerts = pVertexCount[inputPrim]; - if (numEmittedVerts < NumVertsPerPrimT) - { - continue; - } - SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper."); - - uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts); - SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper."); - - numClippedPrims += numEmittedPrims; - - // tranpose clipper output so that each lane's vertices are in SIMD order - // set aside space for 2 vertices, as the PA will try to read up to 16 verts - // for triangle fan - - // transpose pos - float const* pBase = - reinterpret_cast<float const*>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + - inputPrim; - - for (uint32_t c = 0; c < 4; ++c) - { - 
SIMD256::Float temp = - SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask); - transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = - SimdHelper<SIMD_T>::insert_lo_ps(temp); - pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>)); - } - - // transpose attribs - pBase = reinterpret_cast<float const*>( - &vertices[0].attrib[backendState.vertexAttribOffset]) + - inputPrim; - - for (uint32_t attrib = 0; attrib < numAttribs; ++attrib) - { - uint32_t attribSlot = backendState.vertexAttribOffset + attrib; - - for (uint32_t c = 0; c < 4; ++c) - { - SIMD256::Float temp = - SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask); - transposedPrims[0].attrib[attribSlot][c] = - SimdHelper<SIMD_T>::insert_lo_ps(temp); - pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>)); - } - } - - // transpose user clip distances if enabled - uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset; - if (state.backendState.clipDistanceMask & 0x0f) - { - pBase = reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot]) + - inputPrim; - - for (uint32_t c = 0; c < 4; ++c) - { - SIMD256::Float temp = - SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask); - transposedPrims[0].attrib[vertexClipCullSlot][c] = - SimdHelper<SIMD_T>::insert_lo_ps(temp); - pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>)); - } - } - - if (state.backendState.clipDistanceMask & 0xf0) - { - pBase = - reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot + 1]) + - inputPrim; - - for (uint32_t c = 0; c < 4; ++c) - { - SIMD256::Float temp = - SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask); - transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = - SimdHelper<SIMD_T>::insert_lo_ps(temp); - pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>)); - } - } - - PA_STATE_OPT clipPA(pDC, - numEmittedPrims, - reinterpret_cast<uint8_t*>(&transposedPrims[0]), - numEmittedVerts, - SWR_VTX_NUM_SLOTS, - true, - NumVertsPerPrimT, - 
clipTopology); - clipPA.viewportArrayActive = pa.viewportArrayActive; - clipPA.rtArrayActive = pa.rtArrayActive; - - static const uint32_t primMaskMap[] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f}; - - const uint32_t primMask = primMaskMap[numEmittedPrims]; - - const Integer<SIMD_T> primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]); - const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]); - const Integer<SIMD_T> rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]); - - while (clipPA.GetNextStreamOutput()) - { - do - { - Vec4<SIMD_T> attrib[NumVertsPerPrimT]; - - bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib); - - if (assemble) - { - binner.pfnBinFunc( - pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx); - } - - } while (clipPA.NextPrim()); - } - } - - // update global pipeline stat - UPDATE_STAT_FE(CPrimitives, numClippedPrims); - } - - void ExecuteStage(PA_STATE& pa, - Vec4<SIMD_T> prim[], - uint32_t primMask, - Integer<SIMD_T> const& primId, - Integer<SIMD_T> const& viewportIdx, - Integer<SIMD_T> const& rtIdx) - { - SWR_ASSERT(pa.pDC != nullptr); - - BinnerChooser<SIMD_T> binner(pa.binTopology, - pa.pDC->pState->state.rastState.conservativeRast); - - // update clipper invocations pipeline stat - uint32_t numInvoc = _mm_popcnt_u32(primMask); - UPDATE_STAT_FE(CInvocations, numInvoc); - - ComputeClipCodes(prim, viewportIdx); - - // cull prims with NAN coords - primMask &= ~ComputeNaNMask(prim); - - // user cull distance cull - if (state.backendState.cullDistanceMask | state.backendState.clipDistanceMask) - { - primMask &= ~ComputeUserClipCullMask(pa, prim); - } - - Float<SIMD_T> clipIntersection = ComputeClipCodeIntersection(); - // Mask out non-frustum codes - clipIntersection = SIMD_T::and_ps(clipIntersection, - SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK))); - - // cull prims outside view frustum - int validMask = - primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, 
SIMD_T::setzero_ps()); - - // skip clipping for points - uint32_t clipMask = 0; - if (NumVertsPerPrimT != 1) - { - clipMask = validMask & ComputeClipMask(); - } - - AR_EVENT(ClipInfoEvent(numInvoc, validMask, clipMask)); - - if (clipMask) - { - RDTSC_BEGIN(pa.pDC->pContext->pBucketMgr, FEGuardbandClip, pa.pDC->drawId); - // we have to clip tris, execute the clipper, which will also - // call the binner - ClipSimd(prim, - SIMD_T::vmask_ps(validMask), - SIMD_T::vmask_ps(clipMask), - pa, - primId, - viewportIdx, - rtIdx); - RDTSC_END(pa.pDC->pContext->pBucketMgr, FEGuardbandClip, 1); - } - else if (validMask) - { - // update CPrimitives pipeline state - UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask)); - - // forward valid prims directly to binner - binner.pfnBinFunc( - this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx); - } - } - -private: - Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const& boundaryCoord0, - Float<SIMD_T> const& boundaryCoord1) - { - return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1)); - } - - Integer<SIMD_T> - ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const& vIndices, uint32_t component) - { - const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>); - const uint32_t componentStride = sizeof(Float<SIMD_T>); - const uint32_t attribStride = sizeof(Vec4<SIMD_T>); - - static const OSALIGNSIMD16(uint32_t) elemOffset[16] = { - 0 * sizeof(float), - 1 * sizeof(float), - 2 * sizeof(float), - 3 * sizeof(float), - 4 * sizeof(float), - 5 * sizeof(float), - 6 * sizeof(float), - 7 * sizeof(float), - 8 * sizeof(float), - 9 * sizeof(float), - 10 * sizeof(float), - 11 * sizeof(float), - 12 * sizeof(float), - 13 * sizeof(float), - 14 * sizeof(float), - 15 * sizeof(float), - }; - - static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset), - "Clipper::ComputeOffsets, Increase number of element offsets."); - - Integer<SIMD_T> vElemOffset = - SIMD_T::loadu_si(reinterpret_cast<const 
Integer<SIMD_T>*>(elemOffset)); - - // step to the simdvertex - Integer<SIMD_T> vOffsets = - SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride)); - - // step to the attribute and component - vOffsets = SIMD_T::add_epi32( - vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component)); - - // step to the lane - vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset); - - return vOffsets; - } - - Float<SIMD_T> GatherComponent(const float* pBuffer, - uint32_t attrib, - Float<SIMD_T> const& vMask, - Integer<SIMD_T> const& vIndices, - uint32_t component) - { - Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component); - Float<SIMD_T> vSrc = SIMD_T::setzero_ps(); - - return SIMD_T::mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask); - } - - void ScatterComponent(const float* pBuffer, - uint32_t attrib, - Float<SIMD_T> const& vMask, - Integer<SIMD_T> const& vIndices, - uint32_t component, - Float<SIMD_T> const& vSrc) - { - Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component); - - const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets); - const float* pSrc = reinterpret_cast<const float*>(&vSrc); - uint32_t mask = SIMD_T::movemask_ps(vMask); - unsigned long lane; - while (_BitScanForward(&lane, mask)) - { - mask &= ~(1 << lane); - const uint8_t* pBuf = reinterpret_cast<const uint8_t*>(pBuffer) + pOffsets[lane]; - *(float*)pBuf = pSrc[lane]; - } - } - - template <SWR_CLIPCODES ClippingPlane> - void intersect(const Float<SIMD_T>& vActiveMask, // active lanes to operate on - const Integer<SIMD_T>& s, // index to first edge vertex v0 in pInPts. - const Integer<SIMD_T>& p, // index to second edge vertex v1 in pInPts. - const Vec4<SIMD_T>& v1, // vertex 0 position - const Vec4<SIMD_T>& v2, // vertex 1 position - Integer<SIMD_T>& outIndex, // output index. - const float* pInVerts, // array of all the input positions. - uint32_t numInAttribs, // number of attributes per vertex. 
- float* pOutVerts) // array of output positions. We'll write our new intersection - // point at i*4. - { - uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset; - uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset; - - // compute interpolation factor - Float<SIMD_T> t; - switch (ClippingPlane) - { - case FRUSTUM_LEFT: - t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); - break; - case FRUSTUM_RIGHT: - t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); - break; - case FRUSTUM_TOP: - t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); - break; - case FRUSTUM_BOTTOM: - t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); - break; - case FRUSTUM_NEAR: - // DX Znear plane is 0, GL is -w - if (this->state.rastState.clipHalfZ) - { - t = ComputeInterpFactor(v1[2], v2[2]); - } - else - { - t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2])); - } - break; - case FRUSTUM_FAR: - t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); - break; - default: - SWR_INVALID("invalid clipping plane: %d", ClippingPlane); - }; - - // interpolate position and store - for (uint32_t c = 0; c < 4; ++c) - { - Float<SIMD_T> vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]); - ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos); - } - - // interpolate attributes and store - for (uint32_t a = 0; a < numInAttribs; ++a) - { - uint32_t attribSlot = vertexAttribOffset + a; - for (uint32_t c = 0; c < 4; ++c) - { - Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); - Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); - Float<SIMD_T> vOutAttrib = - SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); - ScatterComponent(pOutVerts, 
attribSlot, vActiveMask, outIndex, c, vOutAttrib); - } - } - - // interpolate clip distance if enabled - if (this->state.backendState.clipDistanceMask & 0xf) - { - uint32_t attribSlot = vertexClipCullOffset; - for (uint32_t c = 0; c < 4; ++c) - { - Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); - Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); - Float<SIMD_T> vOutAttrib = - SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); - ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); - } - } - - if (this->state.backendState.clipDistanceMask & 0xf0) - { - uint32_t attribSlot = vertexClipCullOffset + 1; - for (uint32_t c = 0; c < 4; ++c) - { - Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); - Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); - Float<SIMD_T> vOutAttrib = - SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); - ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); - } - } - } - - template <SWR_CLIPCODES ClippingPlane> - Float<SIMD_T> inside(const Vec4<SIMD_T>& v) - { - switch (ClippingPlane) - { - case FRUSTUM_LEFT: - return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f))); - case FRUSTUM_RIGHT: - return SIMD_T::cmple_ps(v[0], v[3]); - case FRUSTUM_TOP: - return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f))); - case FRUSTUM_BOTTOM: - return SIMD_T::cmple_ps(v[1], v[3]); - case FRUSTUM_NEAR: - return SIMD_T::cmpge_ps(v[2], - this->state.rastState.clipHalfZ - ? 
SIMD_T::setzero_ps() - : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f))); - case FRUSTUM_FAR: - return SIMD_T::cmple_ps(v[2], v[3]); - default: - SWR_INVALID("invalid clipping plane: %d", ClippingPlane); - return SIMD_T::setzero_ps(); - } - } - - template <SWR_CLIPCODES ClippingPlane> - Integer<SIMD_T> ClipTriToPlane(const float* pInVerts, - const Integer<SIMD_T>& vNumInPts, - uint32_t numInAttribs, - float* pOutVerts) - { - uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset; - - Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si(); - Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si(); - Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); - - while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty - { - Integer<SIMD_T> s = vCurIndex; - Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1)); - Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p); - p = SIMD_T::castps_si(SIMD_T::blendv_ps( - SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask))); - - // gather position - Vec4<SIMD_T> vInPos0, vInPos1; - for (uint32_t c = 0; c < 4; ++c) - { - vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); - vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); - } - - // compute inside mask - Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0); - Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1); - - // compute intersection mask (s_in != p_in) - Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in); - intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask); - - // store s if inside - s_in = SIMD_T::and_ps(s_in, vActiveMask); - if (!SIMD_T::testz_ps(s_in, s_in)) - { - // store position - for (uint32_t c = 0; c < 4; ++c) - { - ScatterComponent( - pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); - } - - // store attribs - for (uint32_t a = 0; a < numInAttribs; ++a) - { - 
uint32_t attribSlot = vertexAttribOffset + a; - for (uint32_t c = 0; c < 4; ++c) - { - Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); - ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); - } - } - - // store clip distance if enabled - uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset; - if (this->state.backendState.clipDistanceMask & 0xf) - { - uint32_t attribSlot = vertexClipCullSlot; - for (uint32_t c = 0; c < 4; ++c) - { - Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); - ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); - } - } - - if (this->state.backendState.clipDistanceMask & 0xf0) - { - uint32_t attribSlot = vertexClipCullSlot + 1; - for (uint32_t c = 0; c < 4; ++c) - { - Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); - ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); - } - } - - // increment outIndex - vOutIndex = SIMD_T::blendv_epi32( - vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in); - } - - // compute and store intersection - if (!SIMD_T::testz_ps(intersectMask, intersectMask)) - { - intersect<ClippingPlane>(intersectMask, - s, - p, - vInPos0, - vInPos1, - vOutIndex, - pInVerts, - numInAttribs, - pOutVerts); - - // increment outIndex for active lanes - vOutIndex = SIMD_T::blendv_epi32( - vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask); - } - - // increment loop index and update active mask - vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1)); - vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); - } - - return vOutIndex; - } - - template <SWR_CLIPCODES ClippingPlane> - Integer<SIMD_T> ClipLineToPlane(const float* pInVerts, - const Integer<SIMD_T>& vNumInPts, - uint32_t numInAttribs, - float* pOutVerts) - { - uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset; - - 
Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si(); - Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si(); - Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); - - if (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) - { - Integer<SIMD_T> s = vCurIndex; - Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1)); - - // gather position - Vec4<SIMD_T> vInPos0, vInPos1; - for (uint32_t c = 0; c < 4; ++c) - { - vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); - vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); - } - - // compute inside mask - Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0); - Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1); - - // compute intersection mask (s_in != p_in) - Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in); - intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask); - - // store s if inside - s_in = SIMD_T::and_ps(s_in, vActiveMask); - if (!SIMD_T::testz_ps(s_in, s_in)) - { - for (uint32_t c = 0; c < 4; ++c) - { - ScatterComponent( - pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); - } - - // interpolate attributes and store - for (uint32_t a = 0; a < numInAttribs; ++a) - { - uint32_t attribSlot = vertexAttribOffset + a; - for (uint32_t c = 0; c < 4; ++c) - { - Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); - ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); - } - } - - // increment outIndex - vOutIndex = SIMD_T::blendv_epi32( - vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in); - } - - // compute and store intersection - if (!SIMD_T::testz_ps(intersectMask, intersectMask)) - { - intersect<ClippingPlane>(intersectMask, - s, - p, - vInPos0, - vInPos1, - vOutIndex, - pInVerts, - numInAttribs, - pOutVerts); - - // increment outIndex for active lanes - vOutIndex = SIMD_T::blendv_epi32( - vOutIndex, SIMD_T::add_epi32(vOutIndex, 
SIMD_T::set1_epi32(1)), intersectMask); - } - - // store p if inside - p_in = SIMD_T::and_ps(p_in, vActiveMask); - if (!SIMD_T::testz_ps(p_in, p_in)) - { - for (uint32_t c = 0; c < 4; ++c) - { - ScatterComponent( - pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]); - } - - // interpolate attributes and store - for (uint32_t a = 0; a < numInAttribs; ++a) - { - uint32_t attribSlot = vertexAttribOffset + a; - for (uint32_t c = 0; c < 4; ++c) - { - Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c); - ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib); - } - } - - // increment outIndex - vOutIndex = SIMD_T::blendv_epi32( - vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in); - } - } - - return vOutIndex; - } - - Integer<SIMD_T> ClipPrims(float* pVertices, - const Float<SIMD_T>& vPrimMask, - const Float<SIMD_T>& vClipMask, - int numAttribs) - { - // temp storage - float* pTempVerts = reinterpret_cast<float*>(this->tmpVerts); - - // zero out num input verts for non-active lanes - Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrimT); - vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask); - - // clip prims to frustum - Integer<SIMD_T> vNumOutPts; - if (NumVertsPerPrimT == 3) - { - vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts); - vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices); - vNumOutPts = - ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); - vNumOutPts = - ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); - vNumOutPts = - ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); - vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices); - } - else - { - SWR_ASSERT(NumVertsPerPrimT == 2); - vNumOutPts = - ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, 
pTempVerts); - vNumOutPts = - ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices); - vNumOutPts = - ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); - vNumOutPts = - ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); - vNumOutPts = - ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); - vNumOutPts = - ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices); - } - - // restore num verts for non-clipped, active lanes - Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask); - vNumOutPts = - SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrimT), vNonClippedMask); - - return vNumOutPts; - } - - const uint32_t workerId{0}; - DRAW_CONTEXT* pDC{nullptr}; - const API_STATE& state; - Float<SIMD_T> clipCodes[NumVertsPerPrimT]; - SIMDVERTEX_T<SIMD_T>* clippedVerts; - SIMDVERTEX_T<SIMD_T>* tmpVerts; - SIMDVERTEX_T<SIMD_T>* transposedVerts; -}; - -// pipeline stage functions -void ClipRectangles(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prims[], - uint32_t primMask, - simdscalari const& primId, - simdscalari const& viewportIdx, - simdscalari const& rtIdx); -void ClipTriangles(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prims[], - uint32_t primMask, - simdscalari const& primId, - simdscalari const& viewportIdx, - simdscalari const& rtIdx); -void ClipLines(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prims[], - uint32_t primMask, - simdscalari const& primId, - simdscalari const& viewportIdx, - simdscalari const& rtIdx); -void ClipPoints(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prims[], - uint32_t primMask, - simdscalari const& primId, - simdscalari const& viewportIdx, - simdscalari const& rtIdx); -#if USE_SIMD16_FRONTEND -void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - 
simd16vector prims[], - uint32_t primMask, - simd16scalari const& primId, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx); -void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector prims[], - uint32_t primMask, - simd16scalari const& primId, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx); -void SIMDCALL ClipLines_simd16(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector prims[], - uint32_t primMask, - simd16scalari const& primId, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx); -void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector prims[], - uint32_t primMask, - simd16scalari const& primId, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx); -#endif diff --git a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h b/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h deleted file mode 100644 index 9e7f96cdeac..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h +++ /dev/null @@ -1,229 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file conservativerast.h - * - ******************************************************************************/ -#pragma once -#include <type_traits> -#include "common/simdintrin.h" - -enum FixedPointFmt -{ - FP_UNINIT, - _16_8, - _16_9, - _X_16, -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief convenience typedefs for supported Fixed Point precisions -typedef std::integral_constant<uint32_t, FP_UNINIT> Fixed_Uninit; -typedef std::integral_constant<uint32_t, _16_8> Fixed_16_8; -typedef std::integral_constant<uint32_t, _16_9> Fixed_16_9; -typedef std::integral_constant<uint32_t, _X_16> Fixed_X_16; - -////////////////////////////////////////////////////////////////////////// -/// @struct FixedPointTraits -/// @brief holds constants relating to converting between FP and Fixed point -/// @tparam FT: fixed precision type -template <typename FT> -struct FixedPointTraits -{ -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Fixed_16_8 specialization of FixedPointTraits -template <> -struct FixedPointTraits<Fixed_16_8> -{ - /// multiplier to go from FP32 to Fixed Point 16.8 - typedef std::integral_constant<uint32_t, 256> ScaleT; - /// number of bits to shift to go from 16.8 fixed => int32 - typedef std::integral_constant<uint32_t, 8> BitsT; - typedef Fixed_16_8 TypeT; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Fixed_16_9 specialization of 
FixedPointTraits -template <> -struct FixedPointTraits<Fixed_16_9> -{ - /// multiplier to go from FP32 to Fixed Point 16.9 - typedef std::integral_constant<uint32_t, 512> ScaleT; - /// number of bits to shift to go from 16.9 fixed => int32 - typedef std::integral_constant<uint32_t, 9> BitsT; - typedef Fixed_16_9 TypeT; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Fixed_16_9 specialization of FixedPointTraits -template <> -struct FixedPointTraits<Fixed_X_16> -{ - /// multiplier to go from FP32 to Fixed Point X.16 - typedef std::integral_constant<uint32_t, 65536> ScaleT; - /// number of bits to shift to go from X.16 fixed => int32 - typedef std::integral_constant<uint32_t, 16> BitsT; - typedef Fixed_X_16 TypeT; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief convenience typedefs for conservative rasterization modes -typedef std::false_type StandardRastT; -typedef std::true_type ConservativeRastT; - -////////////////////////////////////////////////////////////////////////// -/// @brief convenience typedefs for Input Coverage rasterization modes -typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NONE> NoInputCoverageT; -typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NORMAL> OuterConservativeCoverageT; -typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE> - InnerConservativeCoverageT; - -////////////////////////////////////////////////////////////////////////// -/// @struct ConservativeRastTraits -/// @brief primary ConservativeRastTraits template. 
Shouldn't be instantiated -/// @tparam ConservativeT: type of conservative rasterization -template <typename ConservativeT> -struct ConservativeRastFETraits -{ -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief StandardRast specialization of ConservativeRastTraits -template <> -struct ConservativeRastFETraits<StandardRastT> -{ - typedef std::false_type IsConservativeT; - typedef std::integral_constant<uint32_t, 0> BoundingBoxOffsetT; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief ConservativeRastT specialization of ConservativeRastTraits -template <> -struct ConservativeRastFETraits<ConservativeRastT> -{ - typedef std::true_type IsConservativeT; - typedef std::integral_constant<uint32_t, 1> BoundingBoxOffsetT; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief convenience typedefs for ConservativeRastFETraits -typedef ConservativeRastFETraits<StandardRastT> FEStandardRastT; -typedef ConservativeRastFETraits<ConservativeRastT> FEConservativeRastT; - -////////////////////////////////////////////////////////////////////////// -/// @struct ConservativeRastBETraits -/// @brief primary ConservativeRastBETraits template. 
Shouldn't be instantiated; -/// default to standard rasterization behavior -/// @tparam ConservativeT: type of conservative rasterization -/// @tparam InputCoverageT: type of input coverage requested, if any -template <typename ConservativeT, typename _InputCoverageT> -struct ConservativeRastBETraits -{ - typedef std::false_type IsConservativeT; - typedef _InputCoverageT InputCoverageT; - typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT; - typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT; - typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief StandardRastT specialization of ConservativeRastBETraits -template <typename _InputCoverageT> -struct ConservativeRastBETraits<StandardRastT, _InputCoverageT> -{ - typedef std::false_type IsConservativeT; - typedef _InputCoverageT InputCoverageT; - typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT; - typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT; - typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief ConservativeRastT specialization of ConservativeRastBETraits -/// with no input coverage -template <> -struct ConservativeRastBETraits<ConservativeRastT, NoInputCoverageT> -{ - typedef std::true_type IsConservativeT; - typedef NoInputCoverageT InputCoverageT; - - typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT; - - /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision - /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead - /// of of having to compare individual edges to pixel corners to check if any part of the - /// triangle intersects a pixel - typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1> - ConservativeEdgeOffsetT; - 
typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief ConservativeRastT specialization of ConservativeRastBETraits -/// with OuterConservativeCoverage -template <> -struct ConservativeRastBETraits<ConservativeRastT, OuterConservativeCoverageT> -{ - typedef std::true_type IsConservativeT; - typedef OuterConservativeCoverageT InputCoverageT; - - typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT; - - /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision - /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead - /// of of having to compare individual edges to pixel corners to check if any part of the - /// triangle intersects a pixel - typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1> - ConservativeEdgeOffsetT; - typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief ConservativeRastT specialization of ConservativeRastBETraits -/// with InnerConservativeCoverage -template <> -struct ConservativeRastBETraits<ConservativeRastT, InnerConservativeCoverageT> -{ - typedef std::true_type IsConservativeT; - typedef InnerConservativeCoverageT InputCoverageT; - - typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT; - - /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision - /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead - /// of of having to compare individual edges to pixel corners to check if any part of the - /// triangle intersects a pixel - typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1> - ConservativeEdgeOffsetT; - - /// undo the outer conservative offset and offset edge towards from pixel center by 1/2 pixel 
+ - /// 1/512, in Fixed 16.9 precision this allows the rasterizer to do the 3 edge coverage tests - /// against a single point, instead of of having to compare individual edges to pixel corners to - /// check if a pixel is fully covered by a triangle - typedef std::integral_constant<int32_t, - static_cast<int32_t>( - -((ConservativePrecisionT::ScaleT::value / 2) + 1) - - ConservativeEdgeOffsetT::value)> - InnerConservativeEdgeOffsetT; -};
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h deleted file mode 100644 index b874520b9d8..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ /dev/null @@ -1,608 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file context.h - * - * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT - * The SWR_CONTEXT is our global context and contains the DC ring, - * thread state, etc. - * - * The DRAW_CONTEXT contains all state associated with a draw operation. 
- * - ******************************************************************************/ -#pragma once - -#include <condition_variable> -#include <algorithm> - -#include "core/api.h" -#include "core/utils.h" -#include "core/arena.h" -#include "core/fifo.hpp" -#include "core/knobs.h" -#include "common/intrin.h" -#include "common/rdtsc_buckets.h" -#include "core/threads.h" -#include "ringbuffer.h" -#include "archrast/archrast.h" - -// x.8 fixed point precision values -#define FIXED_POINT_SHIFT 8 -#define FIXED_POINT_SCALE 256 - -// x.16 fixed point precision values -#define FIXED_POINT16_SHIFT 16 -#define FIXED_POINT16_SCALE 65536 - -struct SWR_CONTEXT; -struct DRAW_CONTEXT; - -struct TRI_FLAGS -{ - uint32_t frontFacing : 1; - uint32_t yMajor : 1; - uint32_t coverageMask : (SIMD_TILE_X_DIM* SIMD_TILE_Y_DIM); - uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); - float pointSize; - uint32_t renderTargetArrayIndex; - uint32_t viewportIndex; -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_TRIANGLE_DESC -///////////////////////////////////////////////////////////////////////// -struct SWR_TRIANGLE_DESC -{ - float I[3]; - float J[3]; - float Z[3]; - float OneOverW[3]; - float recipDet; - - float* pRecipW; - float* pAttribs; - float* pPerspAttribs; - float* pSamplePos; - float* pUserClipBuffer; - - uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; - uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if - // entire pixel is covered - uint64_t anyCoveredSamples; - - TRI_FLAGS triFlags; -}; - -struct TRIANGLE_WORK_DESC -{ - float* pTriBuffer; - float* pAttribs; - float* pUserClipBuffer; - uint32_t numAttribs; - TRI_FLAGS triFlags; -}; - -struct CLEAR_DESC -{ - SWR_RECT rect; - uint32_t attachmentMask; - uint32_t renderTargetArrayIndex; - float clearRTColor[4]; // RGBA_32F - float clearDepth; // [0..1] - uint8_t clearStencil; -}; - -struct DISCARD_INVALIDATE_TILES_DESC -{ - uint32_t 
attachmentMask; - SWR_RECT rect; - SWR_TILE_STATE newTileState; - bool createNewTiles; - bool fullTilesOnly; -}; - -struct SYNC_DESC -{ - PFN_CALLBACK_FUNC pfnCallbackFunc; - uint64_t userData; - uint64_t userData2; - uint64_t userData3; -}; - -struct STORE_TILES_DESC -{ - uint32_t attachmentMask; - SWR_TILE_STATE postStoreTileState; - SWR_RECT rect; -}; - -struct COMPUTE_DESC -{ - uint32_t threadGroupCountX; - uint32_t threadGroupCountY; - uint32_t threadGroupCountZ; - bool enableThreadDispatch; -}; - -typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, - uint32_t workerId, - uint32_t macroTile, - void* pDesc); - -enum WORK_TYPE -{ - SYNC, - DRAW, - CLEAR, - DISCARDINVALIDATETILES, - STORETILES, - SHUTDOWN, -}; - -OSALIGNSIMD(struct) BE_WORK -{ - WORK_TYPE type; - PFN_WORK_FUNC pfnWork; - union - { - SYNC_DESC sync; - TRIANGLE_WORK_DESC tri; - CLEAR_DESC clear; - DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; - STORE_TILES_DESC storeTiles; - } desc; -}; - -struct DRAW_WORK -{ - DRAW_CONTEXT* pDC; - union - { - uint32_t numIndices; // DrawIndexed: Number of indices for draw. - uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc) - }; - union - { - gfxptr_t xpIB; // DrawIndexed: App supplied int32 indices - uint32_t startVertex; // Draw: Starting vertex in VB to render from. 
- }; - int32_t baseVertex; - uint32_t numInstances; // Number of instances - uint32_t startInstance; // Instance offset - uint32_t startPrimID; // starting primitiveID for this draw batch - uint32_t - startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws) - SWR_FORMAT type; // index buffer type -}; - -typedef void (*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, - DRAW_CONTEXT* pDC, - uint32_t workerId, - void* pDesc); -struct FE_WORK -{ - WORK_TYPE type; - PFN_FE_WORK_FUNC pfnWork; - union - { - SYNC_DESC sync; - DRAW_WORK draw; - CLEAR_DESC clear; - DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; - STORE_TILES_DESC storeTiles; - } desc; -}; - -struct GUARDBANDS -{ - float left[KNOB_NUM_VIEWPORTS_SCISSORS]; - float right[KNOB_NUM_VIEWPORTS_SCISSORS]; - float top[KNOB_NUM_VIEWPORTS_SCISSORS]; - float bottom[KNOB_NUM_VIEWPORTS_SCISSORS]; -}; - -struct PA_STATE; - -// function signature for pipeline stages that execute after primitive assembly -typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prims[], - uint32_t primMask, - simdscalari const& primID, - simdscalari const& viewportIdx, - simdscalari const& rtIdx); - -// function signature for pipeline stages that execute after primitive assembly -typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector prims[], - uint32_t primMask, - simd16scalari const& primID, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx); - -OSALIGNLINE(struct) API_STATE -{ - // Vertex Buffers - SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS]; - - // GS - Geometry Shader State - SWR_GS_STATE gsState; - PFN_GS_FUNC pfnGsFunc; - - // FS - Fetch Shader State - PFN_FETCH_FUNC pfnFetchFunc; - - // VS - Vertex Shader State - PFN_VERTEX_FUNC pfnVertexFunc; - - // Index Buffer - SWR_INDEX_BUFFER_STATE indexBuffer; - - // CS - Compute Shader - PFN_CS_FUNC pfnCsFunc; - uint32_t 
totalThreadsInGroup; - uint32_t totalSpillFillSize; - uint32_t scratchSpaceSizePerWarp; - uint32_t scratchSpaceNumWarps; - - // FE - Frontend State - SWR_FRONTEND_STATE frontendState; - - // SOS - Streamout Shader State - PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS]; - - // Streamout state - SWR_STREAMOUT_STATE soState; - mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS]; - mutable SWR_STREAMOUT_BUFFER soPausedBuffer[MAX_SO_STREAMS]; - - // Tessellation State - PFN_HS_FUNC pfnHsFunc; - PFN_DS_FUNC pfnDsFunc; - SWR_TS_STATE tsState; - - // Number of attributes used by the frontend (vs, so, gs) - uint32_t feNumAttributes; - - // RS - Rasterizer State - SWR_RASTSTATE rastState; - // floating point multisample offsets - float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2]; - - GUARDBANDS gbState; - - SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS]; - SWR_VIEWPORT_MATRICES vpMatrices; - - SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS]; - SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS]; - bool scissorsTileAligned; - - bool forceFront; - PRIMITIVE_TOPOLOGY topology; - - - // Backend state - OSALIGNLINE(SWR_BACKEND_STATE) backendState; - - SWR_DEPTH_BOUNDS_STATE depthBoundsState; - - // PS - Pixel shader state - SWR_PS_STATE psState; - - SWR_DEPTH_STENCIL_STATE depthStencilState; - - // OM - Output Merger State - SWR_BLEND_STATE blendState; - PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS]; - - struct - { - uint32_t enableStatsFE : 1; // Enable frontend pipeline stats - uint32_t enableStatsBE : 1; // Enable backend pipeline stats - uint32_t colorHottileEnable : 8; // Bitmask of enabled color hottiles - uint32_t depthHottileEnable : 1; // Enable depth buffer hottile - uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile - }; - - PFN_QUANTIZE_DEPTH pfnQuantizeDepth; -}; - -class MacroTileMgr; -class DispatchQueue; -class HOTTILE; - -struct RenderOutputBuffers -{ - uint8_t* pColor[SWR_NUM_RENDERTARGETS]; - uint8_t* pDepth; - uint8_t* pStencil; - - 
HOTTILE* pColorHotTile[SWR_NUM_RENDERTARGETS]; - HOTTILE* pDepthHotTile; - HOTTILE* pStencilHotTile; -}; - -// Plane equation A/B/C coeffs used to evaluate I/J barycentric coords -struct BarycentricCoeffs -{ - simdscalar vIa; - simdscalar vIb; - simdscalar vIc; - - simdscalar vJa; - simdscalar vJb; - simdscalar vJc; - - simdscalar vZa; - simdscalar vZb; - simdscalar vZc; - - simdscalar vRecipDet; - - simdscalar vAOneOverW; - simdscalar vBOneOverW; - simdscalar vCOneOverW; -}; - -// pipeline function pointer types -typedef void (*PFN_BACKEND_FUNC)( - DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&); -typedef void (*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT&, - uint8_t* (&)[SWR_NUM_RENDERTARGETS], - uint32_t, - const SWR_BLEND_STATE*, - const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], - simdscalar&, - simdscalar const&); -typedef void (*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&); -typedef void (*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&); -typedef void (*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, - SWR_PS_CONTEXT&, - const uint64_t* const, - const uint32_t, - simdscalar const&, - simdscalar const&); - -struct BACKEND_FUNCS -{ - PFN_BACKEND_FUNC pfnBackend; -}; - -// Draw State -struct DRAW_STATE -{ - API_STATE state; - - void* pPrivateState; // Its required the driver sets this up for each draw. - - // pipeline function pointers, filled in by API thread when setting up the draw - BACKEND_FUNCS backendFuncs; - PFN_PROCESS_PRIMS pfnProcessPrims; -#if USE_SIMD16_FRONTEND - PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16; -#endif - - CachingArena* pArena; // This should only be used by API thread. 
-}; - -struct DRAW_DYNAMIC_STATE -{ - void Reset(uint32_t numThreads) - { - SWR_STATS* pSavePtr = pStats; - memset(this, 0, sizeof(*this)); - pStats = pSavePtr; - memset(pStats, 0, sizeof(SWR_STATS) * numThreads); - } - ///@todo Currently assumes only a single FE can do stream output for a draw. - uint32_t SoWriteOffset[4]; - bool SoWriteOffsetDirty[4]; - - SWR_STATS_FE statsFE; // Only one FE thread per DC. - SWR_STATS* pStats; - uint64_t soPrims; // number of primitives written to StreamOut buffer -}; - -// Draw Context -// The api thread sets up a draw context that exists for the life of the draw. -// This draw context maintains all of the state needed for the draw operation. -struct DRAW_CONTEXT -{ - SWR_CONTEXT* pContext; - union - { - MacroTileMgr* pTileMgr; - DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) - }; - DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread. - CachingArena* pArena; - - uint32_t drawId; - bool dependentFE; // Frontend work is dependent on all previous FE - bool dependent; // Backend work is dependent on all previous BE - bool isCompute; // Is this DC a compute context? - bool cleanupState; // True if this is the last draw using an entry in the state ring. - - FE_WORK FeWork; - - SYNC_DESC retireCallback; // Call this func when this DC is retired. - - DRAW_DYNAMIC_STATE dynState; - - volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? 
- volatile OSALIGNLINE(uint32_t) FeLock; - volatile OSALIGNLINE(uint32_t) threadsDone; -}; - -static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT"); - -INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) -{ - SWR_ASSERT(pDC != nullptr); - SWR_ASSERT(pDC->pState != nullptr); - - return pDC->pState->state; -} - -INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC) -{ - SWR_ASSERT(pDC != nullptr); - SWR_ASSERT(pDC->pState != nullptr); - - return pDC->pState->pPrivateState; -} - -class HotTileMgr; - -struct SWR_CONTEXT -{ - // Draw Context Ring - // Each draw needs its own state in order to support multiple draws in flight across multiple - // threads. We maintain N draw contexts configured as a ring. The size of the ring limits the - // maximum number of draws that can be in flight at any given time. - // - // Description: - // 1. State - When an application first sets state we'll request a new draw context to use. - // a. If there are no available draw contexts then we'll have to wait until one becomes - // free. b. If one is available then set pCurDrawContext to point to it and mark it in use. - // c. All state calls set state on pCurDrawContext. - // 2. Draw - Creates submits a work item that is associated with current draw context. - // a. Set pPrevDrawContext = pCurDrawContext - // b. Set pCurDrawContext to NULL. - // 3. State - When an applications sets state after draw - // a. Same as step 1. - // b. State is copied from prev draw context to current. - RingBuffer<DRAW_CONTEXT> dcRing; - - DRAW_CONTEXT* pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. - DRAW_CONTEXT* pPrevDrawContext; // This points to DC entry for the previous context submitted - // that we can copy state from. - - MacroTileMgr* pMacroTileManagerArray; - DispatchQueue* pDispatchQueueArray; - - // Draw State Ring - // When draw are very large (lots of primitives) then the API thread will break these up. 
- // These split draws all have identical state. So instead of storing the state directly - // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs - // to reference a single entry in the DS ring. - RingBuffer<DRAW_STATE> dsRing; - - uint32_t curStateId; // Current index to the next available entry in the DS ring. - - uint32_t NumWorkerThreads; - uint32_t NumFEThreads; - uint32_t NumBEThreads; - - THREAD_POOL threadPool; // Thread pool associated with this context - SWR_THREADING_INFO threadInfo; - SWR_API_THREADING_INFO apiThreadInfo; - SWR_WORKER_PRIVATE_STATE workerPrivateState; - - uint32_t MAX_DRAWS_IN_FLIGHT; - - std::condition_variable FifosNotEmpty; - std::mutex WaitLock; - - uint32_t privateStateSize; - - HotTileMgr* pHotTileMgr; - - // Callback functions, passed in at create context time - PFN_LOAD_TILE pfnLoadTile; - PFN_STORE_TILE pfnStoreTile; - PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead; - PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite; - PFN_MAKE_GFXPTR pfnMakeGfxPtr; - PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext; - PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext; - PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; - PFN_UPDATE_STATS pfnUpdateStats; - PFN_UPDATE_STATS_FE pfnUpdateStatsFE; - PFN_UPDATE_STREAMOUT pfnUpdateStreamOut; - - - // Global Stats - SWR_STATS* pStats; - - // Scratch space for workers. - uint8_t** ppScratch; - - volatile OSALIGNLINE(uint32_t) drawsOutstandingFE; - - OSALIGNLINE(CachingAllocator) cachingArenaAllocator; - uint32_t frameCount; - - uint32_t lastFrameChecked; - uint64_t lastDrawChecked; - TileSet* pSingleThreadLockedTiles; - - // ArchRast thread contexts. 
- HANDLE* pArContext; - - // handle to external memory for worker data to create memory contexts - HANDLE hExternalMemory; - - BucketManager *pBucketMgr; -}; - -#define UPDATE_STAT_BE(name, count) \ - if (GetApiState(pDC).enableStatsBE) \ - { \ - pDC->dynState.pStats[workerId].name += count; \ - } -#define UPDATE_STAT_FE(name, count) \ - if (GetApiState(pDC).enableStatsFE) \ - { \ - pDC->dynState.statsFE.name += count; \ - } - -// ArchRast instrumentation framework -#define AR_WORKER_CTX pDC->pContext->pArContext[workerId] -#define AR_API_CTX pDC->pContext->pArContext[pContext->NumWorkerThreads] - -#ifdef KNOB_ENABLE_RDTSC -#define RDTSC_BEGIN(pBucketMgr, type, drawid) RDTSC_START(pBucketMgr, type) -#define RDTSC_END(pBucketMgr, type, count) RDTSC_STOP(pBucketMgr, type, count, 0) -#else -#define RDTSC_BEGIN(pBucketMgr, type, drawid) -#define RDTSC_END(pBucketMgr, type, count) -#endif - -#ifdef KNOB_ENABLE_AR -#define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event) -#define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id) -#else -#define _AR_EVENT(ctx, event) -#define _AR_FLUSH(ctx, id) -#endif - -// Use these macros for api thread. -#define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event) - -// Use these macros for worker threads. -#define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event) -#define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id) diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h deleted file mode 100644 index 54a3489205a..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h +++ /dev/null @@ -1,335 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file depthstencil.h - * - * @brief Implements depth/stencil functionality - * - ******************************************************************************/ -#pragma once -#include "common/os.h" -#include "format_conversion.h" - -INLINE -void StencilOp(SWR_STENCILOP op, - simdscalar const& mask, - simdscalar const& stencilRefps, - simdscalar& stencilps) -{ - simdscalari stencil = _simd_castps_si(stencilps); - - switch (op) - { - case STENCILOP_KEEP: - break; - case STENCILOP_ZERO: - stencilps = _simd_blendv_ps(stencilps, _simd_setzero_ps(), mask); - break; - case STENCILOP_REPLACE: - stencilps = _simd_blendv_ps(stencilps, stencilRefps, mask); - break; - case STENCILOP_INCRSAT: - { - simdscalari stencilincr = _simd_adds_epu8(stencil, _simd_set1_epi32(1)); - stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); - break; - } - case STENCILOP_DECRSAT: - { - simdscalari stencildecr = _simd_subs_epu8(stencil, _simd_set1_epi32(1)); - stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); - break; - } - case STENCILOP_INCR: - { - simdscalari stencilincr = _simd_add_epi8(stencil, _simd_set1_epi32(1)); - stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); - break; - } - case STENCILOP_DECR: - { - simdscalari stencildecr = _simd_add_epi8(stencil, _simd_set1_epi32((-1) & 0xff)); - stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); - break; - } - case STENCILOP_INVERT: - { - simdscalar stencilinvert = - _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps())); - stencilps = _simd_blendv_ps(stencilps, stencilinvert, mask); - break; - } - default: - break; - } -} - -template <SWR_FORMAT depthFormatT> -simdscalar QuantizeDepth(simdscalar const& depth) -{ - SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0); - uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0); - - if (depthType == SWR_TYPE_FLOAT) - { - // assume only 32bit float depth 
supported - SWR_ASSERT(depthBpc == 32); - - // matches shader precision, no quantizing needed - return depth; - } - - // should be unorm depth if not float - SWR_ASSERT(depthType == SWR_TYPE_UNORM); - - float quantize = (float)((1 << depthBpc) - 1); - simdscalar result = _simd_mul_ps(depth, _simd_set1_ps(quantize)); - result = _simd_add_ps(result, _simd_set1_ps(0.5f)); - result = _simd_round_ps(result, _MM_FROUND_TO_ZERO); - - if (depthBpc > 16) - { - result = _simd_div_ps(result, _simd_set1_ps(quantize)); - } - else - { - result = _simd_mul_ps(result, _simd_set1_ps(1.0f / quantize)); - } - - return result; -} - -INLINE -simdscalar DepthStencilTest(const API_STATE* pState, - bool frontFacing, - uint32_t viewportIndex, - simdscalar const& iZ, - uint8_t* pDepthBase, - simdscalar const& coverageMask, - uint8_t* pStencilBase, - simdscalar* pStencilMask) -{ - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); - static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format"); - - const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState; - const SWR_VIEWPORT* pViewport = &pState->vp[viewportIndex]; - - simdscalar depthResult = _simd_set1_ps(-1.0f); - simdscalar zbuf; - - // clamp Z to viewport [minZ..maxZ] - simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ); - simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ); - simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ)); - - if (pDSState->depthTestEnable) - { - switch (pDSState->depthTestFunc) - { - case ZFUNC_NEVER: - depthResult = _simd_setzero_ps(); - break; - case ZFUNC_ALWAYS: - break; - default: - zbuf = _simd_load_ps((const float*)pDepthBase); - } - - switch (pDSState->depthTestFunc) - { - case ZFUNC_LE: - depthResult = _simd_cmple_ps(interpZ, zbuf); - break; - case ZFUNC_LT: - depthResult = _simd_cmplt_ps(interpZ, zbuf); - break; - case ZFUNC_GT: - depthResult = _simd_cmpgt_ps(interpZ, zbuf); - break; - case 
ZFUNC_GE: - depthResult = _simd_cmpge_ps(interpZ, zbuf); - break; - case ZFUNC_EQ: - depthResult = _simd_cmpeq_ps(interpZ, zbuf); - break; - case ZFUNC_NE: - depthResult = _simd_cmpneq_ps(interpZ, zbuf); - break; - } - } - - simdscalar stencilMask = _simd_set1_ps(-1.0f); - - if (pDSState->stencilTestEnable) - { - uint8_t stencilRefValue; - uint32_t stencilTestFunc; - uint8_t stencilTestMask; - if (frontFacing || !pDSState->doubleSidedStencilTestEnable) - { - stencilRefValue = pDSState->stencilRefValue; - stencilTestFunc = pDSState->stencilTestFunc; - stencilTestMask = pDSState->stencilTestMask; - } - else - { - stencilRefValue = pDSState->backfaceStencilRefValue; - stencilTestFunc = pDSState->backfaceStencilTestFunc; - stencilTestMask = pDSState->backfaceStencilTestMask; - } - - simdvector sbuf; - simdscalar stencilWithMask; - simdscalar stencilRef; - switch (stencilTestFunc) - { - case ZFUNC_NEVER: - stencilMask = _simd_setzero_ps(); - break; - case ZFUNC_ALWAYS: - break; - default: - LoadSOA<R8_UINT>(pStencilBase, sbuf); - - // apply stencil read mask - stencilWithMask = _simd_castsi_ps( - _simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask))); - - // do stencil compare in float to avoid simd integer emulation in AVX1 - stencilWithMask = _simd_cvtepi32_ps(_simd_castps_si(stencilWithMask)); - - stencilRef = _simd_set1_ps((float)(stencilRefValue & stencilTestMask)); - break; - } - - switch (stencilTestFunc) - { - case ZFUNC_LE: - stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask); - break; - case ZFUNC_LT: - stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask); - break; - case ZFUNC_GT: - stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask); - break; - case ZFUNC_GE: - stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask); - break; - case ZFUNC_EQ: - stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask); - break; - case ZFUNC_NE: - stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask); - break; - } - } - - 
simdscalar depthWriteMask = _simd_and_ps(depthResult, stencilMask); - depthWriteMask = _simd_and_ps(depthWriteMask, coverageMask); - - *pStencilMask = stencilMask; - return depthWriteMask; -} - -INLINE -void DepthStencilWrite(const SWR_VIEWPORT* pViewport, - const SWR_DEPTH_STENCIL_STATE* pDSState, - bool frontFacing, - simdscalar const& iZ, - uint8_t* pDepthBase, - const simdscalar& depthMask, - const simdscalar& coverageMask, - uint8_t* pStencilBase, - const simdscalar& stencilMask) -{ - if (pDSState->depthWriteEnable) - { - // clamp Z to viewport [minZ..maxZ] - simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ); - simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ); - simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ)); - - simdscalar vMask = _simd_and_ps(depthMask, coverageMask); - _simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(vMask), interpZ); - } - - if (pDSState->stencilWriteEnable) - { - simdvector sbuf; - LoadSOA<R8_UINT>(pStencilBase, sbuf); - simdscalar stencilbuf = sbuf.v[0]; - - uint8_t stencilRefValue; - uint32_t stencilFailOp; - uint32_t stencilPassDepthPassOp; - uint32_t stencilPassDepthFailOp; - uint8_t stencilWriteMask; - if (frontFacing || !pDSState->doubleSidedStencilTestEnable) - { - stencilRefValue = pDSState->stencilRefValue; - stencilFailOp = pDSState->stencilFailOp; - stencilPassDepthPassOp = pDSState->stencilPassDepthPassOp; - stencilPassDepthFailOp = pDSState->stencilPassDepthFailOp; - stencilWriteMask = pDSState->stencilWriteMask; - } - else - { - stencilRefValue = pDSState->backfaceStencilRefValue; - stencilFailOp = pDSState->backfaceStencilFailOp; - stencilPassDepthPassOp = pDSState->backfaceStencilPassDepthPassOp; - stencilPassDepthFailOp = pDSState->backfaceStencilPassDepthFailOp; - stencilWriteMask = pDSState->backfaceStencilWriteMask; - } - - simdscalar stencilps = stencilbuf; - simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(stencilRefValue)); - - simdscalar stencilFailMask = 
_simd_andnot_ps(stencilMask, coverageMask); - simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthMask); - simdscalar stencilPassDepthFailMask = - _simd_and_ps(stencilMask, _simd_andnot_ps(depthMask, _simd_set1_ps(-1))); - - simdscalar origStencil = stencilps; - - StencilOp((SWR_STENCILOP)stencilFailOp, stencilFailMask, stencilRefps, stencilps); - StencilOp((SWR_STENCILOP)stencilPassDepthFailOp, - stencilPassDepthFailMask, - stencilRefps, - stencilps); - StencilOp((SWR_STENCILOP)stencilPassDepthPassOp, - stencilPassDepthPassMask, - stencilRefps, - stencilps); - - // apply stencil write mask - simdscalari vWriteMask = _simd_set1_epi32(stencilWriteMask); - stencilps = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask)); - stencilps = - _simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps); - - simdvector stencilResult; - stencilResult.v[0] = _simd_blendv_ps(origStencil, stencilps, coverageMask); - StoreSOA<R8_UINT>(stencilResult, pStencilBase); - } -} diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp deleted file mode 100644 index 9a9cc2635df..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp +++ /dev/null @@ -1,138 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file fifo.hpp - * - * @brief Definitions for our fifos used for thread communication. 
- * - ******************************************************************************/ -#pragma once - -#include "common/os.h" -#include "arena.h" - -#include <vector> -#include <cassert> - -template <class T> -struct QUEUE -{ - OSALIGNLINE(volatile uint32_t) mLock{0}; - OSALIGNLINE(volatile uint32_t) mNumEntries{0}; - std::vector<T*> mBlocks; - T* mCurBlock{nullptr}; - uint32_t mHead{0}; - uint32_t mTail{0}; - uint32_t mCurBlockIdx{0}; - - // power of 2 - static const uint32_t mBlockSizeShift = 6; - static const uint32_t mBlockSize = 1 << mBlockSizeShift; - - template <typename ArenaT> - void clear(ArenaT& arena) - { - mHead = 0; - mTail = 0; - mBlocks.clear(); - T* pNewBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4); - mBlocks.push_back(pNewBlock); - mCurBlock = pNewBlock; - mCurBlockIdx = 0; - mNumEntries = 0; - mLock = 0; - } - - uint32_t getNumQueued() { return mNumEntries; } - - bool tryLock() - { - if (mLock) - { - return false; - } - - // try to lock the FIFO - long initial = InterlockedCompareExchange(&mLock, 1, 0); - return (initial == 0); - } - - void unlock() { mLock = 0; } - - T* peek() - { - if (mNumEntries == 0) - { - return nullptr; - } - uint32_t block = mHead >> mBlockSizeShift; - return &mBlocks[block][mHead & (mBlockSize - 1)]; - } - - void dequeue_noinc() - { - mHead++; - mNumEntries--; - } - - template <typename ArenaT> - bool enqueue_try_nosync(ArenaT& arena, const T* entry) - { - const float* pSrc = (const float*)entry; - float* pDst = (float*)&mCurBlock[mTail]; - - auto lambda = [&](int32_t i) { - __m256 vSrc = _mm256_load_ps(pSrc + i * KNOB_SIMD_WIDTH); - _mm256_stream_ps(pDst + i * KNOB_SIMD_WIDTH, vSrc); - }; - - const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH * 4); - static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T), - "FIFO element size should be multiple of SIMD width."); - - UnrollerL<0, numSimdLines, 1>::step(lambda); - - mTail++; - if (mTail == mBlockSize) - { - if (++mCurBlockIdx < 
mBlocks.size()) - { - mCurBlock = mBlocks[mCurBlockIdx]; - } - else - { - T* newBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4); - SWR_ASSERT(newBlock); - - mBlocks.push_back(newBlock); - mCurBlock = newBlock; - } - - mTail = 0; - } - - mNumEntries++; - return true; - } - - void destroy() {} -}; diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h deleted file mode 100644 index f1ea06c4978..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h +++ /dev/null @@ -1,262 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file format_conversion.h - * - * @brief API implementation - * - ******************************************************************************/ -#include "format_types.h" -#include "format_traits.h" - -////////////////////////////////////////////////////////////////////////// -/// @brief Load SIMD packed pixels in SOA format and converts to -/// SOA RGBA32_FLOAT format. -/// @param pSrc - source data in SOA form -/// @param dst - output data in SOA form -template <typename SIMD_T, SWR_FORMAT SrcFormat> -INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, Vec4<SIMD_T>& dst) -{ - // fast path for float32 - if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && - (FormatTraits<SrcFormat>::GetBPC(0) == 32)) - { - auto lambda = [&](int comp) - { - Float<SIMD_T> vComp = - SIMD_T::load_ps(reinterpret_cast<const float*>(pSrc + comp * sizeof(Float<SIMD_T>))); - - dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp; - }; - - UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda); - return; - } - - auto lambda = [&](int comp) - { - // load SIMD components - Float<SIMD_T> vComp; - FormatTraits<SrcFormat>::loadSOA(comp, pSrc, vComp); - - // unpack - vComp = FormatTraits<SrcFormat>::unpack(comp, vComp); - - // convert - if (FormatTraits<SrcFormat>::isNormalized(comp)) - { - vComp = SIMD_T::cvtepi32_ps(SIMD_T::castps_si(vComp)); - vComp = SIMD_T::mul_ps(vComp, SIMD_T::set1_ps(FormatTraits<SrcFormat>::toFloat(comp))); - } - - dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp; - - // is there a better way to get this from the SIMD traits? 
- const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float); - - pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * SIMD_WIDTH) / 8; - }; - - UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda); -} - -template <SWR_FORMAT SrcFormat> -INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simdvector& dst) -{ - LoadSOA<SIMD256, SrcFormat>(pSrc, dst); -} - -template <SWR_FORMAT SrcFormat> -INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst) -{ - LoadSOA<SIMD512, SrcFormat>(pSrc, dst); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Clamps the given component based on the requirements on the -/// Format template arg -/// @param vComp - SIMD vector of floats -/// @param Component - component -template <typename SIMD_T, SWR_FORMAT Format> -INLINE Float<SIMD_T> SIMDCALL Clamp(Float<SIMD_T> const& v, uint32_t Component) -{ - Float<SIMD_T> vComp = v; - if (Component >= 4 || Component < 0) - { - // Component shouldn't out of <0;3> range - assert(false); - return vComp; - } - if (FormatTraits<Format>::isNormalized(Component)) - { - if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM) - { - vComp = SIMD_T::max_ps(vComp, SIMD_T::setzero_ps()); - } - - if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM) - { - vComp = SIMD_T::max_ps(vComp, SIMD_T::set1_ps(-1.0f)); - } - vComp = SIMD_T::min_ps(vComp, SIMD_T::set1_ps(1.0f)); - } - else if (FormatTraits<Format>::GetBPC(Component) < 32) - { - if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT) - { - int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1; - int iMin = 0; - Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp); - vCompi = SIMD_T::max_epu32(vCompi, SIMD_T::set1_epi32(iMin)); - vCompi = SIMD_T::min_epu32(vCompi, SIMD_T::set1_epi32(iMax)); - vComp = SIMD_T::castsi_ps(vCompi); - } - else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT) - { - int iMax = (1 << 
(FormatTraits<Format>::GetBPC(Component) - 1)) - 1; - int iMin = -1 - iMax; - Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp); - vCompi = SIMD_T::max_epi32(vCompi, SIMD_T::set1_epi32(iMin)); - vCompi = SIMD_T::min_epi32(vCompi, SIMD_T::set1_epi32(iMax)); - vComp = SIMD_T::castsi_ps(vCompi); - } - } - - return vComp; -} - -template <SWR_FORMAT Format> -INLINE simdscalar SIMDCALL Clamp(simdscalar const& v, uint32_t Component) -{ - return Clamp<SIMD256, Format>(v, Component); -} - -template <SWR_FORMAT Format> -INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component) -{ - return Clamp<SIMD512, Format>(v, Component); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Normalize the given component based on the requirements on the -/// Format template arg -/// @param vComp - SIMD vector of floats -/// @param Component - component -template <typename SIMD_T, SWR_FORMAT Format> -INLINE Float<SIMD_T> SIMDCALL Normalize(Float<SIMD_T> const& vComp, uint32_t Component) -{ - Float<SIMD_T> r = vComp; - if (FormatTraits<Format>::isNormalized(Component)) - { - r = SIMD_T::mul_ps(r, SIMD_T::set1_ps(FormatTraits<Format>::fromFloat(Component))); - r = SIMD_T::castsi_ps(SIMD_T::cvtps_epi32(r)); - } - return r; -} - -template <SWR_FORMAT Format> -INLINE simdscalar SIMDCALL Normalize(simdscalar const& vComp, uint32_t Component) -{ - return Normalize<SIMD256, Format>(vComp, Component); -} - -template <SWR_FORMAT Format> -INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Component) -{ - return Normalize<SIMD512, Format>(vComp, Component); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Convert and store simdvector of pixels in SOA -/// RGBA32_FLOAT to SOA format -/// @param src - source data in SOA form -/// @param dst - output data in SOA form -template <typename SIMD_T, SWR_FORMAT DstFormat> -INLINE void SIMDCALL StoreSOA(const Vec4<SIMD_T>& src, 
uint8_t* pDst) -{ - // fast path for float32 - if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && - (FormatTraits<DstFormat>::GetBPC(0) == 32)) - { - for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp) - { - Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; - - // Gamma-correct - if (FormatTraits<DstFormat>::isSRGB) - { - if (comp < 3) // Input format is always RGBA32_FLOAT. - { - vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp); - } - } - - SIMD_T::store_ps(reinterpret_cast<float*>(pDst + comp * sizeof(simd16scalar)), vComp); - } - return; - } - - auto lambda = [&](int comp) { - Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; - - // Gamma-correct - if (FormatTraits<DstFormat>::isSRGB) - { - if (comp < 3) // Input format is always RGBA32_FLOAT. - { - vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp); - } - } - - // clamp - vComp = Clamp<SIMD_T, DstFormat>(vComp, comp); - - // normalize - vComp = Normalize<SIMD_T, DstFormat>(vComp, comp); - - // pack - vComp = FormatTraits<DstFormat>::pack(comp, vComp); - - // store - FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp); - - // is there a better way to get this from the SIMD traits? 
- const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float); - - pDst += (FormatTraits<DstFormat>::GetBPC(comp) * SIMD_WIDTH) / 8; - }; - - UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda); -} - -template <SWR_FORMAT DstFormat> -INLINE void SIMDCALL StoreSOA(const simdvector& src, uint8_t* pDst) -{ - StoreSOA<SIMD256, DstFormat>(src, pDst); -} - -template <SWR_FORMAT DstFormat> -INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) -{ - StoreSOA<SIMD512, DstFormat>(src, pDst); -} - diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h b/src/gallium/drivers/swr/rasterizer/core/format_traits.h deleted file mode 100644 index 97e7d56e48e..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/format_traits.h +++ /dev/null @@ -1,4046 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file format_traits.h - * - * @brief Format Traits. auto-generated file - * - * DO NOT EDIT - * - ******************************************************************************/ -#pragma once - -#include "format_types.h" -#include "format_utils.h" - -////////////////////////////////////////////////////////////////////////// -/// FormatSwizzle - Component swizzle selects -////////////////////////////////////////////////////////////////////////// -template <uint32_t comp0 = 0, uint32_t comp1 = 0, uint32_t comp2 = 0, uint32_t comp3 = 0> -struct FormatSwizzle -{ - // Return swizzle select for component. - INLINE static uint32_t swizzle(uint32_t c) - { - static const uint32_t s[4] = {comp0, comp1, comp2, comp3}; - return s[c]; - } -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits - Format traits -////////////////////////////////////////////////////////////////////////// -template <SWR_FORMAT format> -struct FormatTraits : ComponentTraits<SWR_TYPE_UNKNOWN, 0>, FormatSwizzle<0>, Defaults<0, 0, 0, 0> -{ - static const uint32_t bpp{0}; - static const uint32_t numComps{0}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32B32A32_FLOAT> - Format traits specialization for R32G32B32A32_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32B32A32_FLOAT> : 
ComponentTraits<SWR_TYPE_FLOAT, - 32, - SWR_TYPE_FLOAT, - 32, - SWR_TYPE_FLOAT, - 32, - SWR_TYPE_FLOAT, - 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32_32_32 TransposeT; - typedef Format4<32, 32, 32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32B32A32_SINT> - Format traits specialization for R32G32B32A32_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32B32A32_SINT> - : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32_32_32 TransposeT; - typedef Format4<32, 32, 32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32B32A32_UINT> - Format traits specialization for R32G32B32A32_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32B32A32_UINT> - : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{4}; - static 
const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32_32_32 TransposeT; - typedef Format4<32, 32, 32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R64G64_FLOAT> - Format traits specialization for R64G64_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R64G64_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose64_64 TransposeT; - typedef Format2<64, 64> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32B32X32_FLOAT> - Format traits specialization for R32G32B32X32_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32B32X32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, - 32, - SWR_TYPE_FLOAT, - 32, - SWR_TYPE_FLOAT, - 32, - SWR_TYPE_UNUSED, - 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32_32_32 TransposeT; - 
typedef Format4<32, 32, 32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32B32A32_SSCALED> - Format traits specialization for R32G32B32A32_SSCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32B32A32_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, - 32, - SWR_TYPE_SSCALED, - 32, - SWR_TYPE_SSCALED, - 32, - SWR_TYPE_SSCALED, - 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32_32_32 TransposeT; - typedef Format4<32, 32, 32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32B32A32_USCALED> - Format traits specialization for R32G32B32A32_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32B32A32_USCALED> : ComponentTraits<SWR_TYPE_USCALED, - 32, - SWR_TYPE_USCALED, - 32, - SWR_TYPE_USCALED, - 32, - SWR_TYPE_USCALED, - 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32_32_32 TransposeT; - typedef Format4<32, 32, 32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32B32A32_SFIXED> - Format traits 
specialization for R32G32B32A32_SFIXED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32B32A32_SFIXED> : ComponentTraits<SWR_TYPE_SFIXED, - 32, - SWR_TYPE_SFIXED, - 32, - SWR_TYPE_SFIXED, - 32, - SWR_TYPE_SFIXED, - 32>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32_32_32 TransposeT; - typedef Format4<32, 32, 32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32B32_FLOAT> - Format traits specialization for R32G32B32_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32B32_FLOAT> - : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{96}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32_32 TransposeT; - typedef Format3<32, 32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32B32_SINT> - Format traits specialization for R32G32B32_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32B32_SINT> - : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>, - FormatSwizzle<0, 
1, 2>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{96}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32_32 TransposeT; - typedef Format3<32, 32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32B32_UINT> - Format traits specialization for R32G32B32_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32B32_UINT> - : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{96}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32_32 TransposeT; - typedef Format3<32, 32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32B32_SSCALED> - Format traits specialization for R32G32B32_SSCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32B32_SSCALED> - : ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{96}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const 
uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32_32 TransposeT; - typedef Format3<32, 32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32B32_USCALED> - Format traits specialization for R32G32B32_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32B32_USCALED> - : ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{96}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32_32 TransposeT; - typedef Format3<32, 32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32B32_SFIXED> - Format traits specialization for R32G32B32_SFIXED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32B32_SFIXED> - : ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{96}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32_32 TransposeT; - typedef Format3<32, 32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16A16_UNORM> - Format traits specialization 
for R16G16B16A16_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16A16_UNORM> : ComponentTraits<SWR_TYPE_UNORM, - 16, - SWR_TYPE_UNORM, - 16, - SWR_TYPE_UNORM, - 16, - SWR_TYPE_UNORM, - 16>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16_16 TransposeT; - typedef Format4<16, 16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16A16_SNORM> - Format traits specialization for R16G16B16A16_SNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16A16_SNORM> : ComponentTraits<SWR_TYPE_SNORM, - 16, - SWR_TYPE_SNORM, - 16, - SWR_TYPE_SNORM, - 16, - SWR_TYPE_SNORM, - 16>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16_16 TransposeT; - typedef Format4<16, 16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16A16_SINT> - Format traits specialization for R16G16B16A16_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16A16_SINT> - : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, 
SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16_16 TransposeT; - typedef Format4<16, 16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16A16_UINT> - Format traits specialization for R16G16B16A16_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16A16_UINT> - : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16_16 TransposeT; - typedef Format4<16, 16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16A16_FLOAT> - Format traits specialization for R16G16B16A16_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16A16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, - 16, - SWR_TYPE_FLOAT, - 16, - SWR_TYPE_FLOAT, - 16, - SWR_TYPE_FLOAT, - 16>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t 
alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16_16 TransposeT; - typedef Format4<16, 16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32_FLOAT> - Format traits specialization for R32G32_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32 TransposeT; - typedef Format2<32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32_SINT> - Format traits specialization for R32G32_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32_SINT> : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32 TransposeT; - typedef Format2<32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32_UINT> - 
Format traits specialization for R32G32_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32_UINT> : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32 TransposeT; - typedef Format2<32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32_FLOAT_X8X24_TYPELESS> - Format traits specialization for -/// R32_FLOAT_X8X24_TYPELESS -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32_FLOAT_X8X24_TYPELESS> - : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32 TransposeT; - typedef Format2<32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<X32_TYPELESS_G8X24_UINT> - Format traits specialization for X32_TYPELESS_G8X24_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<X32_TYPELESS_G8X24_UINT> - : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UNUSED, 32>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{64}; - static 
const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32 TransposeT; - typedef Format2<32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L32A32_FLOAT> - Format traits specialization for L32A32_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L32A32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{2}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{1}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32 TransposeT; - typedef Format2<32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R64_FLOAT> - Format traits specialization for R64_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R64_FLOAT> - : ComponentTraits<SWR_TYPE_FLOAT, 64>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<64> TransposeT; - typedef Format1<64> FormatT; -}; - 
-////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16X16_UNORM> - Format traits specialization for R16G16B16X16_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16X16_UNORM> : ComponentTraits<SWR_TYPE_UNORM, - 16, - SWR_TYPE_UNORM, - 16, - SWR_TYPE_UNORM, - 16, - SWR_TYPE_UNUSED, - 16>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16_16 TransposeT; - typedef Format4<16, 16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16X16_FLOAT> - Format traits specialization for R16G16B16X16_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16X16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, - 16, - SWR_TYPE_FLOAT, - 16, - SWR_TYPE_FLOAT, - 16, - SWR_TYPE_UNUSED, - 16>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16_16 TransposeT; - typedef Format4<16, 16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L32X32_FLOAT> - Format traits specialization for L32X32_FLOAT 
-////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L32X32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32 TransposeT; - typedef Format2<32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<I32X32_FLOAT> - Format traits specialization for I32X32_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<I32X32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32 TransposeT; - typedef Format2<32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16A16_SSCALED> - Format traits specialization for R16G16B16A16_SSCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16A16_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, - 16, - SWR_TYPE_SSCALED, - 16, - SWR_TYPE_SSCALED, - 16, - SWR_TYPE_SSCALED, - 16>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t 
numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16_16 TransposeT; - typedef Format4<16, 16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16A16_USCALED> - Format traits specialization for R16G16B16A16_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16A16_USCALED> : ComponentTraits<SWR_TYPE_USCALED, - 16, - SWR_TYPE_USCALED, - 16, - SWR_TYPE_USCALED, - 16, - SWR_TYPE_USCALED, - 16>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16_16 TransposeT; - typedef Format4<16, 16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32_SSCALED> - Format traits specialization for R32G32_SSCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t 
bcHeight{1}; - - typedef Transpose32_32 TransposeT; - typedef Format2<32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32_USCALED> - Format traits specialization for R32G32_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32_USCALED> : ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32 TransposeT; - typedef Format2<32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32G32_SFIXED> - Format traits specialization for R32G32_SFIXED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32G32_SFIXED> : ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose32_32 TransposeT; - typedef Format2<32, 32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B8G8R8A8_UNORM> - Format traits specialization for B8G8R8A8_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct 
FormatTraits<B8G8R8A8_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B8G8R8A8_UNORM_SRGB> - Format traits specialization for B8G8R8A8_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B8G8R8A8_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{true}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R10G10B10A2_UNORM> - Format traits specialization for R10G10B10A2_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R10G10B10A2_UNORM> : ComponentTraits<SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const 
uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R10G10B10A2_UNORM_SRGB> - Format traits specialization for R10G10B10A2_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R10G10B10A2_UNORM_SRGB> : ComponentTraits<SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{true}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R10G10B10A2_UINT> - Format traits specialization for R10G10B10A2_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R10G10B10A2_UINT> - : ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t 
bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8A8_UNORM> - Format traits specialization for R8G8B8A8_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8A8_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8A8_UNORM_SRGB> - Format traits specialization for R8G8B8A8_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8A8_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{true}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8A8_SNORM> - Format traits 
specialization for R8G8B8A8_SNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8A8_SNORM> - : ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8A8_SINT> - Format traits specialization for R8G8B8A8_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8A8_SINT> - : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8A8_UINT> - Format traits specialization for R8G8B8A8_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8A8_UINT> - : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 
0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16_UNORM> - Format traits specialization for R16G16_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16 TransposeT; - typedef Format2<16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16_SNORM> - Format traits specialization for R16G16_SNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16_SNORM> : ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16 TransposeT; - typedef 
Format2<16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16_SINT> - Format traits specialization for R16G16_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16_SINT> : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16 TransposeT; - typedef Format2<16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16_UINT> - Format traits specialization for R16G16_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16_UINT> : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16 TransposeT; - typedef Format2<16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16_FLOAT> - Format traits specialization for R16G16_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 
0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16 TransposeT; - typedef Format2<16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B10G10R10A2_UNORM> - Format traits specialization for B10G10R10A2_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B10G10R10A2_UNORM> : ComponentTraits<SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B10G10R10A2_UNORM_SRGB> - Format traits specialization for B10G10R10A2_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B10G10R10A2_UNORM_SRGB> : ComponentTraits<SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{true}; - static const bool 
isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R11G11B10_FLOAT> - Format traits specialization for R11G11B10_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R11G11B10_FLOAT> - : ComponentTraits<SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 10>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose11_11_10 TransposeT; - typedef Format3<11, 11, 10> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R10G10B10_FLOAT_A2_UNORM> - Format traits specialization for -/// R10G10B10_FLOAT_A2_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R10G10B10_FLOAT_A2_UNORM> : ComponentTraits<SWR_TYPE_FLOAT, - 10, - SWR_TYPE_FLOAT, - 10, - SWR_TYPE_FLOAT, - 10, - SWR_TYPE_UNORM, - 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - 
-////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32_SINT> - Format traits specialization for R32_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32_SINT> - : ComponentTraits<SWR_TYPE_SINT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<32> TransposeT; - typedef Format1<32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32_UINT> - Format traits specialization for R32_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32_UINT> - : ComponentTraits<SWR_TYPE_UINT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<32> TransposeT; - typedef Format1<32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32_FLOAT> - Format traits specialization for R32_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32_FLOAT> - : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{1}; - static const bool 
hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<32> TransposeT; - typedef Format1<32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R24_UNORM_X8_TYPELESS> - Format traits specialization for R24_UNORM_X8_TYPELESS -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R24_UNORM_X8_TYPELESS> - : ComponentTraits<SWR_TYPE_UNORM, 24>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<32> TransposeT; - typedef Format1<24> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<X24_TYPELESS_G8_UINT> - Format traits specialization for X24_TYPELESS_G8_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<X24_TYPELESS_G8_UINT> - : ComponentTraits<SWR_TYPE_UINT, 32>, FormatSwizzle<1>, Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<32> TransposeT; - typedef Format1<32> FormatT; -}; - 
-////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L32_UNORM> - Format traits specialization for L32_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L32_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<32> TransposeT; - typedef Format1<32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L16A16_UNORM> - Format traits specialization for L16A16_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L16A16_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{2}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{1}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16 TransposeT; - typedef Format2<16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<I24X8_UNORM> - Format traits specialization for I24X8_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<I24X8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t 
bpp{32}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose24_8 TransposeT; - typedef Format2<24, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L24X8_UNORM> - Format traits specialization for L24X8_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L24X8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose24_8 TransposeT; - typedef Format2<24, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<I32_FLOAT> - Format traits specialization for I32_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<I32_FLOAT> - : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<32> TransposeT; - typedef Format1<32> FormatT; -}; - 
-////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L32_FLOAT> - Format traits specialization for L32_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L32_FLOAT> - : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<32> TransposeT; - typedef Format1<32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<A32_FLOAT> - Format traits specialization for A32_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<A32_FLOAT> - : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<32> TransposeT; - typedef Format1<32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B8G8R8X8_UNORM> - Format traits specialization for B8G8R8X8_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B8G8R8X8_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 
0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B8G8R8X8_UNORM_SRGB> - Format traits specialization for B8G8R8X8_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B8G8R8X8_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{true}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8X8_UNORM> - Format traits specialization for R8G8B8X8_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8X8_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool 
isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8X8_UNORM_SRGB> - Format traits specialization for R8G8B8X8_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8X8_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{true}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R9G9B9E5_SHAREDEXP> - Format traits specialization for R9G9B9E5_SHAREDEXP -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R9G9B9E5_SHAREDEXP> - : ComponentTraits<SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 5>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose9_9_9_5 TransposeT; - typedef Format4<9, 9, 9, 5> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// 
FormatTraits<B10G10R10X2_UNORM> - Format traits specialization for B10G10R10X2_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B10G10R10X2_UNORM> : ComponentTraits<SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNORM, - 10, - SWR_TYPE_UNUSED, - 2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L16A16_FLOAT> - Format traits specialization for L16A16_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L16A16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{2}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{1}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16 TransposeT; - typedef Format2<16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R10G10B10X2_USCALED> - Format traits specialization for R10G10B10X2_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R10G10B10X2_USCALED> : ComponentTraits<SWR_TYPE_USCALED, - 10, - SWR_TYPE_USCALED, - 10, - 
SWR_TYPE_USCALED, - 10, - SWR_TYPE_UNUSED, - 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8A8_SSCALED> - Format traits specialization for R8G8B8A8_SSCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8A8_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, - 8, - SWR_TYPE_SSCALED, - 8, - SWR_TYPE_SSCALED, - 8, - SWR_TYPE_SSCALED, - 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8A8_USCALED> - Format traits specialization for R8G8B8A8_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8A8_USCALED> : ComponentTraits<SWR_TYPE_USCALED, - 8, - SWR_TYPE_USCALED, - 8, - SWR_TYPE_USCALED, - 8, - SWR_TYPE_USCALED, - 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - 
static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16_SSCALED> - Format traits specialization for R16G16_SSCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16 TransposeT; - typedef Format2<16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16_USCALED> - Format traits specialization for R16G16_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16_USCALED> : ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16 TransposeT; - typedef Format2<16, 16> FormatT; -}; - 
-////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32_SSCALED> - Format traits specialization for R32_SSCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32_SSCALED> - : ComponentTraits<SWR_TYPE_SSCALED, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<32> TransposeT; - typedef Format1<32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32_USCALED> - Format traits specialization for R32_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32_USCALED> - : ComponentTraits<SWR_TYPE_USCALED, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<32> TransposeT; - typedef Format1<32> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B5G6R5_UNORM> - Format traits specialization for B5G6R5_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B5G6R5_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>, - FormatSwizzle<2, 1, 0>, - Defaults<0, 0, 0, 0x3f800000> -{ - 
static const uint32_t bpp{16}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose5_6_5 TransposeT; - typedef Format3<5, 6, 5> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B5G6R5_UNORM_SRGB> - Format traits specialization for B5G6R5_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B5G6R5_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>, - FormatSwizzle<2, 1, 0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{true}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose5_6_5 TransposeT; - typedef Format3<5, 6, 5> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B5G5R5A1_UNORM> - Format traits specialization for B5G5R5A1_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B5G5R5A1_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const 
uint32_t bcHeight{1}; - - typedef Transpose5_5_5_1 TransposeT; - typedef Format4<5, 5, 5, 1> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B5G5R5A1_UNORM_SRGB> - Format traits specialization for B5G5R5A1_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B5G5R5A1_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{true}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose5_5_5_1 TransposeT; - typedef Format4<5, 5, 5, 1> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B4G4R4A4_UNORM> - Format traits specialization for B4G4R4A4_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B4G4R4A4_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose4_4_4_4 TransposeT; - typedef Format4<4, 4, 4, 4> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B4G4R4A4_UNORM_SRGB> - Format traits specialization for 
B4G4R4A4_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B4G4R4A4_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{true}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose4_4_4_4 TransposeT; - typedef Format4<4, 4, 4, 4> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8_UNORM> - Format traits specialization for R8G8_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8 TransposeT; - typedef Format2<8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8_SNORM> - Format traits specialization for R8G8_SNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8_SNORM> : ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{2}; - static const bool 
hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8 TransposeT; - typedef Format2<8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8_SINT> - Format traits specialization for R8G8_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8_SINT> : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8 TransposeT; - typedef Format2<8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8_UINT> - Format traits specialization for R8G8_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8_UINT> : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8 TransposeT; - typedef Format2<8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16_UNORM> - Format traits 
specialization for R16_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<16> TransposeT; - typedef Format1<16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16_SNORM> - Format traits specialization for R16_SNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16_SNORM> - : ComponentTraits<SWR_TYPE_SNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<16> TransposeT; - typedef Format1<16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16_SINT> - Format traits specialization for R16_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16_SINT> - : ComponentTraits<SWR_TYPE_SINT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool 
isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<16> TransposeT; - typedef Format1<16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16_UINT> - Format traits specialization for R16_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16_UINT> - : ComponentTraits<SWR_TYPE_UINT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<16> TransposeT; - typedef Format1<16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16_FLOAT> - Format traits specialization for R16_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16_FLOAT> - : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<16> TransposeT; - typedef Format1<16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<I16_UNORM> - Format traits specialization for I16_UNORM -////////////////////////////////////////////////////////////////////////// 
-template <> -struct FormatTraits<I16_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<16> TransposeT; - typedef Format1<16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L16_UNORM> - Format traits specialization for L16_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L16_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<16> TransposeT; - typedef Format1<16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<A16_UNORM> - Format traits specialization for A16_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<A16_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static 
const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<16> TransposeT; - typedef Format1<16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L8A8_UNORM> - Format traits specialization for L8A8_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L8A8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{2}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{1}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8 TransposeT; - typedef Format2<8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<I16_FLOAT> - Format traits specialization for I16_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<I16_FLOAT> - : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<16> TransposeT; - typedef Format1<16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L16_FLOAT> - Format traits specialization for L16_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L16_FLOAT> - : ComponentTraits<SWR_TYPE_FLOAT, 
16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<16> TransposeT; - typedef Format1<16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<A16_FLOAT> - Format traits specialization for A16_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<A16_FLOAT> - : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<16> TransposeT; - typedef Format1<16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L8A8_UNORM_SRGB> - Format traits specialization for L8A8_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L8A8_UNORM_SRGB> : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{2}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{1}; - static const bool isSRGB{true}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef 
Transpose8_8 TransposeT; - typedef Format2<8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B5G5R5X1_UNORM> - Format traits specialization for B5G5R5X1_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B5G5R5X1_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose5_5_5_1 TransposeT; - typedef Format4<5, 5, 5, 1> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B5G5R5X1_UNORM_SRGB> - Format traits specialization for B5G5R5X1_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B5G5R5X1_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{true}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose5_5_5_1 TransposeT; - typedef Format4<5, 5, 5, 1> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8_SSCALED> - Format traits specialization for R8G8_SSCALED 
-////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8 TransposeT; - typedef Format2<8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8_USCALED> - Format traits specialization for R8G8_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8_USCALED> : ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>, - FormatSwizzle<0, 1>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{2}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8 TransposeT; - typedef Format2<8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16_SSCALED> - Format traits specialization for R16_SSCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16_SSCALED> - : ComponentTraits<SWR_TYPE_SSCALED, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; 
- static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<16> TransposeT; - typedef Format1<16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16_USCALED> - Format traits specialization for R16_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16_USCALED> - : ComponentTraits<SWR_TYPE_USCALED, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<16> TransposeT; - typedef Format1<16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<A1B5G5R5_UNORM> - Format traits specialization for A1B5G5R5_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<A1B5G5R5_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 1, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5>, - FormatSwizzle<3, 2, 1, 0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose1_5_5_5 TransposeT; - typedef Format4<1, 5, 5, 5> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<A4B4G4R4_UNORM> - 
Format traits specialization for A4B4G4R4_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<A4B4G4R4_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>, - FormatSwizzle<3, 2, 1, 0>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose4_4_4_4 TransposeT; - typedef Format4<4, 4, 4, 4> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L8A8_UINT> - Format traits specialization for L8A8_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L8A8_UINT> : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{2}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{1}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8 TransposeT; - typedef Format2<8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L8A8_SINT> - Format traits specialization for L8A8_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L8A8_SINT> : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>, - FormatSwizzle<0, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{16}; - static const uint32_t numComps{2}; - static const bool 
hasAlpha{true}; - static const uint32_t alphaComp{1}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8 TransposeT; - typedef Format2<8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8_UNORM> - Format traits specialization for R8_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8_SNORM> - Format traits specialization for R8_SNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8_SNORM> - : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8_SINT> - Format traits specialization for 
R8_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8_SINT> - : ComponentTraits<SWR_TYPE_SINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8_UINT> - Format traits specialization for R8_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8_UINT> - : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<A8_UNORM> - Format traits specialization for A8_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<A8_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - 
static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<I8_UNORM> - Format traits specialization for I8_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<I8_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L8_UNORM> - Format traits specialization for L8_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L8_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8_SSCALED> - Format traits specialization for R8_SSCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8_SSCALED> - : 
ComponentTraits<SWR_TYPE_SSCALED, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8_USCALED> - Format traits specialization for R8_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8_USCALED> - : ComponentTraits<SWR_TYPE_USCALED, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L8_UNORM_SRGB> - Format traits specialization for L8_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L8_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{true}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef 
TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L8_UINT> - Format traits specialization for L8_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L8_UINT> - : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<L8_SINT> - Format traits specialization for L8_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<L8_SINT> - : ComponentTraits<SWR_TYPE_SINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<I8_UINT> - Format traits specialization for I8_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<I8_UINT> - : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{8}; - static const 
uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<I8_SINT> - Format traits specialization for I8_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<I8_SINT> - : ComponentTraits<SWR_TYPE_SINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<DXT1_RGB_SRGB> - Format traits specialization for DXT1_RGB_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<DXT1_RGB_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// 
FormatTraits<YCRCB_SWAPUVY> - Format traits specialization for YCRCB_SWAPUVY -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<YCRCB_SWAPUVY> - : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{true}; - static const uint32_t bcWidth{2}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<BC1_UNORM> - Format traits specialization for BC1_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<BC1_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<BC2_UNORM> - Format traits specialization for BC2_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<BC2_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{1}; - static const bool 
hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<BC3_UNORM> - Format traits specialization for BC3_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<BC3_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<BC4_UNORM> - Format traits specialization for BC4_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<BC4_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<BC5_UNORM> - Format traits 
specialization for BC5_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<BC5_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<BC1_UNORM_SRGB> - Format traits specialization for BC1_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<BC1_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{true}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<BC2_UNORM_SRGB> - Format traits specialization for BC2_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<BC2_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{true}; 
- static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<BC3_UNORM_SRGB> - Format traits specialization for BC3_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<BC3_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{true}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<YCRCB_SWAPUV> - Format traits specialization for YCRCB_SWAPUV -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<YCRCB_SWAPUV> - : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{true}; - static const uint32_t bcWidth{2}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8_8 TransposeT; - typedef Format4<8, 8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<DXT1_RGB> - Format traits 
specialization for DXT1_RGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<DXT1_RGB> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8_UNORM> - Format traits specialization for R8G8B8_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{24}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8 TransposeT; - typedef Format3<8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8_SNORM> - Format traits specialization for R8G8B8_SNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8_SNORM> - : ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{24}; - static const uint32_t numComps{3}; - static const bool 
hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8 TransposeT; - typedef Format3<8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8_SSCALED> - Format traits specialization for R8G8B8_SSCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8_SSCALED> - : ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{24}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8 TransposeT; - typedef Format3<8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8_USCALED> - Format traits specialization for R8G8B8_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8_USCALED> - : ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{24}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8 TransposeT; - typedef Format3<8, 8, 8> FormatT; -}; - 
-////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R64G64B64A64_FLOAT> - Format traits specialization for R64G64B64A64_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R64G64B64A64_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, - 64, - SWR_TYPE_FLOAT, - 64, - SWR_TYPE_FLOAT, - 64, - SWR_TYPE_FLOAT, - 64>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{256}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose64_64_64_64 TransposeT; - typedef Format4<64, 64, 64, 64> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R64G64B64_FLOAT> - Format traits specialization for R64G64B64_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R64G64B64_FLOAT> - : ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{192}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose64_64_64 TransposeT; - typedef Format3<64, 64, 64> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<BC4_SNORM> - Format traits specialization for BC4_SNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct 
FormatTraits<BC4_SNORM> - : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{64}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<BC5_SNORM> - Format traits specialization for BC5_SNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<BC5_SNORM> - : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16_FLOAT> - Format traits specialization for R16G16B16_FLOAT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16_FLOAT> - : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{48}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static 
const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16 TransposeT; - typedef Format3<16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16_UNORM> - Format traits specialization for R16G16B16_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{48}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16 TransposeT; - typedef Format3<16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16_SNORM> - Format traits specialization for R16G16B16_SNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16_SNORM> - : ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{48}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16 TransposeT; - typedef Format3<16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16_SSCALED> - Format traits specialization for 
R16G16B16_SSCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16_SSCALED> - : ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{48}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16 TransposeT; - typedef Format3<16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16_USCALED> - Format traits specialization for R16G16B16_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16_USCALED> - : ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{48}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16 TransposeT; - typedef Format3<16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<BC6H_SF16> - Format traits specialization for BC6H_SF16 -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<BC6H_SF16> - : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{1}; 
- static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<BC7_UNORM> - Format traits specialization for BC7_UNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<BC7_UNORM> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<BC7_UNORM_SRGB> - Format traits specialization for BC7_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<BC7_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{true}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// 
FormatTraits<BC6H_UF16> - Format traits specialization for BC6H_UF16 -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<BC6H_UF16> - : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{128}; - static const uint32_t numComps{1}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{true}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{4}; - static const uint32_t bcHeight{4}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8_UNORM_SRGB> - Format traits specialization for R8G8B8_UNORM_SRGB -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8_UNORM_SRGB> - : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{24}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{true}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8 TransposeT; - typedef Format3<8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16_UINT> - Format traits specialization for R16G16B16_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16_UINT> - : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{48}; - static 
const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16 TransposeT; - typedef Format3<16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R16G16B16_SINT> - Format traits specialization for R16G16B16_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R16G16B16_SINT> - : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{48}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose16_16_16 TransposeT; - typedef Format3<16, 16, 16> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R32_SFIXED> - Format traits specialization for R32_SFIXED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R32_SFIXED> - : ComponentTraits<SWR_TYPE_SFIXED, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<32> TransposeT; - typedef Format1<32> FormatT; -}; - 
-////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R10G10B10A2_SNORM> - Format traits specialization for R10G10B10A2_SNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R10G10B10A2_SNORM> : ComponentTraits<SWR_TYPE_SNORM, - 10, - SWR_TYPE_SNORM, - 10, - SWR_TYPE_SNORM, - 10, - SWR_TYPE_SNORM, - 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R10G10B10A2_USCALED> - Format traits specialization for R10G10B10A2_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R10G10B10A2_USCALED> : ComponentTraits<SWR_TYPE_USCALED, - 10, - SWR_TYPE_USCALED, - 10, - SWR_TYPE_USCALED, - 10, - SWR_TYPE_USCALED, - 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R10G10B10A2_SSCALED> - Format traits specialization for R10G10B10A2_SSCALED 
-////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R10G10B10A2_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, - 10, - SWR_TYPE_SSCALED, - 10, - SWR_TYPE_SSCALED, - 10, - SWR_TYPE_SSCALED, - 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R10G10B10A2_SINT> - Format traits specialization for R10G10B10A2_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R10G10B10A2_SINT> - : ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>, - FormatSwizzle<0, 1, 2, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B10G10R10A2_SNORM> - Format traits specialization for B10G10R10A2_SNORM -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B10G10R10A2_SNORM> : ComponentTraits<SWR_TYPE_SNORM, - 10, - SWR_TYPE_SNORM, - 10, - SWR_TYPE_SNORM, - 10, - SWR_TYPE_SNORM, - 
2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B10G10R10A2_USCALED> - Format traits specialization for B10G10R10A2_USCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B10G10R10A2_USCALED> : ComponentTraits<SWR_TYPE_USCALED, - 10, - SWR_TYPE_USCALED, - 10, - SWR_TYPE_USCALED, - 10, - SWR_TYPE_USCALED, - 2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B10G10R10A2_SSCALED> - Format traits specialization for B10G10R10A2_SSCALED -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B10G10R10A2_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, - 10, - SWR_TYPE_SSCALED, - 10, - SWR_TYPE_SSCALED, - 10, - SWR_TYPE_SSCALED, - 2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x3f800000> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const 
uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B10G10R10A2_UINT> - Format traits specialization for B10G10R10A2_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B10G10R10A2_UINT> - : ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<B10G10R10A2_SINT> - Format traits specialization for B10G10R10A2_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<B10G10R10A2_SINT> - : ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>, - FormatSwizzle<2, 1, 0, 3>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{32}; - static const uint32_t numComps{4}; - static const bool hasAlpha{true}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose10_10_10_2 TransposeT; - typedef Format4<10, 
10, 10, 2> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8_UINT> - Format traits specialization for R8G8B8_UINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8_UINT> - : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{24}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8 TransposeT; - typedef Format3<8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<R8G8B8_SINT> - Format traits specialization for R8G8B8_SINT -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<R8G8B8_SINT> - : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>, - FormatSwizzle<0, 1, 2>, - Defaults<0, 0, 0, 0x1> -{ - static const uint32_t bpp{24}; - static const uint32_t numComps{3}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{0}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef Transpose8_8_8 TransposeT; - typedef Format3<8, 8, 8> FormatT; -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatTraits<RAW> - Format traits specialization for RAW -////////////////////////////////////////////////////////////////////////// -template <> -struct FormatTraits<RAW> - : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1> -{ - static 
const uint32_t bpp{8}; - static const uint32_t numComps{1}; - static const bool hasAlpha{false}; - static const uint32_t alphaComp{3}; - static const bool isSRGB{false}; - static const bool isBC{false}; - static const bool isSubsampled{false}; - static const uint32_t bcWidth{1}; - static const uint32_t bcHeight{1}; - - typedef TransposeSingleComponent<8> TransposeT; - typedef Format1<8> FormatT; -}; diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h deleted file mode 100644 index 7d7dd843349..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/format_types.h +++ /dev/null @@ -1,1629 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file formats.h - * - * @brief Definitions for SWR_FORMAT functions. - * - ******************************************************************************/ -#pragma once - -#include "utils.h" -#include "common/simdintrin.h" - -////////////////////////////////////////////////////////////////////////// -/// PackTraits - Helpers for packing / unpacking same pixel sizes -////////////////////////////////////////////////////////////////////////// -template <uint32_t NumBits, bool Signed = false> -struct PackTraits -{ - static const uint32_t MyNumBits = NumBits; - - static simdscalar loadSOA(const uint8_t* pSrc) = delete; - static void storeSOA(uint8_t* pDst, simdscalar const& src) = delete; - static simdscalar unpack(simdscalar& in) = delete; - static simdscalar pack(simdscalar& in) = delete; - - static simd16scalar loadSOA_16(const uint8_t* pSrc) = delete; - static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) = delete; - static simd16scalar unpack(simd16scalar& in) = delete; - static simd16scalar pack(simd16scalar& in) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// PackTraits - Helpers for packing / unpacking unused channels -////////////////////////////////////////////////////////////////////////// -template <> -struct PackTraits<0, false> -{ - static const uint32_t MyNumBits = 0; - - static simdscalar loadSOA(const uint8_t* pSrc) { return _simd_setzero_ps(); } - static void storeSOA(uint8_t* pDst, simdscalar const& src) { return; } - static simdscalar unpack(simdscalar& in) { return _simd_setzero_ps(); } - static simdscalar pack(simdscalar& in) { return _simd_setzero_ps(); } - - static simd16scalar loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); } - static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { return; } - static simd16scalar unpack(simd16scalar& in) { return _simd16_setzero_ps(); } - static simd16scalar pack(simd16scalar& in) { return 
_simd16_setzero_ps(); } -}; - -////////////////////////////////////////////////////////////////////////// -/// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels -////////////////////////////////////////////////////////////////////////// -template <> -struct PackTraits<8, false> -{ - static const uint32_t MyNumBits = 8; - - static simdscalar loadSOA(const uint8_t* pSrc) - { -#if KNOB_SIMD_WIDTH == 8 - __m256 result = _mm256_setzero_ps(); - __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); - return _mm256_insertf128_ps(result, vLo, 0); -#else -#error Unsupported vector width -#endif - } - - static void storeSOA(uint8_t* pDst, simdscalar const& src) - { - // store simd bytes -#if KNOB_SIMD_WIDTH == 8 - _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src))); -#else -#error Unsupported vector width -#endif - } - - static simdscalar unpack(simdscalar& in) - { -#if KNOB_SIMD_WIDTH == 8 -#if KNOB_ARCH <= KNOB_ARCH_AVX - __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); - __m128i resLo = _mm_cvtepu8_epi32(src); - __m128i resHi = - _mm_shuffle_epi8(src, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); - - __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); - return simdscalar{_mm256_castsi256_ps(result)}; -#else - return _mm256_castsi256_ps( - _mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); -#endif -#else -#error Unsupported vector width -#endif - } - - static simdscalar pack(simdscalar& in) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalari src = _simd_castps_si(in); - __m128i res16 = - _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)); - __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128()); - return _mm256_castsi256_ps(_mm256_castsi128_si256(res8)); -#else -#error Unsupported vector width -#endif - } - - static simd16scalar loadSOA_16(const uint8_t* pSrc) - { - simd16scalar result = _simd16_setzero_ps(); 
- simdscalar resultlo = _simd_setzero_ps(); - - const __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc)); - - resultlo = _mm256_insertf128_ps(resultlo, src, 0); - result = _simd16_insert_ps(result, resultlo, 0); - - return result; - } - - static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) - { - // store simd16 bytes - _mm_store_ps(reinterpret_cast<float*>(pDst), - _mm256_castps256_ps128(_simd16_extract_ps(src, 0))); - } - - static simd16scalar unpack(simd16scalar& in) - { - simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))); - simd16scalari result = _simd16_cvtepu8_epi32(tmp); - - return _simd16_castsi_ps(result); - } - - static simd16scalar pack(simd16scalar& in) - { - // clang-format off - - simd16scalari result = _simd16_setzero_si(); - - simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) - simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF - - simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) - simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) - - simdscalari pack = _simd_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) - - const simdscalari zero = _simd_setzero_si(); - - permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) - permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) - - pack = _simd_packus_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) - - result = _simd16_insert_si(result, pack, 0); - - return _simd16_castsi_ps(result); - - // clang-format on - } -}; - -////////////////////////////////////////////////////////////////////////// -/// PackTraits - Helpers for packing / 
unpacking 8 bit signed channels -////////////////////////////////////////////////////////////////////////// -template <> -struct PackTraits<8, true> -{ - static const uint32_t MyNumBits = 8; - - static simdscalar loadSOA(const uint8_t* pSrc) - { -#if KNOB_SIMD_WIDTH == 8 - __m256 result = _mm256_setzero_ps(); - __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); - return _mm256_insertf128_ps(result, vLo, 0); -#else -#error Unsupported vector width -#endif - } - - static void storeSOA(uint8_t* pDst, simdscalar const& src) - { - // store simd bytes -#if KNOB_SIMD_WIDTH == 8 - _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src))); -#else -#error Unsupported vector width -#endif - } - - static simdscalar unpack(simdscalar& in) - { -#if KNOB_SIMD_WIDTH == 8 -#if KNOB_ARCH <= KNOB_ARCH_AVX - SWR_INVALID("I think this may be incorrect."); - __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); - __m128i resLo = _mm_cvtepi8_epi32(src); - __m128i resHi = - _mm_shuffle_epi8(src, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); - - __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); - return _mm256_castsi256_ps(result); -#else - return _mm256_castsi256_ps( - _mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); -#endif -#else -#error Unsupported vector width -#endif - } - - static simdscalar pack(simdscalar& in) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalari src = _simd_castps_si(in); - __m128i res16 = - _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)); - __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128()); - return _mm256_castsi256_ps(_mm256_castsi128_si256(res8)); -#else -#error Unsupported vector width -#endif - } - - static simd16scalar loadSOA_16(const uint8_t* pSrc) - { - simd16scalar result = _simd16_setzero_ps(); - simdscalar resultlo = _simd_setzero_ps(); - - const __m128 src = _mm_load_ps(reinterpret_cast<const 
float*>(pSrc)); - - resultlo = _mm256_insertf128_ps(resultlo, src, 0); - result = _simd16_insert_ps(result, resultlo, 0); - - return result; - } - - static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) - { - // store simd16 bytes - _mm_store_ps(reinterpret_cast<float*>(pDst), - _mm256_castps256_ps128(_simd16_extract_ps(src, 0))); - } - - static simd16scalar unpack(simd16scalar& in) - { - simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))); - simd16scalari result = _simd16_cvtepu8_epi32(tmp); - - return _simd16_castsi_ps(result); - } - - static simd16scalar pack(simd16scalar& in) - { - // clang-format off - - simd16scalari result = _simd16_setzero_si(); - - simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) - simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF - - simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) - simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) - - simdscalari pack = _simd_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) - - const simdscalari zero = _simd_setzero_si(); - - permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) - permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) - - pack = _simd_packs_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) - - result = _simd16_insert_si(result, pack, 0); - - return _simd16_castsi_ps(result); - - // clang-format on - } -}; - -////////////////////////////////////////////////////////////////////////// -/// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels 
-////////////////////////////////////////////////////////////////////////// -template <> -struct PackTraits<16, false> -{ - static const uint32_t MyNumBits = 16; - - static simdscalar loadSOA(const uint8_t* pSrc) - { -#if KNOB_SIMD_WIDTH == 8 - __m256 result = _mm256_setzero_ps(); - __m128 vLo = _mm_load_ps((const float*)pSrc); - return _mm256_insertf128_ps(result, vLo, 0); -#else -#error Unsupported vector width -#endif - } - - static void storeSOA(uint8_t* pDst, simdscalar const& src) - { -#if KNOB_SIMD_WIDTH == 8 - // store 16B (2B * 8) - _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src)); -#else -#error Unsupported vector width -#endif - } - - static simdscalar unpack(simdscalar& in) - { -#if KNOB_SIMD_WIDTH == 8 -#if KNOB_ARCH <= KNOB_ARCH_AVX - __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); - __m128i resLo = _mm_cvtepu16_epi32(src); - __m128i resHi = - _mm_shuffle_epi8(src, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); - - __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); - return _mm256_castsi256_ps(result); -#else - return _mm256_castsi256_ps( - _mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); -#endif -#else -#error Unsupported vector width -#endif - } - - static simdscalar pack(simdscalar& in) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalari src = _simd_castps_si(in); - __m256i res = _mm256_castsi128_si256( - _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); - return _mm256_castsi256_ps(res); -#else -#error Unsupported vector width -#endif - } - - static simd16scalar loadSOA_16(const uint8_t* pSrc) - { - simd16scalar result = _simd16_setzero_ps(); - - simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float*>(pSrc)); - - result = _simd16_insert_ps(result, resultlo, 0); - - return result; - } - - static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) - { - 
_simd_store_ps(reinterpret_cast<float*>(pDst), _simd16_extract_ps(src, 0)); - } - - static simd16scalar unpack(simd16scalar& in) - { - simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0))); - - return _simd16_castsi_ps(result); - } - - static simd16scalar pack(simd16scalar& in) - { - // clang-format off - - const simd16scalari zero = _simd16_setzero_si(); - - simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) - simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 - - simd16scalari result = _simd16_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b) - - return _simd16_castsi_ps(result); - - // clang-format on - } -}; - -////////////////////////////////////////////////////////////////////////// -/// PackTraits - Helpers for packing / unpacking 16 bit signed channels -////////////////////////////////////////////////////////////////////////// -template <> -struct PackTraits<16, true> -{ - static const uint32_t MyNumBits = 16; - - static simdscalar loadSOA(const uint8_t* pSrc) - { -#if KNOB_SIMD_WIDTH == 8 - __m256 result = _mm256_setzero_ps(); - __m128 vLo = _mm_load_ps((const float*)pSrc); - return _mm256_insertf128_ps(result, vLo, 0); -#else -#error Unsupported vector width -#endif - } - - static void storeSOA(uint8_t* pDst, simdscalar const& src) - { -#if KNOB_SIMD_WIDTH == 8 - // store 16B (2B * 8) - _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src)); -#else -#error Unsupported vector width -#endif - } - - static simdscalar unpack(simdscalar& in) - { -#if KNOB_SIMD_WIDTH == 8 -#if KNOB_ARCH <= KNOB_ARCH_AVX - SWR_INVALID("I think this may be incorrect."); - __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); - __m128i resLo = 
_mm_cvtepi16_epi32(src); - __m128i resHi = - _mm_shuffle_epi8(src, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); - - __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); - return _mm256_castsi256_ps(result); -#else - return _mm256_castsi256_ps( - _mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); -#endif -#else -#error Unsupported vector width -#endif - } - - static simdscalar pack(simdscalar& in) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalari src = _simd_castps_si(in); - __m256i res = _mm256_castsi128_si256( - _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); - return _mm256_castsi256_ps(res); -#else -#error Unsupported vector width -#endif - } - - static simd16scalar loadSOA_16(const uint8_t* pSrc) - { - simd16scalar result = _simd16_setzero_ps(); - - simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float*>(pSrc)); - - result = _simd16_insert_ps(result, resultlo, 0); - - return result; - } - - static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) - { - _simd_store_ps(reinterpret_cast<float*>(pDst), _simd16_extract_ps(src, 0)); - } - - static simd16scalar unpack(simd16scalar& in) - { - simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0))); - - return _simd16_castsi_ps(result); - } - - static simd16scalar pack(simd16scalar& in) - { - // clang-format off - - const simd16scalari zero = _simd16_setzero_si(); - - simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) - simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 - - simd16scalari result = _simd16_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b) - - return 
_simd16_castsi_ps(result); - - // clang-format on - } -}; - -////////////////////////////////////////////////////////////////////////// -/// PackTraits - Helpers for packing / unpacking 32 bit channels -////////////////////////////////////////////////////////////////////////// -template <> -struct PackTraits<32, false> -{ - static const uint32_t MyNumBits = 32; - - static simdscalar loadSOA(const uint8_t* pSrc) { return _simd_load_ps((const float*)pSrc); } - static void storeSOA(uint8_t* pDst, simdscalar const& src) - { - _simd_store_ps((float*)pDst, src); - } - static simdscalar unpack(simdscalar& in) { return in; } - static simdscalar pack(simdscalar& in) { return in; } - - static simd16scalar loadSOA_16(const uint8_t* pSrc) - { - return _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); - } - - static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) - { - _simd16_store_ps(reinterpret_cast<float*>(pDst), src); - } - - static simd16scalar unpack(simd16scalar& in) { return in; } - - static simd16scalar pack(simd16scalar& in) { return in; } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits. 
-////////////////////////////////////////////////////////////////////////// -template <SWR_TYPE type, uint32_t NumBits> -struct TypeTraits : PackTraits<NumBits> -{ - static const SWR_TYPE MyType = type; - static float toFloat() { return 0.0; } - static float fromFloat() - { - SWR_NOT_IMPL; - return 0.0; - } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for UINT8 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8> -{ - static const SWR_TYPE MyType = SWR_TYPE_UINT; - static float toFloat() { return 0.0; } - static float fromFloat() - { - SWR_NOT_IMPL; - return 0.0; - } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for UINT8 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true> -{ - static const SWR_TYPE MyType = SWR_TYPE_SINT; - static float toFloat() { return 0.0; } - static float fromFloat() - { - SWR_NOT_IMPL; - return 0.0; - } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for UINT16 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16> -{ - static const SWR_TYPE MyType = SWR_TYPE_UINT; - static float toFloat() { return 0.0; } - static float fromFloat() - { - SWR_NOT_IMPL; - return 0.0; - } - static simdscalar convertSrgb(simdscalar& in) - 
{ - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for SINT16 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true> -{ - static const SWR_TYPE MyType = SWR_TYPE_SINT; - static float toFloat() { return 0.0; } - static float fromFloat() - { - SWR_NOT_IMPL; - return 0.0; - } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for UINT32 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32> -{ - static const SWR_TYPE MyType = SWR_TYPE_UINT; - static float toFloat() { return 0.0; } - static float fromFloat() - { - SWR_NOT_IMPL; - return 0.0; - } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for UINT32 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32> -{ - static const SWR_TYPE MyType = SWR_TYPE_SINT; - static float toFloat() { return 0.0; } - static float fromFloat() - { - SWR_NOT_IMPL; - return 0.0; - } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for UNORM5 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5> -{ - static 
const SWR_TYPE MyType = SWR_TYPE_UNORM; - static float toFloat() { return 1.0f / 31.0f; } - static float fromFloat() { return 31.0f; } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for UNORM6 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6> -{ - static const SWR_TYPE MyType = SWR_TYPE_UNORM; - static float toFloat() { return 1.0f / 63.0f; } - static float fromFloat() { return 63.0f; } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for UNORM8 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8> -{ - static const SWR_TYPE MyType = SWR_TYPE_UNORM; - static float toFloat() { return 1.0f / 255.0f; } - static float fromFloat() { return 255.0f; } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for UNORM8 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true> -{ - static const SWR_TYPE MyType = SWR_TYPE_SNORM; - static float toFloat() { return 1.0f / 127.0f; } - static float fromFloat() { return 127.0f; } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for UNORM16 
-////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16> -{ - static const SWR_TYPE MyType = SWR_TYPE_UNORM; - static float toFloat() { return 1.0f / 65535.0f; } - static float fromFloat() { return 65535.0f; } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for SNORM16 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true> -{ - static const SWR_TYPE MyType = SWR_TYPE_UNORM; - static float toFloat() { return 1.0f / 32767.0f; } - static float fromFloat() { return 32767.0f; } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for UNORM24 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_UNORM, 24> : PackTraits<32> -{ - static const SWR_TYPE MyType = SWR_TYPE_UNORM; - static float toFloat() { return 1.0f / 16777215.0f; } - static float fromFloat() { return 16777215.0f; } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -// FLOAT Specializations from here on... 
-////////////////////////////////////////////////////////////////////////// -#define TO_M128i(a) _mm_castps_si128(a) -#define TO_M128(a) _mm_castsi128_ps(a) - -#include "math.h" - -template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden> -inline static __m128 fastpow(__m128 arg) -{ - __m128 ret = arg; - - static const __m128 factor = - _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f) * - powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum)); - - // Apply a constant pre-correction factor. - ret = _mm_mul_ps(ret, factor); - - // Reinterpret arg as integer to obtain logarithm. - // asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret)); - ret = _mm_cvtepi32_ps(_mm_castps_si128(ret)); - - // Multiply logarithm by power. - ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden)); - - // Convert back to "integer" to exponentiate. - // asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret)); - ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret)); - - return ret; -} - -inline static __m128 pow512_4(__m128 arg) -{ - // 5/12 is too small, so compute the 4th root of 20/12 instead. - // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. 
- // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 - __m128 xf = fastpow<2, 3, int(0.629960524947437 * 1e9), int(1e9)>(arg); - __m128 xover = _mm_mul_ps(arg, xf); - - __m128 xfm1 = _mm_rsqrt_ps(xf); - __m128 x2 = _mm_mul_ps(arg, arg); - __m128 xunder = _mm_mul_ps(x2, xfm1); - - // sqrt2 * over + 2 * sqrt2 * under - __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), - _mm_add_ps(xover, xunder)); - - xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); - xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); - return xavg; -} - -inline static __m128 powf_wrapper(__m128 Base, float Exp) -{ - float* f = (float*)(&Base); - - return _mm_set_ps(powf(f[3], Exp), powf(f[2], Exp), powf(f[1], Exp), powf(f[0], Exp)); -} - -static inline __m128 ConvertFloatToSRGB2(__m128& Src) -{ - // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float - // value - __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src)); - - // squeeze the mask down to 16 bits (4 bits per DWORD) - int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask); - - __m128 Result; - - // - if (CompareResult == 0xFFFF) - { - // all DWORDs are <= the threshold - Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f)); - } - else if (CompareResult == 0x0) - { - // all DWORDs are > the threshold - __m128 fSrc_0RGB = Src; - - // --> 1.055f * c(1.0f/2.4f) - 0.055f -#if KNOB_USE_FAST_SRGB == TRUE - // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation. 
- __m128 f = pow512_4(fSrc_0RGB); -#else - __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f); -#endif - f = _mm_mul_ps(f, _mm_set1_ps(1.055f)); - Result = _mm_sub_ps(f, _mm_set1_ps(0.055f)); - } - else - { - // some DWORDs are <= the threshold and some are > threshold - __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f)); - - __m128 fSrc_0RGB = Src; - - // --> 1.055f * c(1.0f/2.4f) - 0.055f -#if KNOB_USE_FAST_SRGB == TRUE - // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation. - __m128 f = pow512_4(fSrc_0RGB); -#else - __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f); -#endif - f = _mm_mul_ps(f, _mm_set1_ps(1.055f)); - f = _mm_sub_ps(f, _mm_set1_ps(0.055f)); - - // Clear the alpha (is garbage after the sub) - __m128i i = _mm_and_si128(TO_M128i(f), - _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)); - - __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm)); - __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i); - __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart); - - Result = TO_M128(CombinedParts); - } - - return Result; -} - -template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden> -inline static simd16scalar SIMDCALL fastpow(simd16scalar const& value) -{ - static const float factor1 = exp2(127.0f * expden / expnum - 127.0f) * - powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum); - - // Apply a constant pre-correction factor. - simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1)); - - // Reinterpret arg as integer to obtain logarithm. - // asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result)); - result = _simd16_cvtepi32_ps(_simd16_castps_si(result)); - - // Multiply logarithm by power. - result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden)); - - // Convert back to "integer" to exponentiate. 
- // asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result)); - result = _simd16_castsi_ps(_simd16_cvtps_epi32(result)); - - return result; -} - -inline static simd16scalar SIMDCALL pow512_4(simd16scalar const& arg) -{ - // 5/12 is too small, so compute the 4th root of 20/12 instead. - // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. - // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 - simd16scalar xf = fastpow<2, 3, int(0.629960524947437 * 1e9), int(1e9)>(arg); - simd16scalar xover = _simd16_mul_ps(arg, xf); - - simd16scalar xfm1 = _simd16_rsqrt_ps(xf); - simd16scalar x2 = _simd16_mul_ps(arg, arg); - simd16scalar xunder = _simd16_mul_ps(x2, xfm1); - - // sqrt2 * over + 2 * sqrt2 * under - simd16scalar xavg = - _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), - _simd16_add_ps(xover, xunder)); - - xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg)); - xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg)); - - return xavg; -} - -inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar& base, float exp) -{ - const float* f = reinterpret_cast<const float*>(&base); - - return _simd16_set_ps(powf(f[15], exp), - powf(f[14], exp), - powf(f[13], exp), - powf(f[12], exp), - powf(f[11], exp), - powf(f[10], exp), - powf(f[9], exp), - powf(f[8], exp), - powf(f[7], exp), - powf(f[6], exp), - powf(f[5], exp), - powf(f[4], exp), - powf(f[3], exp), - powf(f[2], exp), - powf(f[1], exp), - powf(f[0], exp)); -} - -// float to SRGB conversion formula -// -// if (value < 0.0031308f) -// value *= 12.92f; -// else -// value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f; -// -static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar& value) -{ - // create a mask where the source is < the minimal SRGB float value - const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f)); - - // if all elements are < the threshold, result = value * 12.92 - simd16scalar result = _simd16_mul_ps(value, 
_simd16_set1_ps(12.92f)); - - if (_simd16_mask2int(mask) != 0xFFFF) - { - // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055 -#if KNOB_USE_FAST_SRGB == TRUE - // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation. - simd16scalar result2 = pow512_4(value); -#else - simd16scalar result2 = powf_wrapper(value, 1.0f / 2.4f); -#endif - - result2 = _simd16_mul_ps(result2, _simd16_set1_ps(1.055f)); - result2 = _simd16_sub_ps(result2, _simd16_set1_ps(0.055f)); - -#if (KNOB_ARCH == KNOB_ARCH_AVX512) - // only native AVX512 can directly use the computed mask for the blend operation - result = _mm512_mask_blend_ps(mask, result2, result); -#else - result = _simd16_blendv_ps( - result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f))); -#endif - } - - return result; -} - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for FLOAT16 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> -{ - static const SWR_TYPE MyType = SWR_TYPE_FLOAT; - static float toFloat() { return 1.0f; } - static float fromFloat() { return 1.0f; } - static simdscalar convertSrgb(simdscalar& in) - { - SWR_NOT_IMPL; - return _simd_setzero_ps(); - } - - static simdscalar pack(const simdscalar& in) - { -#if KNOB_SIMD_WIDTH == 8 -#if (KNOB_ARCH == KNOB_ARCH_AVX) - // input is 8 packed float32, output is 8 packed float16 - simdscalari src = _simd_castps_si(in); - - static const uint32_t FLOAT_EXP_BITS = 8; - static const uint32_t FLOAT_MANTISSA_BITS = 23; - static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1; - static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS; - - static const uint32_t HALF_EXP_BITS = 5; - static const uint32_t HALF_MANTISSA_BITS = 10; - static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << 
HALF_MANTISSA_BITS; - - // minimum exponent required, exponents below this are flushed to 0. - static const int32_t HALF_EXP_MIN = -14; - static const int32_t FLOAT_EXP_BIAS = 127; - static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS; - static const int32_t FLOAT_EXP_MIN_FTZ = - FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand - - // maximum exponent required, exponents above this are set to infinity - static const int32_t HALF_EXP_MAX = 15; - static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS; - - const simdscalari vSignMask = _simd_set1_epi32(0x80000000); - const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK); - const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK); - const simdscalari vExpMin = - _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS)); - const simdscalari vExpMinFtz = - _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS)); - const simdscalari vExpMax = - _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS)); - - simdscalari vSign = _simd_and_si(src, vSignMask); - simdscalari vExp = _simd_and_si(src, vExpMask); - simdscalari vMan = _simd_and_si(src, vManMask); - - simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz); - simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin)); - simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp); - simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp)); - - simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), - _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS)); - - // pack output 16-bits into the lower 16-bits of each 32-bit channel - simdscalari vDst = - _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK)); - vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); - - // Flush To Zero - vDst = 
_simd_andnot_si(vFTZMask, vDst); - // Apply Infinites / NaN - vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK))); - - // Apply clamps - vDst = _simd_andnot_si(vClampMask, vDst); - vDst = _simd_or_si(vDst, _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF))); - - // Compute Denormals (subnormals) - if (!_mm256_testz_si256(vDenormMask, vDenormMask)) - { - uint32_t* pDenormMask = (uint32_t*)&vDenormMask; - uint32_t* pExp = (uint32_t*)&vExp; - uint32_t* pMan = (uint32_t*)&vMan; - uint32_t* pDst = (uint32_t*)&vDst; - for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) - { - if (pDenormMask[i]) - { - // Need to compute subnormal value - uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS; - uint32_t mantissa = - pMan[i] | (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. - // Make it explicit - - pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + - (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); - } - } - } - - // Add in sign bits - vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16)); - - // Pack to lower 128-bits - vDst = _mm256_castsi128_si256( - _mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1))); - -#if 0 -#if !defined(NDEBUG) - simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)); - - for (uint32_t i = 0; i < 4; ++i) - { - SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]); - } -#endif -#endif - - return _simd_castsi_ps(vDst); - -#else - return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC))); -#endif -#else -#error Unsupported vector width -#endif - } - - static simdscalar unpack(const simdscalar& in) - { - // input is 8 packed float16, output is 8 packed float32 - SWR_NOT_IMPL; // @todo - return _simd_setzero_ps(); - } - - static simd16scalar pack(const simd16scalar& in) - { - simd16scalari result = _simd16_setzero_si(); - simdscalari resultlo = _simd_setzero_si(); - -#if (KNOB_ARCH == KNOB_ARCH_AVX) - simdscalar simdlo = 
pack(_simd16_extract_ps(in, 0)); - simdscalar simdhi = pack(_simd16_extract_ps(in, 1)); - - __m128i templo = _simd_extractf128_si(_simd_castps_si(simdlo), 0); - __m128i temphi = _simd_extractf128_si(_simd_castps_si(simdhi), 0); - -#else - __m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC); - __m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC); - -#endif - resultlo = _simd_insertf128_si(resultlo, templo, 0); - resultlo = _simd_insertf128_si(resultlo, temphi, 1); - - result = _simd16_insert_si(result, resultlo, 0); - - return _simd16_castsi_ps(result); - } - - static simd16scalar unpack(const simd16scalar& in) - { - // input is 16 packed float16, output is 16 packed float32 - SWR_NOT_IMPL; // @todo - return _simd16_setzero_ps(); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// TypeTraits - Format type traits specialization for FLOAT32 -////////////////////////////////////////////////////////////////////////// -template <> -struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32> -{ - static const SWR_TYPE MyType = SWR_TYPE_FLOAT; - static float toFloat() { return 1.0f; } - static float fromFloat() { return 1.0f; } - static inline simdscalar convertSrgb(simdscalar& in) - { -#if KNOB_SIMD_WIDTH == 8 - __m128 srcLo = _mm256_extractf128_ps(in, 0); - __m128 srcHi = _mm256_extractf128_ps(in, 1); - - srcLo = ConvertFloatToSRGB2(srcLo); - srcHi = ConvertFloatToSRGB2(srcHi); - - in = _mm256_insertf128_ps(in, srcLo, 0); - in = _mm256_insertf128_ps(in, srcHi, 1); -#else -#error Unsupported vector width -#endif - return in; - } - - static inline simd16scalar convertSrgb(simd16scalar& in) { return ConvertFloatToSRGB2(in); } -}; - -////////////////////////////////////////////////////////////////////////// -/// FormatIntType - Calculate base integer type for pixel components based -/// on total number of bits. 
Components can be smaller -/// that this type, but the entire pixel must not be -/// any smaller than this type. -////////////////////////////////////////////////////////////////////////// -template <uint32_t bits, bool bits8 = bits <= 8, bool bits16 = bits <= 16> -struct FormatIntType -{ - typedef uint32_t TYPE; -}; - -template <uint32_t bits> -struct FormatIntType<bits, true, true> -{ - typedef uint8_t TYPE; -}; - -template <uint32_t bits> -struct FormatIntType<bits, false, true> -{ - typedef uint16_t TYPE; -}; - -////////////////////////////////////////////////////////////////////////// -/// Format1 - Bitfield for single component formats. -////////////////////////////////////////////////////////////////////////// -template <uint32_t x> -union Format1 -{ - typedef typename FormatIntType<x>::TYPE TYPE; - struct - { - TYPE r : x; - }; - - ///@ The following are here to provide full template needed in Formats. - struct - { - TYPE g : x; - }; - struct - { - TYPE b : x; - }; - struct - { - TYPE a : x; - }; -}; - -////////////////////////////////////////////////////////////////////////// -/// Format2 - Bitfield for 2 component formats. -////////////////////////////////////////////////////////////////////////// -template <uint32_t x, uint32_t y> -union Format2 -{ - typedef typename FormatIntType<x + y>::TYPE TYPE; - - struct - { - TYPE r : x; - TYPE g : y; - }; - struct - { - ///@ The following are here to provide full template needed in Formats. - TYPE b : x; - TYPE a : y; - }; -}; - -////////////////////////////////////////////////////////////////////////// -/// Format3 - Bitfield for 3 component formats. -////////////////////////////////////////////////////////////////////////// -template <uint32_t x, uint32_t y, uint32_t z> -union Format3 -{ - typedef typename FormatIntType<x + y + z>::TYPE TYPE; - - struct - { - TYPE r : x; - TYPE g : y; - TYPE b : z; - }; - TYPE a; ///@note This is here to provide full template needed in Formats. 
-}; - -////////////////////////////////////////////////////////////////////////// -/// Format4 - Bitfield for 4 component formats. -////////////////////////////////////////////////////////////////////////// -template <uint32_t x, uint32_t y, uint32_t z, uint32_t w> -struct Format4 -{ - typedef typename FormatIntType<x + y + z + w>::TYPE TYPE; - - TYPE r : x; - TYPE g : y; - TYPE b : z; - TYPE a : w; -}; - -////////////////////////////////////////////////////////////////////////// -/// ComponentTraits - Default components -////////////////////////////////////////////////////////////////////////// -template <uint32_t x, uint32_t y, uint32_t z, uint32_t w> -struct Defaults -{ - INLINE static uint32_t GetDefault(uint32_t comp) - { - static const uint32_t defaults[4]{x, y, z, w}; - return defaults[comp]; - } -}; - -////////////////////////////////////////////////////////////////////////// -/// ComponentTraits - Component type traits. -////////////////////////////////////////////////////////////////////////// -template <SWR_TYPE X, - uint32_t NumBitsX, - SWR_TYPE Y = SWR_TYPE_UNKNOWN, - uint32_t NumBitsY = 0, - SWR_TYPE Z = SWR_TYPE_UNKNOWN, - uint32_t NumBitsZ = 0, - SWR_TYPE W = SWR_TYPE_UNKNOWN, - uint32_t NumBitsW = 0> -struct ComponentTraits -{ - INLINE static SWR_TYPE GetType(uint32_t comp) - { - static const SWR_TYPE CompType[4]{X, Y, Z, W}; - return CompType[comp]; - } - - INLINE static constexpr uint32_t GetConstBPC(uint32_t comp) - { - return (comp == 3) ? NumBitsW - : ((comp == 2) ? NumBitsZ : ((comp == 1) ? NumBitsY : NumBitsX)); - } - - INLINE static uint32_t GetBPC(uint32_t comp) - { - static const uint32_t MyBpc[4]{NumBitsX, NumBitsY, NumBitsZ, NumBitsW}; - return MyBpc[comp]; - } - - INLINE static bool isNormalized(uint32_t comp) - { - switch (comp) - { - case 0: - return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false; - case 1: - return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? 
true : false; - case 2: - return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false; - case 3: - return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false; - } - SWR_INVALID("Invalid component: %d", comp); - return false; - } - - INLINE static float toFloat(uint32_t comp) - { - switch (comp) - { - case 0: - return TypeTraits<X, NumBitsX>::toFloat(); - case 1: - return TypeTraits<Y, NumBitsY>::toFloat(); - case 2: - return TypeTraits<Z, NumBitsZ>::toFloat(); - case 3: - return TypeTraits<W, NumBitsW>::toFloat(); - } - SWR_INVALID("Invalid component: %d", comp); - return TypeTraits<X, NumBitsX>::toFloat(); - } - - INLINE static float fromFloat(uint32_t comp) - { - switch (comp) - { - case 0: - return TypeTraits<X, NumBitsX>::fromFloat(); - case 1: - return TypeTraits<Y, NumBitsY>::fromFloat(); - case 2: - return TypeTraits<Z, NumBitsZ>::fromFloat(); - case 3: - return TypeTraits<W, NumBitsW>::fromFloat(); - } - SWR_INVALID("Invalid component: %d", comp); - return TypeTraits<X, NumBitsX>::fromFloat(); - } - - INLINE static void loadSOA(uint32_t comp, const uint8_t* pSrc, simdscalar& dst) - { - switch (comp) - { - case 0: - dst = TypeTraits<X, NumBitsX>::loadSOA(pSrc); - return; - case 1: - dst = TypeTraits<Y, NumBitsY>::loadSOA(pSrc); - return; - case 2: - dst = TypeTraits<Z, NumBitsZ>::loadSOA(pSrc); - return; - case 3: - dst = TypeTraits<W, NumBitsW>::loadSOA(pSrc); - return; - } - SWR_INVALID("Invalid component: %d", comp); - dst = TypeTraits<X, NumBitsX>::loadSOA(pSrc); - } - - INLINE static void storeSOA(uint32_t comp, uint8_t* pDst, simdscalar const& src) - { - switch (comp) - { - case 0: - TypeTraits<X, NumBitsX>::storeSOA(pDst, src); - return; - case 1: - TypeTraits<Y, NumBitsY>::storeSOA(pDst, src); - return; - case 2: - TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src); - return; - case 3: - TypeTraits<W, NumBitsW>::storeSOA(pDst, src); - return; - } - SWR_INVALID("Invalid component: %d", comp); - } - - INLINE static simdscalar unpack(uint32_t 
comp, simdscalar& in) - { - simdscalar out; - switch (comp) - { - case 0: - out = TypeTraits<X, NumBitsX>::unpack(in); - break; - case 1: - out = TypeTraits<Y, NumBitsY>::unpack(in); - break; - case 2: - out = TypeTraits<Z, NumBitsZ>::unpack(in); - break; - case 3: - out = TypeTraits<W, NumBitsW>::unpack(in); - break; - default: - SWR_INVALID("Invalid component: %d", comp); - out = in; - break; - } - return out; - } - - INLINE static simdscalar pack(uint32_t comp, simdscalar& in) - { - simdscalar out; - switch (comp) - { - case 0: - out = TypeTraits<X, NumBitsX>::pack(in); - break; - case 1: - out = TypeTraits<Y, NumBitsY>::pack(in); - break; - case 2: - out = TypeTraits<Z, NumBitsZ>::pack(in); - break; - case 3: - out = TypeTraits<W, NumBitsW>::pack(in); - break; - default: - SWR_INVALID("Invalid component: %d", comp); - out = in; - break; - } - return out; - } - - INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar& in) - { - switch (comp) - { - case 0: - return TypeTraits<X, NumBitsX>::convertSrgb(in); - case 1: - return TypeTraits<Y, NumBitsY>::convertSrgb(in); - case 2: - return TypeTraits<Z, NumBitsZ>::convertSrgb(in); - case 3: - return TypeTraits<W, NumBitsW>::convertSrgb(in); - } - SWR_INVALID("Invalid component: %d", comp); - return TypeTraits<X, NumBitsX>::convertSrgb(in); - } - - INLINE static void SIMDCALL loadSOA(uint32_t comp, const uint8_t* pSrc, simd16scalar& dst) - { - switch (comp) - { - case 0: - dst = TypeTraits<X, NumBitsX>::loadSOA_16(pSrc); - return; - case 1: - dst = TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc); - return; - case 2: - dst = TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc); - return; - case 3: - dst = TypeTraits<W, NumBitsW>::loadSOA_16(pSrc); - return; - } - SWR_INVALID("Invalid component: %d", comp); - dst = TypeTraits<X, NumBitsX>::loadSOA_16(pSrc); - } - - INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t* pDst, simd16scalar const& src) - { - switch (comp) - { - case 0: - TypeTraits<X, NumBitsX>::storeSOA(pDst, 
src); - return; - case 1: - TypeTraits<Y, NumBitsY>::storeSOA(pDst, src); - return; - case 2: - TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src); - return; - case 3: - TypeTraits<W, NumBitsW>::storeSOA(pDst, src); - return; - } - SWR_INVALID("Invalid component: %d", comp); - TypeTraits<X, NumBitsX>::storeSOA(pDst, src); - } - - INLINE static simd16scalar unpack(uint32_t comp, simd16scalar& in) - { - switch (comp) - { - case 0: - return TypeTraits<X, NumBitsX>::unpack(in); - case 1: - return TypeTraits<Y, NumBitsY>::unpack(in); - case 2: - return TypeTraits<Z, NumBitsZ>::unpack(in); - case 3: - return TypeTraits<W, NumBitsW>::unpack(in); - } - SWR_INVALID("Invalid component: %d", comp); - return TypeTraits<X, NumBitsX>::unpack(in); - } - - INLINE static simd16scalar pack(uint32_t comp, simd16scalar& in) - { - switch (comp) - { - case 0: - return TypeTraits<X, NumBitsX>::pack(in); - case 1: - return TypeTraits<Y, NumBitsY>::pack(in); - case 2: - return TypeTraits<Z, NumBitsZ>::pack(in); - case 3: - return TypeTraits<W, NumBitsW>::pack(in); - } - SWR_INVALID("Invalid component: %d", comp); - return TypeTraits<X, NumBitsX>::pack(in); - } - - INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar& in) - { - switch (comp) - { - case 0: - return TypeTraits<X, NumBitsX>::convertSrgb(in); - case 1: - return TypeTraits<Y, NumBitsY>::convertSrgb(in); - case 2: - return TypeTraits<Z, NumBitsZ>::convertSrgb(in); - case 3: - return TypeTraits<W, NumBitsW>::convertSrgb(in); - } - SWR_INVALID("Invalid component: %d", comp); - return TypeTraits<X, NumBitsX>::convertSrgb(in); - } -}; diff --git a/src/gallium/drivers/swr/rasterizer/core/format_utils.h b/src/gallium/drivers/swr/rasterizer/core/format_utils.h deleted file mode 100644 index 7c0b62f1910..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/format_utils.h +++ /dev/null @@ -1,939 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel 
Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file utils.h - * - * @brief Utilities used by SWR core related to pixel formats. 
- * - ******************************************************************************/ -#pragma once - -#include "core/utils.h" -#include "common/simdintrin.h" - -INLINE -void vTranspose(simd4scalar& row0, simd4scalar& row1, simd4scalar& row2, simd4scalar& row3) -{ - simd4scalari row0i = SIMD128::castps_si(row0); - simd4scalari row1i = SIMD128::castps_si(row1); - simd4scalari row2i = SIMD128::castps_si(row2); - simd4scalari row3i = SIMD128::castps_si(row3); - - simd4scalari vTemp = row2i; - row2i = SIMD128::unpacklo_epi32(row2i, row3i); - vTemp = SIMD128::unpackhi_epi32(vTemp, row3i); - - row3i = row0i; - row0i = SIMD128::unpacklo_epi32(row0i, row1i); - row3i = SIMD128::unpackhi_epi32(row3i, row1i); - - row1i = row0i; - row0i = SIMD128::unpacklo_epi64(row0i, row2i); - row1i = SIMD128::unpackhi_epi64(row1i, row2i); - - row2i = row3i; - row2i = SIMD128::unpacklo_epi64(row2i, vTemp); - row3i = SIMD128::unpackhi_epi64(row3i, vTemp); - - row0 = SIMD128::castsi_ps(row0i); - row1 = SIMD128::castsi_ps(row1i); - row2 = SIMD128::castsi_ps(row2i); - row3 = SIMD128::castsi_ps(row3i); -} - -INLINE -void vTranspose(simd4scalari& row0, simd4scalari& row1, simd4scalari& row2, simd4scalari& row3) -{ - simd4scalari vTemp = row2; - row2 = SIMD128::unpacklo_epi32(row2, row3); - vTemp = SIMD128::unpackhi_epi32(vTemp, row3); - - row3 = row0; - row0 = SIMD128::unpacklo_epi32(row0, row1); - row3 = SIMD128::unpackhi_epi32(row3, row1); - - row1 = row0; - row0 = SIMD128::unpacklo_epi64(row0, row2); - row1 = SIMD128::unpackhi_epi64(row1, row2); - - row2 = row3; - row2 = SIMD128::unpacklo_epi64(row2, vTemp); - row3 = SIMD128::unpackhi_epi64(row3, vTemp); -} - -#if KNOB_SIMD_WIDTH == 8 -INLINE -void vTranspose3x8(simd4scalar (&vDst)[8], - const simdscalar& vSrc0, - const simdscalar& vSrc1, - const simdscalar& vSrc2) -{ - simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5 - simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); // y0w0y1w1 y4w4y5w5 - simdscalar 
r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); // x0y0z0w0 x4y4z4w4 - simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); // x1y1z1w1 x5y5z5w5 - - r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7 - r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); // y2w2y3w3 y6w6yw77 - simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); // x2y2z2w2 x6y6z6w6 - simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); // x3y3z3w3 x7y7z7w7 - - vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0); - vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0); - vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0); - vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0); - - vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1); - vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1); - vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1); - vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1); -} - -INLINE -void vTranspose4x8(simd4scalar (&vDst)[8], - const simdscalar& vSrc0, - const simdscalar& vSrc1, - const simdscalar& vSrc2, - const simdscalar& vSrc3) -{ - simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5 - simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); // y0w0y1w1 y4w4y5w5 - simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); // x0y0z0w0 x4y4z4w4 - simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); // x1y1z1w1 x5y5z5w5 - - r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7 - r1rx = _simd_unpackhi_ps(vSrc1, vSrc3); // y2w2y3w3 y6w6yw77 - simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); // x2y2z2w2 x6y6z6w6 - simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); // x3y3z3w3 x7y7z7w7 - - vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0); - vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0); - vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0); - vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0); - - vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1); - vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1); - vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1); - vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1); -} - 
-INLINE -void vTranspose4x16(simd16scalar (&dst)[4], - const simd16scalar& src0, - const simd16scalar& src1, - const simd16scalar& src2, - const simd16scalar& src3) -{ - const simd16scalari perm = - _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); - - // pre-permute input to setup the right order after all the unpacking - - simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r - simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g - simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b - simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a - - simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2); - simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3); - simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2); - simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3); - - dst[0] = _simd16_unpacklo_ps(rblo, galo); - dst[1] = _simd16_unpackhi_ps(rblo, galo); - dst[2] = _simd16_unpacklo_ps(rbhi, gahi); - dst[3] = _simd16_unpackhi_ps(rbhi, gahi); -} - -INLINE -void vTranspose8x8(simdscalar (&vDst)[8], - const simdscalar& vMask0, - const simdscalar& vMask1, - const simdscalar& vMask2, - const simdscalar& vMask3, - const simdscalar& vMask4, - const simdscalar& vMask5, - const simdscalar& vMask6, - const simdscalar& vMask7) -{ - simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1); - simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1); - simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3); - simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3); - simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5); - simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5); - simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7); - simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7); - simdscalar __tt0 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); - simdscalar __tt1 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); - simdscalar __tt2 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); - simdscalar __tt3 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 
2, 3, 2)); - simdscalar __tt4 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0)); - simdscalar __tt5 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2)); - simdscalar __tt6 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0)); - simdscalar __tt7 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2)); - vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20); - vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20); - vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20); - vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20); - vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31); - vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31); - vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31); - vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31); -} - -INLINE -void vTranspose8x8(simdscalar (&vDst)[8], - const simdscalari& vMask0, - const simdscalari& vMask1, - const simdscalari& vMask2, - const simdscalari& vMask3, - const simdscalari& vMask4, - const simdscalari& vMask5, - const simdscalari& vMask6, - const simdscalari& vMask7) -{ - vTranspose8x8(vDst, - _simd_castsi_ps(vMask0), - _simd_castsi_ps(vMask1), - _simd_castsi_ps(vMask2), - _simd_castsi_ps(vMask3), - _simd_castsi_ps(vMask4), - _simd_castsi_ps(vMask5), - _simd_castsi_ps(vMask6), - _simd_castsi_ps(vMask7)); -} -#endif - -////////////////////////////////////////////////////////////////////////// -/// TranposeSingleComponent -////////////////////////////////////////////////////////////////////////// -template <uint32_t bpp> -struct TransposeSingleComponent -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Pass-thru for single component. 
- /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { - memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); - } - - INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) - { - memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose8_8_8_8 -////////////////////////////////////////////////////////////////////////// -struct Transpose8_8_8_8 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { - simdscalari src = _simd_load_si((const simdscalari*)pSrc); - -#if KNOB_SIMD_WIDTH == 8 -#if KNOB_ARCH <= KNOB_ARCH_AVX - simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg - simd4scalari c2c3 = - SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa - simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb - simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa - simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg - simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3); // babababababababa - simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23); // rgbargbargbargba - simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23); // rgbargbargbargba - SIMD128::store_si((simd4scalari*)pDst, c0123lo); - SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi); -#else - simdscalari dst01 = _simd_shuffle_epi8(src, - _simd_set_epi32(0x0f078080, - 0x0e068080, - 0x0d058080, - 0x0c048080, - 0x80800b03, - 0x80800a02, - 0x80800901, - 0x80800800)); - simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); - dst23 = 
_simd_shuffle_epi8(dst23, - _simd_set_epi32(0x80800f07, - 0x80800e06, - 0x80800d05, - 0x80800c04, - 0x0b038080, - 0x0a028080, - 0x09018080, - 0x08008080)); - simdscalari dst = _simd_or_si(dst01, dst23); - _simd_store_si((simdscalari*)pDst, dst); -#endif -#else -#error Unsupported vector width -#endif - } - - INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD16_WIDTH == 16 - // clang-format off - - simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr - simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg - simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb - simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa - - simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0); - simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1); - simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2); - simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3); - - simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); - simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16); - simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24); - - simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3)); - - _simd16_store_si(reinterpret_cast<simd16scalari*>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba - - // clang-format on -#else -#error Unsupported vector width -#endif - } -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose8_8_8 -////////////////////////////////////////////////////////////////////////// -struct Transpose8_8_8 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. 
- /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose8_8 -////////////////////////////////////////////////////////////////////////// -struct Transpose8_8 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 8_8 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalari src = _simd_load_si((const simdscalari*)pSrc); - - simd4scalari rg = src.v4[0]; // rrrrrrrr gggggggg - simd4scalari g = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg - rg = SIMD128::unpacklo_epi8(rg, g); - SIMD128::store_si((simd4scalari*)pDst, rg); -#else -#error Unsupported vector width -#endif - } - - INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD16_WIDTH == 16 - // clang-format off - - simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr - simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg - - simdscalari cvt0 = _simd_cvtepu8_epi16(src0); - simdscalari cvt1 = _simd_cvtepu8_epi16(src1); - - simdscalari shl1 = _simd_slli_epi32(cvt1, 8); - - simdscalari dst = _simd_or_si(cvt0, shl1); - - _simd_store_si(reinterpret_cast<simdscalari*>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg - - // clang-format on -#else -#error Unsupported vector width -#endif - } -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose32_32_32_32 
-////////////////////////////////////////////////////////////////////////// -struct Transpose32_32_32_32 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalar src0 = _simd_load_ps((const float*)pSrc); - simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); - simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); - simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); - - simd4scalar vDst[8]; - vTranspose4x8(vDst, src0, src1, src2, src3); - SIMD128::store_ps((float*)pDst, vDst[0]); - SIMD128::store_ps((float*)pDst + 4, vDst[1]); - SIMD128::store_ps((float*)pDst + 8, vDst[2]); - SIMD128::store_ps((float*)pDst + 12, vDst[3]); - SIMD128::store_ps((float*)pDst + 16, vDst[4]); - SIMD128::store_ps((float*)pDst + 20, vDst[5]); - SIMD128::store_ps((float*)pDst + 24, vDst[6]); - SIMD128::store_ps((float*)pDst + 28, vDst[7]); -#else -#error Unsupported vector width -#endif - } - - INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD16_WIDTH == 16 - // clang-format off - - simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); - simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); - simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32); - simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 48); - - simd16scalar dst[4]; - - vTranspose4x16(dst, src0, src1, src2, src3); - - _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]); - _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]); - _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]); - _simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]); - - // clang-format 
on -#else -#error Unsupported vector width -#endif - } -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose32_32_32 -////////////////////////////////////////////////////////////////////////// -struct Transpose32_32_32 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalar src0 = _simd_load_ps((const float*)pSrc); - simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); - simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); - - simd4scalar vDst[8]; - vTranspose3x8(vDst, src0, src1, src2); - SIMD128::store_ps((float*)pDst, vDst[0]); - SIMD128::store_ps((float*)pDst + 4, vDst[1]); - SIMD128::store_ps((float*)pDst + 8, vDst[2]); - SIMD128::store_ps((float*)pDst + 12, vDst[3]); - SIMD128::store_ps((float*)pDst + 16, vDst[4]); - SIMD128::store_ps((float*)pDst + 20, vDst[5]); - SIMD128::store_ps((float*)pDst + 24, vDst[6]); - SIMD128::store_ps((float*)pDst + 28, vDst[7]); -#else -#error Unsupported vector width -#endif - } - - INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD16_WIDTH == 16 - // clang-format off - - simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); - simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); - simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32); - simd16scalar src3 = _simd16_setzero_ps(); - - simd16scalar dst[4]; - - vTranspose4x16(dst, src0, src1, src2, src3); - - _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]); - _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]); - _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]); - 
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]); - - // clang-format on -#else -#error Unsupported vector width -#endif - } -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose32_32 -////////////////////////////////////////////////////////////////////////// -struct Transpose32_32 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 32_32 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - const float* pfSrc = (const float*)pSrc; - simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0); - simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4); - simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8); - simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12); - - simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0); - simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0); - simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1); - simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1); - - float* pfDst = (float*)pDst; - SIMD128::store_ps(pfDst + 0, dst0); - SIMD128::store_ps(pfDst + 4, dst1); - SIMD128::store_ps(pfDst + 8, dst2); - SIMD128::store_ps(pfDst + 12, dst3); -#else -#error Unsupported vector width -#endif - } - - INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD16_WIDTH == 16 - // clang-format off - - simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); // rrrrrrrrrrrrrrrr - simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); // gggggggggggggggg - - simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD - simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF - - simd16scalar per0 = 
_simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 - simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF - - simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 - simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF - - _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg - _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg - - // clang-format on -#else -#error Unsupported vector width -#endif - } -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose16_16_16_16 -////////////////////////////////////////////////////////////////////////// -struct Transpose16_16_16_16 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. 
- /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); - simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); - - simd4scalari src_r = _simd_extractf128_si(src_rg, 0); - simd4scalari src_g = _simd_extractf128_si(src_rg, 1); - simd4scalari src_b = _simd_extractf128_si(src_ba, 0); - simd4scalari src_a = _simd_extractf128_si(src_ba, 1); - - simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g); - simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g); - simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a); - simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a); - - simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0); - simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0); - simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1); - simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1); - - SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0); - SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1); - SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2); - SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3); -#else -#error Unsupported vector width -#endif - } - - INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD16_WIDTH == 16 - // clang-format off - - simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg - simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb - simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa - - simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // 
rg4 rg5 rg6 rg7 rgC rgD rgE rgF - simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB - simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF - - simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 - simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB - simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD - simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF - - simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 - simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 - simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB - simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF - - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba - - // clang-format on -#else -#error Unsupported vector width -#endif - } -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose16_16_16 -////////////////////////////////////////////////////////////////////////// -struct Transpose16_16_16 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. 
- /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); - - simd4scalari src_r = _simd_extractf128_si(src_rg, 0); - simd4scalari src_g = _simd_extractf128_si(src_rg, 1); - simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari))); - simd4scalari src_a = SIMD128::setzero_si(); - - simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g); - simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g); - simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a); - simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a); - - simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0); - simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0); - simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1); - simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1); - - SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0); - SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1); - SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2); - SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3); -#else -#error Unsupported vector width -#endif - } - - INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD16_WIDTH == 16 - // clang-format off - - simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg - simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb - simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa - - simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 
ba3 ba8 ba9 baA baB - simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF - - simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 - simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB - simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD - simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF - - simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 - simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 - simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB - simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF - - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba - - // clang-format on -#else -#error Unsupported vector width -#endif - } -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose16_16 -////////////////////////////////////////////////////////////////////////// -struct Transpose16_16 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 16_16 data. 
- /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalar src = _simd_load_ps((const float*)pSrc); - - simd4scalar comp0 = _simd_extractf128_ps(src, 0); - simd4scalar comp1 = _simd_extractf128_ps(src, 1); - - simd4scalari comp0i = SIMD128::castps_si(comp0); - simd4scalari comp1i = SIMD128::castps_si(comp1); - - simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i); - simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i); - - SIMD128::store_si((simd4scalari*)pDst, resLo); - SIMD128::store_si((simd4scalari*)pDst + 1, resHi); -#else -#error Unsupported vector width -#endif - } - - INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD16_WIDTH == 16 - // clang-format off - - simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg - - simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - - simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 - simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF - - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg - - // clang-format on -#else -#error Unsupported vector width -#endif - } -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose24_8 -////////////////////////////////////////////////////////////////////////// -struct Transpose24_8 -{ - ////////////////////////////////////////////////////////////////////////// - 
/// @brief Performs an SOA to AOS conversion for packed 24_8 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose32_8_24 -////////////////////////////////////////////////////////////////////////// -struct Transpose32_8_24 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose4_4_4_4 -////////////////////////////////////////////////////////////////////////// -struct Transpose4_4_4_4 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose5_6_5 -////////////////////////////////////////////////////////////////////////// -struct Transpose5_6_5 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. 
- /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose9_9_9_5 -////////////////////////////////////////////////////////////////////////// -struct Transpose9_9_9_5 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose5_5_5_1 -////////////////////////////////////////////////////////////////////////// -struct Transpose5_5_5_1 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose1_5_5_5 -////////////////////////////////////////////////////////////////////////// -struct Transpose1_5_5_5 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. 
- /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose10_10_10_2 -////////////////////////////////////////////////////////////////////////// -struct Transpose10_10_10_2 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose11_11_10 -////////////////////////////////////////////////////////////////////////// -struct Transpose11_11_10 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 
- /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose64 -////////////////////////////////////////////////////////////////////////// -struct Transpose64 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose64_64 -////////////////////////////////////////////////////////////////////////// -struct Transpose64_64 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose64_64_64 -////////////////////////////////////////////////////////////////////////// -struct Transpose64_64_64 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - 
-////////////////////////////////////////////////////////////////////////// -/// Transpose64_64_64_64 -////////////////////////////////////////////////////////////////////////// -struct Transpose64_64_64_64 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; - static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp deleted file mode 100644 index 50ea12e0510..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ /dev/null @@ -1,2385 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file frontend.cpp - * - * @brief Implementation for Frontend which handles vertex processing, - * primitive assembly, clipping, binning, etc. - * - ******************************************************************************/ - -#include "api.h" -#include "frontend.h" -#include "backend.h" -#include "context.h" -#include "rdtsc_core.h" -#include "utils.h" -#include "threads.h" -#include "pa.h" -#include "clip.h" -#include "tilemgr.h" -#include "tessellator.h" -#include <limits> -#include <iostream> - -////////////////////////////////////////////////////////////////////////// -/// @brief FE handler for SwrSync. -/// @param pContext - pointer to SWR context. -/// @param pDC - pointer to draw context. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param pUserData - Pointer to user data passed back to sync callback. -/// @todo This should go away when we switch this to use compute threading. -void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData) -{ - BE_WORK work; - work.type = SYNC; - work.pfnWork = ProcessSyncBE; - - MacroTileMgr* pTileMgr = pDC->pTileMgr; - pTileMgr->enqueue(0, 0, &work); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief FE handler for SwrDestroyContext. -/// @param pContext - pointer to SWR context. -/// @param pDC - pointer to draw context. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param pUserData - Pointer to user data passed back to sync callback. 
-void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData) -{ - BE_WORK work; - work.type = SHUTDOWN; - work.pfnWork = ProcessShutdownBE; - - MacroTileMgr* pTileMgr = pDC->pTileMgr; - // Enqueue at least 1 work item for each worker thread - // account for number of numa nodes - uint32_t numNumaNodes = pContext->threadPool.numaMask + 1; - - for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i) - { - for (uint32_t n = 0; n < numNumaNodes; ++n) - { - pTileMgr->enqueue(i, n, &work); - } - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief FE handler for SwrClearRenderTarget. -/// @param pContext - pointer to SWR context. -/// @param pDC - pointer to draw context. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param pUserData - Pointer to user data passed back to clear callback. -/// @todo This should go away when we switch this to use compute threading. -void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData) -{ - CLEAR_DESC* pDesc = (CLEAR_DESC*)pUserData; - MacroTileMgr* pTileMgr = pDC->pTileMgr; - - // queue a clear to each macro tile - // compute macro tile bounds for the specified rect - uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM; - uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM; - uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM; - uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM; - - BE_WORK work; - work.type = CLEAR; - work.pfnWork = ProcessClearBE; - work.desc.clear = *pDesc; - - for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y) - { - for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x) - { - pTileMgr->enqueue(x, y, &work); - } - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief FE handler for SwrStoreTiles. -/// @param pContext - pointer to SWR context. 
-/// @param pDC - pointer to draw context. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param pUserData - Pointer to user data passed back to callback. -/// @todo This should go away when we switch this to use compute threading. -void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData) -{ - RDTSC_BEGIN(pContext->pBucketMgr, FEProcessStoreTiles, pDC->drawId); - MacroTileMgr* pTileMgr = pDC->pTileMgr; - STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData; - - // queue a store to each macro tile - // compute macro tile bounds for the specified rect - uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM; - uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM; - uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM; - uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM; - - // store tiles - BE_WORK work; - work.type = STORETILES; - work.pfnWork = ProcessStoreTilesBE; - work.desc.storeTiles = *pDesc; - - for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y) - { - for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x) - { - pTileMgr->enqueue(x, y, &work); - } - } - - RDTSC_END(pContext->pBucketMgr, FEProcessStoreTiles, 0); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief FE handler for SwrInvalidateTiles. -/// @param pContext - pointer to SWR context. -/// @param pDC - pointer to draw context. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param pUserData - Pointer to user data passed back to callback. -/// @todo This should go away when we switch this to use compute threading. 
-void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext, - DRAW_CONTEXT* pDC, - uint32_t workerId, - void* pUserData) -{ - RDTSC_BEGIN(pContext->pBucketMgr, FEProcessInvalidateTiles, pDC->drawId); - DISCARD_INVALIDATE_TILES_DESC* pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData; - MacroTileMgr* pTileMgr = pDC->pTileMgr; - - // compute macro tile bounds for the specified rect - uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM; - uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1; - uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM; - uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1; - - if (pDesc->fullTilesOnly == false) - { - // include partial tiles - macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM; - macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM; - macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM; - macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM; - } - - SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X); - SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y); - - macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X); - macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y); - - // load tiles - BE_WORK work; - work.type = DISCARDINVALIDATETILES; - work.pfnWork = ProcessDiscardInvalidateTilesBE; - work.desc.discardInvalidateTiles = *pDesc; - - for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x) - { - for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y) - { - pTileMgr->enqueue(x, y, &work); - } - } - - RDTSC_END(pContext->pBucketMgr, FEProcessInvalidateTiles, 0); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Computes the number of primitives given the number of verts. -/// @param mode - primitive topology for draw operation. -/// @param numPrims - number of vertices or indices for draw. 
-/// @todo Frontend needs to be refactored. This will go in appropriate place then. -uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims) -{ - switch (mode) - { - case TOP_POINT_LIST: - return numPrims; - case TOP_TRIANGLE_LIST: - return numPrims / 3; - case TOP_TRIANGLE_STRIP: - return numPrims < 3 ? 0 : numPrims - 2; - case TOP_TRIANGLE_FAN: - return numPrims < 3 ? 0 : numPrims - 2; - case TOP_TRIANGLE_DISC: - return numPrims < 2 ? 0 : numPrims - 1; - case TOP_QUAD_LIST: - return numPrims / 4; - case TOP_QUAD_STRIP: - return numPrims < 4 ? 0 : (numPrims - 2) / 2; - case TOP_LINE_STRIP: - return numPrims < 2 ? 0 : numPrims - 1; - case TOP_LINE_LIST: - return numPrims / 2; - case TOP_LINE_LOOP: - return numPrims; - case TOP_RECT_LIST: - return numPrims / 3; - case TOP_LINE_LIST_ADJ: - return numPrims / 4; - case TOP_LISTSTRIP_ADJ: - return numPrims < 3 ? 0 : numPrims - 3; - case TOP_TRI_LIST_ADJ: - return numPrims / 6; - case TOP_TRI_STRIP_ADJ: - return numPrims < 4 ? 0 : (numPrims / 2) - 2; - - case TOP_PATCHLIST_1: - case TOP_PATCHLIST_2: - case TOP_PATCHLIST_3: - case TOP_PATCHLIST_4: - case TOP_PATCHLIST_5: - case TOP_PATCHLIST_6: - case TOP_PATCHLIST_7: - case TOP_PATCHLIST_8: - case TOP_PATCHLIST_9: - case TOP_PATCHLIST_10: - case TOP_PATCHLIST_11: - case TOP_PATCHLIST_12: - case TOP_PATCHLIST_13: - case TOP_PATCHLIST_14: - case TOP_PATCHLIST_15: - case TOP_PATCHLIST_16: - case TOP_PATCHLIST_17: - case TOP_PATCHLIST_18: - case TOP_PATCHLIST_19: - case TOP_PATCHLIST_20: - case TOP_PATCHLIST_21: - case TOP_PATCHLIST_22: - case TOP_PATCHLIST_23: - case TOP_PATCHLIST_24: - case TOP_PATCHLIST_25: - case TOP_PATCHLIST_26: - case TOP_PATCHLIST_27: - case TOP_PATCHLIST_28: - case TOP_PATCHLIST_29: - case TOP_PATCHLIST_30: - case TOP_PATCHLIST_31: - case TOP_PATCHLIST_32: - return numPrims / (mode - TOP_PATCHLIST_BASE); - - case TOP_POLYGON: - case TOP_POINT_LIST_BF: - case TOP_LINE_STRIP_CONT: - case TOP_LINE_STRIP_BF: - case TOP_LINE_STRIP_CONT_BF: - 
case TOP_TRIANGLE_FAN_NOSTIPPLE: - case TOP_TRI_STRIP_REVERSE: - case TOP_PATCHLIST_BASE: - case TOP_UNKNOWN: - SWR_INVALID("Unsupported topology: %d", mode); - return 0; - } - - return 0; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Computes the number of verts given the number of primitives. -/// @param mode - primitive topology for draw operation. -/// @param numPrims - number of primitives for draw. -uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims) -{ - switch (mode) - { - case TOP_POINT_LIST: - return numPrims; - case TOP_TRIANGLE_LIST: - return numPrims * 3; - case TOP_TRIANGLE_STRIP: - return numPrims ? numPrims + 2 : 0; - case TOP_TRIANGLE_FAN: - return numPrims ? numPrims + 2 : 0; - case TOP_TRIANGLE_DISC: - return numPrims ? numPrims + 1 : 0; - case TOP_QUAD_LIST: - return numPrims * 4; - case TOP_QUAD_STRIP: - return numPrims ? numPrims * 2 + 2 : 0; - case TOP_LINE_STRIP: - return numPrims ? numPrims + 1 : 0; - case TOP_LINE_LIST: - return numPrims * 2; - case TOP_LINE_LOOP: - return numPrims; - case TOP_RECT_LIST: - return numPrims * 3; - case TOP_LINE_LIST_ADJ: - return numPrims * 4; - case TOP_LISTSTRIP_ADJ: - return numPrims ? numPrims + 3 : 0; - case TOP_TRI_LIST_ADJ: - return numPrims * 6; - case TOP_TRI_STRIP_ADJ: - return numPrims ? 
(numPrims + 2) * 2 : 0; - - case TOP_PATCHLIST_1: - case TOP_PATCHLIST_2: - case TOP_PATCHLIST_3: - case TOP_PATCHLIST_4: - case TOP_PATCHLIST_5: - case TOP_PATCHLIST_6: - case TOP_PATCHLIST_7: - case TOP_PATCHLIST_8: - case TOP_PATCHLIST_9: - case TOP_PATCHLIST_10: - case TOP_PATCHLIST_11: - case TOP_PATCHLIST_12: - case TOP_PATCHLIST_13: - case TOP_PATCHLIST_14: - case TOP_PATCHLIST_15: - case TOP_PATCHLIST_16: - case TOP_PATCHLIST_17: - case TOP_PATCHLIST_18: - case TOP_PATCHLIST_19: - case TOP_PATCHLIST_20: - case TOP_PATCHLIST_21: - case TOP_PATCHLIST_22: - case TOP_PATCHLIST_23: - case TOP_PATCHLIST_24: - case TOP_PATCHLIST_25: - case TOP_PATCHLIST_26: - case TOP_PATCHLIST_27: - case TOP_PATCHLIST_28: - case TOP_PATCHLIST_29: - case TOP_PATCHLIST_30: - case TOP_PATCHLIST_31: - case TOP_PATCHLIST_32: - return numPrims * (mode - TOP_PATCHLIST_BASE); - - case TOP_POLYGON: - case TOP_POINT_LIST_BF: - case TOP_LINE_STRIP_CONT: - case TOP_LINE_STRIP_BF: - case TOP_LINE_STRIP_CONT_BF: - case TOP_TRIANGLE_FAN_NOSTIPPLE: - case TOP_TRI_STRIP_REVERSE: - case TOP_PATCHLIST_BASE: - case TOP_UNKNOWN: - SWR_INVALID("Unsupported topology: %d", mode); - return 0; - } - - return 0; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Return number of verts per primitive. 
-/// @param topology - topology -/// @param includeAdjVerts - include adjacent verts in primitive vertices -uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts) -{ - uint32_t numVerts = 0; - switch (topology) - { - case TOP_POINT_LIST: - case TOP_POINT_LIST_BF: - numVerts = 1; - break; - case TOP_LINE_LIST: - case TOP_LINE_STRIP: - case TOP_LINE_LIST_ADJ: - case TOP_LINE_LOOP: - case TOP_LINE_STRIP_CONT: - case TOP_LINE_STRIP_BF: - case TOP_LISTSTRIP_ADJ: - numVerts = 2; - break; - case TOP_TRIANGLE_LIST: - case TOP_TRIANGLE_STRIP: - case TOP_TRIANGLE_FAN: - case TOP_TRI_LIST_ADJ: - case TOP_TRI_STRIP_ADJ: - case TOP_TRI_STRIP_REVERSE: - case TOP_RECT_LIST: - numVerts = 3; - break; - case TOP_QUAD_LIST: - case TOP_QUAD_STRIP: - numVerts = 4; - break; - case TOP_PATCHLIST_1: - case TOP_PATCHLIST_2: - case TOP_PATCHLIST_3: - case TOP_PATCHLIST_4: - case TOP_PATCHLIST_5: - case TOP_PATCHLIST_6: - case TOP_PATCHLIST_7: - case TOP_PATCHLIST_8: - case TOP_PATCHLIST_9: - case TOP_PATCHLIST_10: - case TOP_PATCHLIST_11: - case TOP_PATCHLIST_12: - case TOP_PATCHLIST_13: - case TOP_PATCHLIST_14: - case TOP_PATCHLIST_15: - case TOP_PATCHLIST_16: - case TOP_PATCHLIST_17: - case TOP_PATCHLIST_18: - case TOP_PATCHLIST_19: - case TOP_PATCHLIST_20: - case TOP_PATCHLIST_21: - case TOP_PATCHLIST_22: - case TOP_PATCHLIST_23: - case TOP_PATCHLIST_24: - case TOP_PATCHLIST_25: - case TOP_PATCHLIST_26: - case TOP_PATCHLIST_27: - case TOP_PATCHLIST_28: - case TOP_PATCHLIST_29: - case TOP_PATCHLIST_30: - case TOP_PATCHLIST_31: - case TOP_PATCHLIST_32: - numVerts = topology - TOP_PATCHLIST_BASE; - break; - default: - SWR_INVALID("Unsupported topology: %d", topology); - break; - } - - if (includeAdjVerts) - { - switch (topology) - { - case TOP_LISTSTRIP_ADJ: - case TOP_LINE_LIST_ADJ: - numVerts = 4; - break; - case TOP_TRI_STRIP_ADJ: - case TOP_TRI_LIST_ADJ: - numVerts = 6; - break; - default: - break; - } - } - - return numVerts; -} - 
-////////////////////////////////////////////////////////////////////////// -/// @brief Generate mask from remaining work. -/// @param numWorkItems - Number of items being worked on by a SIMD. -static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining) -{ - uint32_t numActive = - (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining; - uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0; - return _simd_castps_si(_simd_vmask_ps(mask)); -} - -static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining) -{ - uint32_t numActive = - (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining; - uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0; - return _simd16_castps_si(_simd16_vmask_ps(mask)); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief StreamOut - Streams vertex data out to SO buffers. -/// Generally, we are only streaming out a SIMDs worth of triangles. -/// @param pDC - pointer to draw context. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris) -static void StreamOut( - DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex) -{ - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId); - - void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - - const API_STATE& state = GetApiState(pDC); - const SWR_STREAMOUT_STATE& soState = state.soState; - - uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); - - // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each - // vertex. - uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t); - - SWR_STREAMOUT_CONTEXT soContext = {0}; - - // Setup buffer state pointers. 
- for (uint32_t i = 0; i < 4; ++i) - { - soContext.pBuffer[i] = &state.soBuffer[i]; - } - - uint32_t numPrims = pa.NumPrims(); - - for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex) - { - unsigned long slot = 0; - uint64_t soMask = soState.streamMasks[streamIndex]; - - // Write all entries into primitive data buffer for SOS. - while (_BitScanForward64(&slot, soMask)) - { - simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide) - uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex]; - pa.AssembleSingle(paSlot, primIndex, attrib); - - // Attribute offset is relative offset from start of vertex. - // Note that attributes start at slot 1 in the PA buffer. We need to write this - // to prim data starting at slot 0. Which is why we do (slot - 1). - // Also note: GL works slightly differently, and needs slot 0 - uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t); - - // Store each vertex's attrib at appropriate locations in pPrimData buffer. - for (uint32_t v = 0; v < soVertsPerPrim; ++v) - { - uint32_t* pPrimDataAttrib = - pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride); - - _mm_store_ps((float*)pPrimDataAttrib, attrib[v]); - } - - soMask &= ~(uint64_t(1) << slot); - } - - // Update pPrimData pointer - soContext.pPrimData = pPrimData; - - // Call SOS - SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, - "Trying to execute uninitialized streamout jit function."); - state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext); - } - - // Update SO write offset. The driver provides memory for the update. 
- for (uint32_t i = 0; i < 4; ++i) - { - if (state.soBuffer[i].pWriteOffset) - { - bool nullTileAccessed = false; - void* pWriteOffset = pDC->pContext->pfnTranslateGfxptrForWrite( - GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed, pWorkerData); - *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t); - } - - if (state.soBuffer[i].soWriteEnable) - { - pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t); - pDC->dynState.SoWriteOffsetDirty[i] = true; - } - } - - pDC->dynState.soPrims += soContext.numPrimsWritten; - - UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded); - UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten); - - RDTSC_END(pDC->pContext->pBucketMgr, FEStreamout, 1); -} - -#if USE_SIMD16_FRONTEND -////////////////////////////////////////////////////////////////////////// -/// Is value an even number (a multiple of two) -/// -template <typename T> -INLINE static bool IsEven(T value) -{ - return (value & 1) == 0; -} - -////////////////////////////////////////////////////////////////////////// -/// Round up value to an even number (a multiple of two) -/// -template <typename T> -INLINE static T RoundUpEven(T value) -{ - return (value + 1) & ~1; -} - -////////////////////////////////////////////////////////////////////////// -/// Round down value to an even number (a multiple of two) -/// -template <typename T> -INLINE static T RoundDownEven(T value) -{ - return value & ~1; -} - -////////////////////////////////////////////////////////////////////////// -/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping -/// -/// vertexCount is in terms of the source simdvertexes and must be even -/// -/// attribCount will limit the vector copies to those attribs specified -/// -/// note: the stride between vertexes is determinded by SWR_VTX_NUM_SLOTS -/// -void 
PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex* vertex_simd16, - const simdvertex* vertex, - uint32_t vertexCount, - uint32_t attribCount) -{ - SWR_ASSERT(vertex); - SWR_ASSERT(vertex_simd16); - SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS); - - simd16vertex temp; - - for (uint32_t i = 0; i < vertexCount; i += 2) - { - for (uint32_t j = 0; j < attribCount; j += 1) - { - for (uint32_t k = 0; k < 4; k += 1) - { - temp.attrib[j][k] = - _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0); - - if ((i + 1) < vertexCount) - { - temp.attrib[j][k] = - _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1); - } - } - } - - for (uint32_t j = 0; j < attribCount; j += 1) - { - vertex_simd16[i >> 1].attrib[j] = temp.attrib[j]; - } - } -} - -#endif -////////////////////////////////////////////////////////////////////////// -/// @brief Computes number of invocations. The current index represents -/// the start of the SIMD. The max index represents how much work -/// items are remaining. If there is less then a SIMD's xmin of work -/// then return the remaining amount of work. -/// @param curIndex - The start index for the SIMD. -/// @param maxIndex - The last index for all work items. -static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex) -{ - uint32_t remainder = (maxIndex - curIndex); -#if USE_SIMD16_FRONTEND - return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder; -#else - return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder; -#endif -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Converts a streamId buffer to a cut buffer for the given stream id. -/// The geometry shader will loop over each active streamout buffer, assembling -/// primitives for the downstream stages. When multistream output is enabled, -/// the generated stream ID buffer from the GS needs to be converted to a cut -/// buffer for the primitive assembler. 
-/// @param stream - stream id to generate the cut buffer for -/// @param pStreamIdBase - pointer to the stream ID buffer -/// @param numEmittedVerts - Number of total verts emitted by the GS -/// @param pCutBuffer - output buffer to write cuts to -void ProcessStreamIdBuffer(uint32_t stream, - uint8_t* pStreamIdBase, - uint32_t numEmittedVerts, - uint8_t* pCutBuffer) -{ - SWR_ASSERT(stream < MAX_SO_STREAMS); - - uint32_t numOutputBytes = AlignUp(numEmittedVerts, 8) / 8; - - for (uint32_t b = 0; b < numOutputBytes; ++b) - { - uint8_t curInputByte = pStreamIdBase[2 * b]; - uint8_t outByte = 0; - for (uint32_t i = 0; i < 4; ++i) - { - if ((curInputByte & 0x3) != stream) - { - outByte |= (1 << i); - } - curInputByte >>= 2; - } - - curInputByte = pStreamIdBase[2 * b + 1]; - for (uint32_t i = 0; i < 4; ++i) - { - if ((curInputByte & 0x3) != stream) - { - outByte |= (1 << (i + 4)); - } - curInputByte >>= 2; - } - - *pCutBuffer++ = outByte; - } -} - -// Buffers that are allocated if GS is enabled -struct GsBuffers -{ - uint8_t* pGsIn; - uint8_t* pGsOut[KNOB_SIMD_WIDTH]; - uint8_t* pGsTransposed; - void* pStreamCutBuffer; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler -/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive -/// assembler -/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader -/// @param numVerts - Number of vertices outputted by the GS -/// @param numAttribs - Number of attributes per vertex -template <typename SIMD_T, uint32_t SimdWidth> -void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs) -{ - uint32_t srcVertexStride = numAttribs * sizeof(float) * 4; - uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4; - - OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth]; - - for (uint32_t i = 0; i < SimdWidth; ++i) - { - 
gatherOffsets[i] = srcVertexStride * i; - } - auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]); - - uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth; - uint32_t remainingVerts = numVerts; - - for (uint32_t s = 0; s < numSimd; ++s) - { - uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth; - uint8_t* pDstBase = pDst + s * dstVertexStride; - - // Compute mask to prevent src overflow - uint32_t mask = std::min(remainingVerts, SimdWidth); - mask = GenMask(mask); - auto vMask = SIMD_T::vmask_ps(mask); - auto viMask = SIMD_T::castps_si(vMask); - - for (uint32_t a = 0; a < numAttribs; ++a) - { - auto attribGatherX = SIMD_T::mask_i32gather_ps( - SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask); - auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(), - (const float*)(pSrcBase + sizeof(float)), - vGatherOffsets, - vMask); - auto attribGatherZ = - SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(), - (const float*)(pSrcBase + sizeof(float) * 2), - vGatherOffsets, - vMask); - auto attribGatherW = - SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(), - (const float*)(pSrcBase + sizeof(float) * 3), - vGatherOffsets, - vMask); - - SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX); - SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY); - SIMD_T::maskstore_ps( - (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ); - SIMD_T::maskstore_ps( - (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW); - - pSrcBase += sizeof(float) * 4; - pDstBase += sizeof(Float<SIMD_T>) * 4; - } - remainingVerts -= SimdWidth; - } -} - - -////////////////////////////////////////////////////////////////////////// -/// @brief Implements GS stage. -/// @param pDC - pointer to draw context. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param pa - The primitive assembly object. 
-/// @param pGsOut - output stream for GS -template <typename HasStreamOutT, typename HasRastT> -static void GeometryShaderStage(DRAW_CONTEXT* pDC, - uint32_t workerId, - PA_STATE& pa, - GsBuffers* pGsBuffers, - uint32_t* pSoPrimData, -#if USE_SIMD16_FRONTEND - uint32_t numPrims_simd8, -#endif - simdscalari const& primID) -{ - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEGeometryShader, pDC->drawId); - - void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - - const API_STATE& state = GetApiState(pDC); - const SWR_GS_STATE* pState = &state.gsState; - SWR_GS_CONTEXT gsContext; - - static uint8_t sNullBuffer[128] = {0}; - - for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) - { - gsContext.pStreams[i] = pGsBuffers->pGsOut[i]; - } - gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn; - gsContext.PrimitiveID = primID; - - uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); - simdvector attrib[MAX_NUM_VERTS_PER_PRIM]; - - // assemble all attributes for the input primitive - gsContext.inputVertStride = pState->inputVertStride; - for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot) - { - uint32_t attribOffset = slot + pState->vertexAttribOffset; - pa.Assemble(attribOffset, attrib); - - for (uint32_t i = 0; i < numVertsPerPrim; ++i) - { - gsContext.pVerts[attribOffset + pState->inputVertStride * i] = attrib[i]; - } - } - - // record valid prims from the frontend to avoid over binning the newly generated - // prims from the GS -#if USE_SIMD16_FRONTEND - uint32_t numInputPrims = numPrims_simd8; -#else - uint32_t numInputPrims = pa.NumPrims(); -#endif - - for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) - { - gsContext.InstanceID = instance; - gsContext.mask = GenerateMask(numInputPrims); - - // execute the geometry shader - state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext); - AR_EVENT(GSStats((HANDLE)&gsContext.stats)); - - for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) - { - 
gsContext.pStreams[i] += pState->allocationSize; - } - } - - // set up new binner and state for the GS output topology -#if USE_SIMD16_FRONTEND - PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr; - if (HasRastT::value) - { - switch (pState->outputTopology) - { - case TOP_RECT_LIST: - pfnClipFunc = ClipRectangles_simd16; - break; - case TOP_TRIANGLE_STRIP: - pfnClipFunc = ClipTriangles_simd16; - break; - case TOP_LINE_STRIP: - pfnClipFunc = ClipLines_simd16; - break; - case TOP_POINT_LIST: - pfnClipFunc = ClipPoints_simd16; - break; - default: - SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology); - } - } - -#else - PFN_PROCESS_PRIMS pfnClipFunc = nullptr; - if (HasRastT::value) - { - switch (pState->outputTopology) - { - case TOP_RECT_LIST: - pfnClipFunc = ClipRectangles; - break; - case TOP_TRIANGLE_STRIP: - pfnClipFunc = ClipTriangles; - break; - case TOP_LINE_STRIP: - pfnClipFunc = ClipLines; - break; - case TOP_POINT_LIST: - pfnClipFunc = ClipPoints; - break; - default: - SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology); - } - } - -#endif - // foreach input prim: - // - setup a new PA based on the emitted verts for that prim - // - loop over the new verts, calling PA to assemble each prim - uint32_t* pPrimitiveId = (uint32_t*)&primID; - - uint32_t totalPrimsGenerated = 0; - for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim) - { - uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim]; - - // Vertex count is either emitted by shader or static - uint32_t vertexCount = 0; - if (pState->staticVertexCount) - { - vertexCount = pState->staticVertexCount; - } - else - { - // If emitted in shader, it should be the stored in the first dword of the output buffer - vertexCount = *(uint32_t*)pInstanceBase; - } - - for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) - { - uint32_t numEmittedVerts = vertexCount; - if (numEmittedVerts == 0) - { - continue; - } - - uint8_t* pBase = 
pInstanceBase + instance * pState->allocationSize; - uint8_t* pCutBase = - pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset; - uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset; - -#if USE_SIMD16_FRONTEND - TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, - pVertexBaseAOS, - vertexCount, - pState->outputVertexSize); -#else - TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, - pVertexBaseAOS, - vertexCount, - pState->outputVertexSize); -#endif - - uint32_t numAttribs = state.feNumAttributes; - - for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) - { - bool processCutVerts = false; - uint8_t* pCutBuffer = pCutBase; - - // assign default stream ID, only relevant when GS is outputting a single stream - uint32_t streamID = 0; - if (pState->isSingleStream) - { - processCutVerts = true; - streamID = pState->singleStreamID; - if (streamID != stream) - continue; - } - else - { - // early exit if this stream is not enabled for streamout - if (HasStreamOutT::value && !state.soState.streamEnable[stream]) - { - continue; - } - - // multi-stream output, need to translate StreamID buffer to a cut buffer - ProcessStreamIdBuffer( - stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer); - pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer; - processCutVerts = false; - } - -#if USE_SIMD16_FRONTEND - PA_STATE_CUT gsPa(pDC, - (uint8_t*)pGsBuffers->pGsTransposed, - numEmittedVerts, - pState->outputVertexSize, - reinterpret_cast<simd16mask*>(pCutBuffer), - numEmittedVerts, - numAttribs, - pState->outputTopology, - processCutVerts, - pa.numVertsPerPrim); - -#else - PA_STATE_CUT gsPa(pDC, - (uint8_t*)pGsBuffers->pGsTransposed, - numEmittedVerts, - pState->outputVertexSize, - pCutBuffer, - numEmittedVerts, - numAttribs, - pState->outputTopology, - processCutVerts, - pa.numVertsPerPrim); - -#endif - while (gsPa.GetNextStreamOutput()) - { - do - { -#if 
USE_SIMD16_FRONTEND - simd16vector attrib_simd16[3]; - - bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16); - -#else - bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib); - -#endif - if (assemble) - { - totalPrimsGenerated += gsPa.NumPrims(); - - if (HasStreamOutT::value) - { -#if ENABLE_AVX512_SIMD16 - gsPa.useAlternateOffset = false; -#endif - StreamOut(pDC, gsPa, workerId, pSoPrimData, stream); - } - - if (HasRastT::value && state.soState.streamToRasterizer == stream) - { -#if USE_SIMD16_FRONTEND - simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]); - - // Gather data from the SVG if provided. - simd16scalari vViewportIdx = SIMD16::setzero_si(); - simd16scalari vRtIdx = SIMD16::setzero_si(); - SIMD16::Vec4 svgAttrib[4]; - - if (state.backendState.readViewportArrayIndex || - state.backendState.readRenderTargetArrayIndex) - { - gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib); - } - - if (state.backendState.readViewportArrayIndex) - { - vViewportIdx = - SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); - gsPa.viewportArrayActive = true; - } - if (state.backendState.readRenderTargetArrayIndex) - { - vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); - gsPa.rtArrayActive = true; - } - - { - // OOB VPAI indices => forced to zero. - vViewportIdx = - SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si()); - simd16scalari vNumViewports = - SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simd16scalari vClearMask = - SIMD16::cmplt_epi32(vViewportIdx, vNumViewports); - vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx); - - gsPa.useAlternateOffset = false; - pfnClipFunc(pDC, - gsPa, - workerId, - attrib_simd16, - GenMask(gsPa.NumPrims()), - vPrimId, - vViewportIdx, - vRtIdx); - } -#else - simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]); - - // Gather data from the SVG if provided. 
- simdscalari vViewportIdx = SIMD::setzero_si(); - simdscalari vRtIdx = SIMD::setzero_si(); - SIMD::Vec4 svgAttrib[4]; - - if (state.backendState.readViewportArrayIndex || - state.backendState.readRenderTargetArrayIndex) - { - gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib); - } - - if (state.backendState.readViewportArrayIndex) - { - vViewportIdx = - SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); - - // OOB VPAI indices => forced to zero. - vViewportIdx = - SIMD::max_epi32(vViewportIdx, SIMD::setzero_si()); - simdscalari vNumViewports = - SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simdscalari vClearMask = - SIMD::cmplt_epi32(vViewportIdx, vNumViewports); - vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx); - gsPa.viewportArrayActive = true; - } - if (state.backendState.readRenderTargetArrayIndex) - { - vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); - gsPa.rtArrayActive = true; - } - - pfnClipFunc(pDC, - gsPa, - workerId, - attrib, - GenMask(gsPa.NumPrims()), - vPrimId, - vViewportIdx, - vRtIdx); -#endif - } - } - } while (gsPa.NextPrim()); - } - } - } - } - - // update GS pipeline stats - UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount); - UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated); - AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims)); - RDTSC_END(pDC->pContext->pBucketMgr, FEGeometryShader, 1); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Allocate GS buffers -/// @param pDC - pointer to draw context. 
-/// @param state - API state -/// @param ppGsOut - pointer to GS output buffer allocation -/// @param ppCutBuffer - pointer to GS output cut buffer allocation -template <typename SIMD_T, uint32_t SIMD_WIDTH> -static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, - const API_STATE& state, - uint32_t vertsPerPrim, - GsBuffers* pGsBuffers) -{ - auto pArena = pDC->pArena; - SWR_ASSERT(pArena != nullptr); - SWR_ASSERT(state.gsState.gsEnable); - - const SWR_GS_STATE& gsState = state.gsState; - - // Allocate storage for vertex inputs - uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim; - pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32); - - // Allocate arena space to hold GS output verts - const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize; - - for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) - { - pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32); - } - - // Allocate storage for transposed GS output - uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH; - uint32_t transposedBufferSize = - numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>); - pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32); - - // Allocate storage to hold temporary stream->cut buffer, if necessary - if (state.gsState.isSingleStream) - { - pGsBuffers->pStreamCutBuffer = nullptr; - } - else - { - pGsBuffers->pStreamCutBuffer = - (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32); - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Contains all data generated by the HS and passed to the -/// tessellator and DS. 
-struct TessellationThreadLocalData -{ - SWR_HS_CONTEXT hsContext; - void* pTxCtx; - size_t tsCtxSize; - - uint8_t* pHSOutput; - size_t hsOutputAllocSize; - - simdscalar* pDSOutput; - size_t dsOutputAllocSize; -}; - -THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr; - -////////////////////////////////////////////////////////////////////////// -/// @brief Allocate tessellation data for this worker thread. -INLINE -static void AllocateTessellationData(SWR_CONTEXT* pContext) -{ - /// @TODO - Don't use thread local storage. Use Worker local storage instead. - if (gt_pTessellationThreadData == nullptr) - { - gt_pTessellationThreadData = - (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64); - memset((void*)gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData)); - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Implements Tessellation Stages. -/// @param pDC - pointer to draw context. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param pa - The primitive assembly object. 
-/// @param pGsOut - output stream for GS -template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT> -static void TessellationStages(DRAW_CONTEXT* pDC, - uint32_t workerId, - PA_STATE& pa, - GsBuffers* pGsBuffers, - uint32_t* pSoPrimData, -#if USE_SIMD16_FRONTEND - uint32_t numPrims_simd8, -#endif - simdscalari const& primID) -{ - const API_STATE& state = GetApiState(pDC); - const SWR_TS_STATE& tsState = state.tsState; - void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - - SWR_ASSERT(gt_pTessellationThreadData); - - HANDLE tsCtx = TSInitCtx(tsState.domain, - tsState.partitioning, - tsState.tsOutputTopology, - gt_pTessellationThreadData->pTxCtx, - gt_pTessellationThreadData->tsCtxSize); - if (tsCtx == nullptr) - { - gt_pTessellationThreadData->pTxCtx = - AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64); - tsCtx = TSInitCtx(tsState.domain, - tsState.partitioning, - tsState.tsOutputTopology, - gt_pTessellationThreadData->pTxCtx, - gt_pTessellationThreadData->tsCtxSize); - } - SWR_ASSERT(tsCtx); - -#if USE_SIMD16_FRONTEND - PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr; - if (HasRastT::value) - { - switch (tsState.postDSTopology) - { - case TOP_TRIANGLE_LIST: - pfnClipFunc = ClipTriangles_simd16; - break; - case TOP_LINE_LIST: - pfnClipFunc = ClipLines_simd16; - break; - case TOP_POINT_LIST: - pfnClipFunc = ClipPoints_simd16; - break; - default: - SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology); - } - } - -#else - PFN_PROCESS_PRIMS pfnClipFunc = nullptr; - if (HasRastT::value) - { - switch (tsState.postDSTopology) - { - case TOP_TRIANGLE_LIST: - pfnClipFunc = ClipTriangles; - break; - case TOP_LINE_LIST: - pfnClipFunc = ClipLines; - break; - case TOP_POINT_LIST: - pfnClipFunc = ClipPoints; - break; - default: - SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology); - } - } - -#endif - SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext; - 
hsContext.PrimitiveID = primID; - hsContext.outputSize = tsState.hsAllocationSize; - - uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); - // Max storage for one attribute for an entire simdprimitive - simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM]; - - // Assemble position separately - // TESS_TODO: this could be avoided - fix it - pa.Assemble(VERTEX_POSITION_SLOT, simdattrib); - for (uint32_t i = 0; i < numVertsPerPrim; ++i) { - hsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = simdattrib[i]; - } - - // assemble all attributes for the input primitives - for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot) - { - uint32_t attribSlot = tsState.srcVertexAttribOffset + slot; - pa.Assemble(attribSlot, simdattrib); - - for (uint32_t i = 0; i < numVertsPerPrim; ++i) - { - hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i]; - } - } - - // Allocate HS output storage - uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize; - - if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize) - { - AlignedFree(gt_pTessellationThreadData->pHSOutput); - gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64); - gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize; - } - - hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput; - -#if defined(_DEBUG) - //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); -#endif - memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); - -#if USE_SIMD16_FRONTEND - uint32_t numPrims = numPrims_simd8; -#else - uint32_t numPrims = pa.NumPrims(); -#endif - hsContext.mask = GenerateMask(numPrims); - - // Run the HS - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEHullShader, pDC->drawId); - state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext); - RDTSC_END(pDC->pContext->pBucketMgr, FEHullShader, 0); - - UPDATE_STAT_FE(HsInvocations, numPrims); - 
AR_EVENT(HSStats((HANDLE)&hsContext.stats)); - - const uint32_t* pPrimId = (const uint32_t*)&primID; - - for (uint32_t p = 0; p < numPrims; ++p) - { - ScalarPatch* pCPout = (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p); - - SWR_TESSELLATION_FACTORS tessFactors; - tessFactors = hsContext.pCPout[p].tessFactors; - - // Run Tessellator - SWR_TS_TESSELLATED_DATA tsData = {0}; - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId); - TSTessellate(tsCtx, tessFactors, tsData); - AR_EVENT(TessPrimCount(1)); - RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0); - - if (tsData.NumPrimitives == 0) - { - continue; - } - SWR_ASSERT(tsData.NumDomainPoints); - - // Allocate DS Output memory - uint32_t requiredDSVectorInvocations = - AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; -#if USE_SIMD16_FRONTEND - size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * - tsState.dsAllocationSize; // simd8 -> simd16, padding -#else - size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize; - size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors; -#endif - if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize) - { - AlignedFree(gt_pTessellationThreadData->pDSOutput); - gt_pTessellationThreadData->pDSOutput = - (simdscalar*)AlignedMalloc(requiredAllocSize, 64); - gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize; - } - SWR_ASSERT(gt_pTessellationThreadData->pDSOutput); - SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize); - -#if defined(_DEBUG) - memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize); -#endif - - // Run Domain Shader - SWR_DS_CONTEXT dsContext; - dsContext.PrimitiveID = pPrimId[p]; - dsContext.pCpIn = pCPout; - dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU; - dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV; - 
dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput; - dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset; -#if USE_SIMD16_FRONTEND - dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16 -#else - dsContext.vectorStride = requiredDSVectorInvocations; -#endif - - uint32_t dsInvocations = 0; - - for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; - ++dsContext.vectorOffset) - { - dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations); - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEDomainShader, pDC->drawId); - state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext); - RDTSC_END(pDC->pContext->pBucketMgr, FEDomainShader, 0); - - AR_EVENT(DSStats((HANDLE)&dsContext.stats)); - - dsInvocations += KNOB_SIMD_WIDTH; - } - UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints); - -#if USE_SIMD16_FRONTEND - SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16 - -#endif - PA_TESS tessPa( - pDC, -#if USE_SIMD16_FRONTEND - reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16 - dsContext.vectorStride / 2, // simd8 -> simd16 -#else - dsContext.pOutputData, - dsContext.vectorStride, -#endif - SWR_VTX_NUM_SLOTS, - tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset, - tsData.ppIndices, - tsData.NumPrimitives, - tsState.postDSTopology, - NumVertsPerPrim(tsState.postDSTopology, false)); - - while (tessPa.HasWork()) - { -#if USE_SIMD16_FRONTEND - const uint32_t numPrims = tessPa.NumPrims(); - const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH); - const uint32_t numPrims_hi = - std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH; - - const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID); - const simdscalari primID_lo = _simd16_extract_si(primID, 0); - const simdscalari primID_hi = _simd16_extract_si(primID, 1); - -#endif - if (HasGeometryShaderT::value) - { -#if USE_SIMD16_FRONTEND - 
tessPa.useAlternateOffset = false; - GeometryShaderStage<HasStreamOutT, HasRastT>( - pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo); - - if (numPrims_hi) - { - tessPa.useAlternateOffset = true; - GeometryShaderStage<HasStreamOutT, HasRastT>( - pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi); - } -#else - GeometryShaderStage<HasStreamOutT, HasRastT>( - pDC, - workerId, - tessPa, - pGsBuffers, - pSoPrimData, - _simd_set1_epi32(dsContext.PrimitiveID)); -#endif - } - else - { - if (HasStreamOutT::value) - { -#if ENABLE_AVX512_SIMD16 - tessPa.useAlternateOffset = false; -#endif - StreamOut(pDC, tessPa, workerId, pSoPrimData, 0); - } - - if (HasRastT::value) - { -#if USE_SIMD16_FRONTEND - simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points -#else - simdvector prim[3]; // Only deal with triangles, lines, or points -#endif - RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEPAAssemble, pDC->drawId); - bool assemble = -#if USE_SIMD16_FRONTEND - tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16); -#else - tessPa.Assemble(VERTEX_POSITION_SLOT, prim); -#endif - RDTSC_END(pDC->pContext->pBucketMgr, FEPAAssemble, 1); - SWR_ASSERT(assemble); - - SWR_ASSERT(pfnClipFunc); -#if USE_SIMD16_FRONTEND - // Gather data from the SVG if provided. - simd16scalari vViewportIdx = SIMD16::setzero_si(); - simd16scalari vRtIdx = SIMD16::setzero_si(); - SIMD16::Vec4 svgAttrib[4] = {SIMD16::setzero_ps()}; - - if (state.backendState.readViewportArrayIndex || - state.backendState.readRenderTargetArrayIndex) - { - tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib); - } - - if (state.backendState.readViewportArrayIndex) - { - vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); - tessPa.viewportArrayActive = true; - } - if (state.backendState.readRenderTargetArrayIndex) - { - vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); - tessPa.rtArrayActive = true; - } - - - { - // OOB VPAI indices => forced to zero. 
- vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si()); - simd16scalari vNumViewports = - SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports); - vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx); - - tessPa.useAlternateOffset = false; - pfnClipFunc(pDC, - tessPa, - workerId, - prim_simd16, - GenMask(numPrims), - primID, - vViewportIdx, - vRtIdx); - } -#else - // Gather data from the SGV if provided. - simdscalari vViewportIdx = SIMD::setzero_si(); - simdscalari vRtIdx = SIMD::setzero_si(); - SIMD::Vec4 svgAttrib[4]; - - if (state.backendState.readViewportArrayIndex || - state.backendState.readRenderTargetArrayIndex) - { - tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib); - } - - if (state.backendState.readViewportArrayIndex) - { - vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); - - // OOB VPAI indices => forced to zero. - vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si()); - simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports); - vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx); - tessPa.viewportArrayActive = true; - } - if (state.backendState.readRenderTargetArrayIndex) - { - vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); - tessPa.rtArrayActive = true; - } - pfnClipFunc(pDC, - tessPa, - workerId, - prim, - GenMask(tessPa.NumPrims()), - _simd_set1_epi32(dsContext.PrimitiveID), - vViewportIdx, - vRtIdx); -#endif - } - } - - tessPa.NextPrim(); - - } // while (tessPa.HasWork()) - } // for (uint32_t p = 0; p < numPrims; ++p) - -#if USE_SIMD16_FRONTEND - if (gt_pTessellationThreadData->pDSOutput != nullptr) - { - AlignedFree(gt_pTessellationThreadData->pDSOutput); - gt_pTessellationThreadData->pDSOutput = nullptr; - } - gt_pTessellationThreadData->dsOutputAllocSize = 0; - -#endif - TSDestroyCtx(tsCtx); -} - -THREAD 
PA_STATE::SIMDVERTEX* gpVertexStore = nullptr; -THREAD uint32_t gVertexStoreSize = 0; - -////////////////////////////////////////////////////////////////////////// -/// @brief FE handler for SwrDraw. -/// @tparam IsIndexedT - Is indexed drawing enabled -/// @tparam HasTessellationT - Is tessellation enabled -/// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled -/// @tparam HasStreamOutT - Is stream-out enabled -/// @tparam HasRastT - Is rasterization enabled -/// @param pContext - pointer to SWR context. -/// @param pDC - pointer to draw context. -/// @param workerId - thread's worker id. -/// @param pUserData - Pointer to DRAW_WORK -template <typename IsIndexedT, - typename IsCutIndexEnabledT, - typename HasTessellationT, - typename HasGeometryShaderT, - typename HasStreamOutT, - typename HasRastT> -void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData) -{ -#if KNOB_ENABLE_TOSS_POINTS - if (KNOB_TOSS_QUEUE_FE) - { - return; - } -#endif - - RDTSC_BEGIN(pContext->pBucketMgr, FEProcessDraw, pDC->drawId); - - void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - - DRAW_WORK& work = *(DRAW_WORK*)pUserData; - const API_STATE& state = GetApiState(pDC); - - uint32_t indexSize = 0; - uint32_t endVertex = work.numVerts; - - gfxptr_t xpLastRequestedIndex = 0; - if (IsIndexedT::value) - { - switch (work.type) - { - case R32_UINT: - indexSize = sizeof(uint32_t); - break; - case R16_UINT: - indexSize = sizeof(uint16_t); - break; - case R8_UINT: - indexSize = sizeof(uint8_t); - break; - default: - SWR_INVALID("Invalid work.type: %d", work.type); - } - xpLastRequestedIndex = work.xpIB + endVertex * indexSize; - } - else - { - // No cuts, prune partial primitives. 
- endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts)); - } - -#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR) - uint32_t numPrims = GetNumPrims(state.topology, work.numVerts); -#endif - - GsBuffers gsBuffers; - if (HasGeometryShaderT::value) - { -#if USE_SIMD16_FRONTEND - AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>( - pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers); -#else - AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>( - pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers); -#endif - } - - if (HasTessellationT::value) - { - SWR_ASSERT(state.tsState.tsEnable == true); - SWR_ASSERT(state.pfnHsFunc != nullptr); - SWR_ASSERT(state.pfnDsFunc != nullptr); - - AllocateTessellationData(pContext); - } - else - { - SWR_ASSERT(state.tsState.tsEnable == false); - SWR_ASSERT(state.pfnHsFunc == nullptr); - SWR_ASSERT(state.pfnDsFunc == nullptr); - } - - // allocate space for streamout input prim data - uint32_t* pSoPrimData = nullptr; - if (HasStreamOutT::value) - { - pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16); - } - - const uint32_t vertexCount = NumVertsPerPrim(state.topology, true); -#if USE_SIMD16_FRONTEND - uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector); -#else - uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector); -#endif - - SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM); - - // Compute storage requirements for vertex store - // TODO: allocation needs to be rethought for better cut support - uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine - uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes; - - // grow the vertex store for the PA as necessary - if (gVertexStoreSize < vertexStoreSize) - { - if (gpVertexStore != nullptr) - { - AlignedFree(gpVertexStore); - gpVertexStore = nullptr; - } - - SWR_ASSERT(gpVertexStore == nullptr); - - gpVertexStore = 
reinterpret_cast<PA_STATE::SIMDVERTEX*>(AlignedMalloc(vertexStoreSize, 64)); - gVertexStoreSize = vertexStoreSize; - - SWR_ASSERT(gpVertexStore != nullptr); - } - - // choose primitive assembler - - PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, - state.topology, - work.numVerts, - gpVertexStore, - numVerts, - state.frontendState.vsVertexSize, - GetNumVerts(state.topology, 1)); - PA_STATE& pa = paFactory.GetPA(); - -#if USE_SIMD16_FRONTEND -#if USE_SIMD16_SHADERS - simd16vertex vin; -#else - simdvertex vin_lo; - simdvertex vin_hi; -#endif - SWR_VS_CONTEXT vsContext_lo; - SWR_VS_CONTEXT vsContext_hi; - -#if USE_SIMD16_SHADERS - vsContext_lo.pVin = reinterpret_cast<simdvertex*>(&vin); - vsContext_hi.pVin = reinterpret_cast<simdvertex*>(&vin); -#else - vsContext_lo.pVin = &vin_lo; - vsContext_hi.pVin = &vin_hi; -#endif - vsContext_lo.AlternateOffset = 0; - vsContext_hi.AlternateOffset = 1; - - SWR_FETCH_CONTEXT fetchInfo_lo = {0}; - - fetchInfo_lo.pStreams = &state.vertexBuffers[0]; - fetchInfo_lo.StartInstance = work.startInstance; - fetchInfo_lo.StartVertex = 0; - - if (IsIndexedT::value) - { - fetchInfo_lo.BaseVertex = work.baseVertex; - - // if the entire index buffer isn't being consumed, set the last index - // so that fetches < a SIMD wide will be masked off - fetchInfo_lo.xpLastIndex = state.indexBuffer.xpIndices + state.indexBuffer.size; - if (xpLastRequestedIndex < fetchInfo_lo.xpLastIndex) - { - fetchInfo_lo.xpLastIndex = xpLastRequestedIndex; - } - } - else - { - fetchInfo_lo.StartVertex = work.startVertex; - } - - SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo; - - const simd16scalari vScale = - _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - - for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) - { - uint32_t i = 0; - - simd16scalari vIndex; - - if (IsIndexedT::value) - { - fetchInfo_lo.xpIndices = work.xpIB; - fetchInfo_hi.xpIndices = - fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; 
// 1/2 of KNOB_SIMD16_WIDTH - } - else - { - vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale); - - fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex); - - int32_t* sysAddr = reinterpret_cast<int32_t*>(&vIndex); - sysAddr += KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH - - fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), sysAddr); - } - - fetchInfo_lo.CurInstance = instanceNum; - fetchInfo_hi.CurInstance = instanceNum; - - vsContext_lo.InstanceID = instanceNum; - vsContext_hi.InstanceID = instanceNum; - - while (pa.HasWork()) - { - // GetNextVsOutput currently has the side effect of updating some PA state machine - // state. So we need to keep this outside of (i < endVertex) check. - - simdmask* pvCutIndices_lo = nullptr; - simdmask* pvCutIndices_hi = nullptr; - - if (IsIndexedT::value) - { - // simd16mask <=> simdmask[2] - - pvCutIndices_lo = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[0]; - pvCutIndices_hi = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[1]; - } - - simd16vertex& vout = pa.GetNextVsOutput(); - - vsContext_lo.pVout = reinterpret_cast<simdvertex*>(&vout); - vsContext_hi.pVout = reinterpret_cast<simdvertex*>(&vout); - - if (i < endVertex) - { - if (!IsIndexedT::value) - { - fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices; - uint32_t offset; - offset = std::min(endVertex - i, (uint32_t)KNOB_SIMD16_WIDTH); - offset *= 4; // convert from index to address -#if USE_SIMD16_SHADERS - fetchInfo_lo.xpLastIndex += offset; -#else - fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t)KNOB_SIMD_WIDTH); - uint32_t offset2 = - std::min(offset, (uint32_t)KNOB_SIMD16_WIDTH) - KNOB_SIMD_WIDTH; - assert(offset >= 0); - fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices; - fetchInfo_hi.xpLastIndex += offset2; -#endif - } - // 1. Execute FS/VS for a single SIMD. 
- RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId); -#if USE_SIMD16_SHADERS - state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin); -#else - state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo); - - if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH - { - state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi); - } -#endif - RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0); - - // forward fetch generated vertex IDs to the vertex shader -#if USE_SIMD16_SHADERS -#if USE_SIMD16_VS - vsContext_lo.VertexID16 = - _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0); - vsContext_lo.VertexID16 = - _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1); -#else - vsContext_lo.VertexID = fetchInfo_lo.VertexID; - vsContext_hi.VertexID = fetchInfo_lo.VertexID2; -#endif -#else - vsContext_lo.VertexID = fetchInfo_lo.VertexID; - vsContext_hi.VertexID = fetchInfo_hi.VertexID; -#endif - - // Setup active mask for vertex shader. 
-#if USE_SIMD16_VS - vsContext_lo.mask16 = GenerateMask16(endVertex - i); -#else - vsContext_lo.mask = GenerateMask(endVertex - i); - vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH)); -#endif - - // forward cut mask to the PA - if (IsIndexedT::value) - { -#if USE_SIMD16_SHADERS - *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask)); - *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2)); -#else - *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask)); - *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask)); -#endif - } - - UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex)); - -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_FETCH) -#endif - { - RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId); -#if USE_SIMD16_VS - state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo); - AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats)); -#else - state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo); - AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats)); - - if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH - { - state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi); - AR_EVENT(VSStats((HANDLE)&vsContext_hi.stats)); - } -#endif - RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0); - - UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex)); - } - } - - // 2. Assemble primitives given the last two SIMD. 
- do - { - simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM]; - - RDTSC_START(pContext->pBucketMgr, FEPAAssemble); - bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16); - RDTSC_STOP(pContext->pBucketMgr, FEPAAssemble, 1, 0); - -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_FETCH) -#endif - { -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_VS) -#endif - { - if (assemble) - { - UPDATE_STAT_FE(IaPrimitives, pa.NumPrims()); - - const uint32_t numPrims = pa.NumPrims(); - const uint32_t numPrims_lo = - std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH); - const uint32_t numPrims_hi = - std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH; - - const simd16scalari primID = pa.GetPrimID(work.startPrimID); - const simdscalari primID_lo = _simd16_extract_si(primID, 0); - const simdscalari primID_hi = _simd16_extract_si(primID, 1); - - if (HasTessellationT::value) - { - pa.useAlternateOffset = false; - TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>( - pDC, - workerId, - pa, - &gsBuffers, - pSoPrimData, - numPrims_lo, - primID_lo); - - if (numPrims_hi) - { - pa.useAlternateOffset = true; - TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>( - pDC, - workerId, - pa, - &gsBuffers, - pSoPrimData, - numPrims_hi, - primID_hi); - } - } - else if (HasGeometryShaderT::value) - { - pa.useAlternateOffset = false; - GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, - workerId, - pa, - &gsBuffers, - pSoPrimData, - numPrims_lo, - primID_lo); - - if (numPrims_hi) - { - pa.useAlternateOffset = true; - GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, - workerId, - pa, - &gsBuffers, - pSoPrimData, - numPrims_hi, - primID_hi); - } - } - else - { - // If streamout is enabled then stream vertices out to memory. - if (HasStreamOutT::value) - { - pa.useAlternateOffset = false; - StreamOut(pDC, pa, workerId, pSoPrimData, 0); - } - - if (HasRastT::value) - { - SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16); - // Gather data from the SVG if provided. 
- simd16scalari vpai = SIMD16::setzero_si(); - simd16scalari rtai = SIMD16::setzero_si(); - SIMD16::Vec4 svgAttrib[4]; - - if (state.backendState.readViewportArrayIndex || - state.backendState.readRenderTargetArrayIndex) - { - pa.Assemble(VERTEX_SGV_SLOT, svgAttrib); - } - - if (state.backendState.readViewportArrayIndex) - { - vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); - pa.viewportArrayActive = true; - } - if (state.backendState.readRenderTargetArrayIndex) - { - rtai = - SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); - pa.rtArrayActive = true; - } - - { - // OOB VPAI indices => forced to zero. - vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si()); - simd16scalari vNumViewports = - SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simd16scalari vClearMask = - SIMD16::cmplt_epi32(vpai, vNumViewports); - vpai = SIMD16::and_si(vClearMask, vpai); - - pa.useAlternateOffset = false; - pDC->pState->pfnProcessPrims_simd16(pDC, - pa, - workerId, - prim_simd16, - GenMask(numPrims), - primID, - vpai, - rtai); - } - } - } - } - } - } - } while (pa.NextPrim()); - - if (IsIndexedT::value) - { - fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize; - fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize; - } - else - { - vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH)); - } - - i += KNOB_SIMD16_WIDTH; - } - - pa.Reset(); - } - -#else - SWR_VS_CONTEXT vsContext; - SWR_FETCH_CONTEXT fetchInfo = {0}; - - fetchInfo.pStreams = &state.vertexBuffers[0]; - fetchInfo.StartInstance = work.startInstance; - fetchInfo.StartVertex = 0; - - if (IsIndexedT::value) - { - fetchInfo.BaseVertex = work.baseVertex; - - // if the entire index buffer isn't being consumed, set the last index - // so that fetches < a SIMD wide will be masked off - fetchInfo.pLastIndex = - (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size); - if (xpLastRequestedIndex < 
fetchInfo.pLastIndex) - { - fetchInfo.pLastIndex = xpLastRequestedIndex; - } - } - else - { - fetchInfo.StartVertex = work.startVertex; - } - - const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - - /// @todo: temporarily move instance loop in the FE to ensure SO ordering - for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) - { - simdscalari vIndex; - uint32_t i = 0; - - if (IsIndexedT::value) - { - fetchInfo.pIndices = work.pIB; - } - else - { - vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale); - fetchInfo.pIndices = (const int32_t*)&vIndex; - } - - fetchInfo.CurInstance = instanceNum; - vsContext.InstanceID = instanceNum; - - while (pa.HasWork()) - { - // GetNextVsOutput currently has the side effect of updating some PA state machine - // state. So we need to keep this outside of (i < endVertex) check. - simdmask* pvCutIndices = nullptr; - if (IsIndexedT::value) - { - pvCutIndices = &pa.GetNextVsIndices(); - } - - simdvertex& vout = pa.GetNextVsOutput(); - vsContext.pVin = &vout; - vsContext.pVout = &vout; - - if (i < endVertex) - { - // 1. Execute FS/VS for a single SIMD. - RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId); - state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout); - RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0); - - // forward fetch generated vertex IDs to the vertex shader - vsContext.VertexID = fetchInfo.VertexID; - - // Setup active mask for vertex shader. 
- vsContext.mask = GenerateMask(endVertex - i); - - // forward cut mask to the PA - if (IsIndexedT::value) - { - *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask)); - } - - UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex)); - -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_FETCH) -#endif - { - RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId); - state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext); - RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0); - - UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex)); - AR_EVENT(VSStats((HANDLE)&vsContext.stats)); - } - } - - // 2. Assemble primitives given the last two SIMD. - do - { - simdvector prim[MAX_NUM_VERTS_PER_PRIM]; - // PaAssemble returns false if there is not enough verts to assemble. - RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId); - bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim); - RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1); - -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_FETCH) -#endif - { -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_VS) -#endif - { - if (assemble) - { - UPDATE_STAT_FE(IaPrimitives, pa.NumPrims()); - - if (HasTessellationT::value) - { - TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>( - pDC, - workerId, - pa, - &gsBuffers, - pSoPrimData, - pa.GetPrimID(work.startPrimID)); - } - else if (HasGeometryShaderT::value) - { - GeometryShaderStage<HasStreamOutT, HasRastT>( - pDC, - workerId, - pa, - &gsBuffers, - pSoPrimData, - pa.GetPrimID(work.startPrimID)); - } - else - { - // If streamout is enabled then stream vertices out to memory. - if (HasStreamOutT::value) - { - StreamOut(pDC, pa, workerId, pSoPrimData, 0); - } - - if (HasRastT::value) - { - SWR_ASSERT(pDC->pState->pfnProcessPrims); - - // Gather data from the SVG if provided. 
- simdscalari vViewportIdx = SIMD::setzero_si(); - simdscalari vRtIdx = SIMD::setzero_si(); - SIMD::Vec4 svgAttrib[4]; - - if (state.backendState.readViewportArrayIndex || - state.backendState.readRenderTargetArrayIndex) - { - pa.Assemble(VERTEX_SGV_SLOT, svgAttrib); - } - - if (state.backendState.readViewportArrayIndex) - { - vViewportIdx = - SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); - - // OOB VPAI indices => forced to zero. - vViewportIdx = - SIMD::max_epi32(vViewportIdx, SIMD::setzero_si()); - simdscalari vNumViewports = - SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simdscalari vClearMask = - SIMD::cmplt_epi32(vViewportIdx, vNumViewports); - vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx); - pa.viewportArrayActive = true; - } - if (state.backendState.readRenderTargetArrayIndex) - { - vRtIdx = - SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); - pa.rtArrayActive = true; - } - - pDC->pState->pfnProcessPrims(pDC, - pa, - workerId, - prim, - GenMask(pa.NumPrims()), - pa.GetPrimID(work.startPrimID), - vViewportIdx, - vRtIdx); - } - } - } - } - } - } while (pa.NextPrim()); - - if (IsIndexedT::value) - { - fetchInfo.pIndices = - (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); - } - else - { - vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH)); - } - - i += KNOB_SIMD_WIDTH; - } - pa.Reset(); - } - -#endif - - RDTSC_END(pContext->pBucketMgr, FEProcessDraw, numPrims * work.numInstances); -} - -struct FEDrawChooser -{ - typedef PFN_FE_WORK_FUNC FuncType; - - template <typename... 
ArgsB> - static FuncType GetFunc() - { - return ProcessDraw<ArgsB...>; - } -}; - -// Selector for correct templated Draw front-end function -PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed, - bool IsCutIndexEnabled, - bool HasTessellation, - bool HasGeometryShader, - bool HasStreamOut, - bool HasRasterization) -{ - return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, - IsCutIndexEnabled, - HasTessellation, - HasGeometryShader, - HasStreamOut, - HasRasterization); -} diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h deleted file mode 100644 index a6d9fb5ba52..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ /dev/null @@ -1,448 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file frontend.h - * - * @brief Definitions for Frontend which handles vertex processing, - * primitive assembly, clipping, binning, etc. - * - ******************************************************************************/ -#pragma once -#include "context.h" -#include "common/simdintrin.h" -#include <type_traits> - -////////////////////////////////////////////////////////////////////////// -/// @brief Helper macro to generate a bitmask -static INLINE uint32_t - GenMask(uint32_t numBits) -{ - SWR_ASSERT( - numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__); - return ((1U << numBits) - 1); -} - -// Calculates the A and B coefficients for the 3 edges of the triangle -// -// maths for edge equations: -// standard form of a line in 2d -// Ax + By + C = 0 -// A = y0 - y1 -// B = x1 - x0 -// C = x0y1 - x1y0 -INLINE -void triangleSetupAB(const __m128 vX, const __m128 vY, __m128& vA, __m128& vB) -{ - // vYsub = y1 y2 y0 dc - __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1)); - // vY = y0 y1 y2 dc - vA = _mm_sub_ps(vY, vYsub); - - // Result: - // A[0] = y0 - y1 - // A[1] = y1 - y2 - // A[2] = y2 - y0 - - // vXsub = x1 x2 x0 dc - __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1)); - // vX = x0 x1 x2 dc - vB = _mm_sub_ps(vXsub, vX); - - // Result: - // B[0] = x1 - x0 - // B[1] = x2 - x1 - // B[2] = x0 - x2 -} - -INLINE -void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i& vA, __m128i& vB) -{ - // generate edge equations - // A = y0 - y1 - // B = x1 - x0 - // C = x0y1 - x1y0 - __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1)); - vA = _mm_sub_epi32(vY, vYsub); - - __m128i vXsub = _mm_shuffle_epi32(vX, 
_MM_SHUFFLE(3, 0, 2, 1)); - vB = _mm_sub_epi32(vXsub, vX); -} - -INLINE -void triangleSetupABIntVertical(const simdscalari vX[3], - const simdscalari vY[3], - simdscalari (&vA)[3], - simdscalari (&vB)[3]) -{ - // A = y0 - y1 - // B = x1 - x0 - vA[0] = _simd_sub_epi32(vY[0], vY[1]); - vA[1] = _simd_sub_epi32(vY[1], vY[2]); - vA[2] = _simd_sub_epi32(vY[2], vY[0]); - - vB[0] = _simd_sub_epi32(vX[1], vX[0]); - vB[1] = _simd_sub_epi32(vX[2], vX[1]); - vB[2] = _simd_sub_epi32(vX[0], vX[2]); -} - -#if ENABLE_AVX512_SIMD16 -INLINE -void triangleSetupABIntVertical(const simd16scalari vX[3], - const simd16scalari vY[3], - simd16scalari (&vA)[3], - simd16scalari (&vB)[3]) -{ - // A = y0 - y1 - // B = x1 - x0 - vA[0] = _simd16_sub_epi32(vY[0], vY[1]); - vA[1] = _simd16_sub_epi32(vY[1], vY[2]); - vA[2] = _simd16_sub_epi32(vY[2], vY[0]); - - vB[0] = _simd16_sub_epi32(vX[1], vX[0]); - vB[1] = _simd16_sub_epi32(vX[2], vX[1]); - vB[2] = _simd16_sub_epi32(vX[0], vX[2]); -} - -#endif -// Calculate the determinant of the triangle -// 2 vectors between the 3 points: P, Q -// Px = x0-x2, Py = y0-y2 -// Qx = x1-x2, Qy = y1-y2 -// |Px Qx| -// det = | | = PxQy - PyQx -// |Py Qy| -// simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2) -// try to reuse our A & B coef's already calculated. 
factor out a -1 from Py and Qx -// : B[2]*A[1] - (-(y2-y0))*(-(x2-x1)) -// : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1) -// : B[2]*A[1] - A[2]*B[1] -INLINE -float calcDeterminantInt(const __m128i vA, const __m128i vB) -{ - // vAShuf = [A1, A0, A2, A0] - __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1)); - // vBShuf = [B2, B0, B1, B0] - __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2)); - // vMul = [A1*B2, B1*A2] - __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf); - - // shuffle upper to lower - // vMul2 = [B1*A2, B1*A2] - __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2)); - // vMul = [A1*B2 - B1*A2] - vMul = _mm_sub_epi64(vMul, vMul2); - - int64_t result; - _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul)); - - double dResult = (double)result; - dResult = dResult * (1.0 / FIXED_POINT16_SCALE); - - return (float)dResult; -} - -INLINE -void calcDeterminantIntVertical(const simdscalari vA[3], - const simdscalari vB[3], - simdscalari* pvDet) -{ - // refer to calcDeterminantInt comment for calculation explanation - - // A1*B2 - simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5 - simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7 - - simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]); - simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]); - - simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5 - simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7 - - // B1*A2 - simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]); - simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]); - - simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]); - simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]); - - simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo); - simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi); - - // A1*B2 - A2*B1 - simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo); - simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi); - - // shuffle 0 1 4 5 2 
3 6 7 -> 0 1 2 3 - simdscalari vResultLo = _simd_permute2f128_si(detLo, detHi, 0x20); - - // shuffle 0 1 4 5 2 3 6 7 -> 4 5 6 7 - simdscalari vResultHi = _simd_permute2f128_si(detLo, detHi, 0x31); - - pvDet[0] = vResultLo; - pvDet[1] = vResultHi; -} - -#if ENABLE_AVX512_SIMD16 -INLINE -void calcDeterminantIntVertical(const simd16scalari vA[3], - const simd16scalari vB[3], - simd16scalari* pvDet) -{ - // refer to calcDeterminantInt comment for calculation explanation - - // A1*B2 - simd16scalari vA1_lo = - _simd16_unpacklo_epi32(vA[1], vA[1]); // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b) - simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // X 2 X 3 X 6 X 7 X A X B X E X F - - simd16scalari vB2_lo = _simd16_unpacklo_epi32(vB[2], vB[2]); - simd16scalari vB2_hi = _simd16_unpackhi_epi32(vB[2], vB[2]); - - simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo); // 0 1 4 5 8 9 C D (64b) - simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi); // 2 3 6 7 A B E F - - // B1*A2 - simd16scalari vA2_lo = _simd16_unpacklo_epi32(vA[2], vA[2]); - simd16scalari vA2_hi = _simd16_unpackhi_epi32(vA[2], vA[2]); - - simd16scalari vB1_lo = _simd16_unpacklo_epi32(vB[1], vB[1]); - simd16scalari vB1_hi = _simd16_unpackhi_epi32(vB[1], vB[1]); - - simd16scalari vA2B1_lo = _simd16_mul_epi32(vA2_lo, vB1_lo); - simd16scalari vA2B1_hi = _simd16_mul_epi32(vA2_hi, vB1_hi); - - // A1*B2 - A2*B1 - simd16scalari difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo); // 0 1 4 5 8 9 C D (64b) - simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi); // 2 3 6 7 A B E F - - // (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE - simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44); // 0 1 4 5 2 3 6 7 (64b) - simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE); // 8 9 C D A B E F - - // (3, 1, 2, 0) = 11 01 10 00 = 0xD8 - pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8); // 0 1 2 3 4 5 6 7 (64b) - pvDet[1] = 
_simd16_permute2f128_si(temphi, temphi, 0xD8); // 8 9 A B C D E F -} - -#endif -INLINE -void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128& vB, __m128& vC) -{ - // C = -Ax - By - vC = _mm_mul_ps(vA, vX); - __m128 vCy = _mm_mul_ps(vB, vY); - vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f)); - vC = _mm_sub_ps(vC, vCy); -} - -template <uint32_t NumVerts> -INLINE void viewportTransform(simdvector* v, const SWR_VIEWPORT_MATRICES& vpMatrices) -{ - simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]); - simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]); - simdscalar m11 = _simd_load1_ps(&vpMatrices.m11[0]); - simdscalar m31 = _simd_load1_ps(&vpMatrices.m31[0]); - simdscalar m22 = _simd_load1_ps(&vpMatrices.m22[0]); - simdscalar m32 = _simd_load1_ps(&vpMatrices.m32[0]); - - for (uint32_t i = 0; i < NumVerts; ++i) - { - v[i].x = _simd_fmadd_ps(v[i].x, m00, m30); - v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); - v[i].z = _simd_fmadd_ps(v[i].z, m22, m32); - } -} - -#if USE_SIMD16_FRONTEND -template <uint32_t NumVerts> -INLINE void viewportTransform(simd16vector* v, const SWR_VIEWPORT_MATRICES& vpMatrices) -{ - const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]); - const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]); - const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]); - const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]); - const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]); - const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]); - - for (uint32_t i = 0; i < NumVerts; ++i) - { - v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30); - v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31); - v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32); - } -} - -#endif -template <uint32_t NumVerts> -INLINE void viewportTransform(simdvector* v, - const SWR_VIEWPORT_MATRICES& vpMatrices, - simdscalari const& vViewportIdx) -{ - // perform a gather of each matrix element based on the viewport array indexes - 
simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4); - simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4); - simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4); - simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4); - simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4); - simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4); - - for (uint32_t i = 0; i < NumVerts; ++i) - { - v[i].x = _simd_fmadd_ps(v[i].x, m00, m30); - v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); - v[i].z = _simd_fmadd_ps(v[i].z, m22, m32); - } -} - -#if USE_SIMD16_FRONTEND -template <uint32_t NumVerts> -INLINE void viewportTransform(simd16vector* v, - const SWR_VIEWPORT_MATRICES& vpMatrices, - simd16scalari const& vViewportIdx) -{ - // perform a gather of each matrix element based on the viewport array indexes - const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4); - const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4); - const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4); - const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4); - const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4); - const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4); - - for (uint32_t i = 0; i < NumVerts; ++i) - { - v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30); - v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31); - v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32); - } -} - -#endif -INLINE -void calcBoundingBoxInt(const __m128i& vX, const __m128i& vY, SWR_RECT& bbox) -{ - // Need horizontal fp min here - __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1)); - __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2)); - - __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1)); - __m128i vY2 = _mm_shuffle_epi32(vY, 
_MM_SHUFFLE(3, 0, 1, 2)); - - __m128i vMinX = _mm_min_epi32(vX, vX1); - vMinX = _mm_min_epi32(vMinX, vX2); - - __m128i vMaxX = _mm_max_epi32(vX, vX1); - vMaxX = _mm_max_epi32(vMaxX, vX2); - - __m128i vMinY = _mm_min_epi32(vY, vY1); - vMinY = _mm_min_epi32(vMinY, vY2); - - __m128i vMaxY = _mm_max_epi32(vY, vY1); - vMaxY = _mm_max_epi32(vMaxY, vY2); - - bbox.xmin = _mm_extract_epi32(vMinX, 0); - bbox.xmax = _mm_extract_epi32(vMaxX, 0); - bbox.ymin = _mm_extract_epi32(vMinY, 0); - bbox.ymax = _mm_extract_epi32(vMaxY, 0); -} - -INLINE -bool CanUseSimplePoints(DRAW_CONTEXT* pDC) -{ - const API_STATE& state = GetApiState(pDC); - - return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X && - state.rastState.pointSize == 1.0f && !state.rastState.pointParam && - !state.rastState.pointSpriteEnable && !state.backendState.clipDistanceMask); -} - -INLINE -bool vHasNaN(const __m128& vec) -{ - const __m128 result = _mm_cmpunord_ps(vec, vec); - const int32_t mask = _mm_movemask_ps(result); - return (mask != 0); -} - -uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements); -uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts); - -// ProcessDraw front-end function. 
All combinations of parameter values are available -PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed, - bool IsCutIndexEnabled, - bool HasTessellation, - bool HasGeometryShader, - bool HasStreamOut, - bool HasRasterization); - -void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData); -void ProcessStoreTiles(SWR_CONTEXT* pContext, - DRAW_CONTEXT* pDC, - uint32_t workerId, - void* pUserData); -void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext, - DRAW_CONTEXT* pDC, - uint32_t workerId, - void* pUserData); -void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData); -void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData); - -PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative); -#if USE_SIMD16_FRONTEND -PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative); -#endif - -struct PA_STATE_BASE; // forward decl -void BinPoints(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prims[3], - uint32_t primMask, - simdscalari const& primID, - simdscalari const& viewportIdx, - simdscalari const& rtIdx); -void BinLines(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prims[3], - uint32_t primMask, - simdscalari const& primID, - simdscalari const& viewportIdx, - simdscalari const& rtIdx); -#if USE_SIMD16_FRONTEND -void SIMDCALL BinPoints_simd16(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector prims[3], - uint32_t primMask, - simd16scalari const& primID, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx); -void SIMDCALL BinLines_simd16(DRAW_CONTEXT* pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector prims[3], - uint32_t primMask, - simd16scalari const& primID, - simd16scalari const& viewportIdx, - simd16scalari const& rtIdx); -#endif - diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h deleted 
file mode 100644 index 798e5684025..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ /dev/null @@ -1,175 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file knobs.h - * - * @brief Static (Compile-Time) Knobs for Core. 
- * - ******************************************************************************/ -#pragma once - -#include <stdint.h> -#include <gen_knobs.h> - -#define KNOB_ARCH_AVX 0 -#define KNOB_ARCH_AVX2 1 -#define KNOB_ARCH_AVX512 2 - -/////////////////////////////////////////////////////////////////////////////// -// AVX512 Support -/////////////////////////////////////////////////////////////////////////////// - -#define ENABLE_AVX512_SIMD16 1 -#define USE_SIMD16_FRONTEND 1 -#define USE_SIMD16_SHADERS 1 // requires USE_SIMD16_FRONTEND -#define USE_SIMD16_VS 1 // requires USE_SIMD16_SHADERS - -/////////////////////////////////////////////////////////////////////////////// -// Architecture validation -/////////////////////////////////////////////////////////////////////////////// -#if !defined(KNOB_ARCH) -#define KNOB_ARCH KNOB_ARCH_AVX -#endif - -#if (KNOB_ARCH == KNOB_ARCH_AVX) -#define KNOB_ARCH_ISA AVX -#define KNOB_ARCH_STR "AVX" -#elif (KNOB_ARCH == KNOB_ARCH_AVX2) -#define KNOB_ARCH_ISA AVX2 -#define KNOB_ARCH_STR "AVX2" -#elif (KNOB_ARCH == KNOB_ARCH_AVX512) -#define KNOB_ARCH_ISA AVX512F -#define KNOB_ARCH_STR "AVX512" -#else -#error "Unknown architecture" -#endif - -#define KNOB_SIMD_WIDTH 8 -#define KNOB_SIMD_BYTES 32 - -#define KNOB_SIMD16_WIDTH 16 -#define KNOB_SIMD16_BYTES 64 - -#define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING") - -/////////////////////////////////////////////////////////////////////////////// -// Configuration knobs -/////////////////////////////////////////////////////////////////////////////// -// Maximum supported number of active vertex buffer streams -#define KNOB_NUM_STREAMS 32 - -// Maximum supported active viewports and scissors -#define KNOB_NUM_VIEWPORTS_SCISSORS 16 - -// Guardband range used by the clipper -#define KNOB_GUARDBAND_WIDTH 32768.0f -#define KNOB_GUARDBAND_HEIGHT 32768.0f - -// Scratch space requirements per worker. 
Currently only used for TGSM sizing for some stages -#define KNOB_WORKER_SCRATCH_SPACE_SIZE (32 * 1024) - -/////////////////////////////// -// Macro tile configuration -/////////////////////////////// - -// raster tile dimensions -#define KNOB_TILE_X_DIM 8 -#define KNOB_TILE_X_DIM_SHIFT 3 -#define KNOB_TILE_Y_DIM 8 -#define KNOB_TILE_Y_DIM_SHIFT 3 - -// fixed macrotile pixel dimension for now, eventually will be -// dynamically set based on tile format and pixel size -#define KNOB_MACROTILE_X_DIM 32 -#define KNOB_MACROTILE_Y_DIM 32 -#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 13 -#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 13 -#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8) -#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8) -#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT) -#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT) - -// total # of hot tiles available. This should be enough to -// fully render a 16kx16k 128bpp render target -#define KNOB_NUM_HOT_TILES_X 512 -#define KNOB_NUM_HOT_TILES_Y 512 -#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT -#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT -#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT - -// Max scissor rectangle -#define KNOB_MAX_SCISSOR_X KNOB_NUM_HOT_TILES_X* KNOB_MACROTILE_X_DIM -#define KNOB_MAX_SCISSOR_Y KNOB_NUM_HOT_TILES_Y* KNOB_MACROTILE_Y_DIM - -#if KNOB_SIMD_WIDTH == 8 && KNOB_TILE_X_DIM < 4 -#error "incompatible width/tile dimensions" -#endif - -#if ENABLE_AVX512_SIMD16 -#if KNOB_SIMD16_WIDTH == 16 && KNOB_TILE_X_DIM < 8 -#error "incompatible width/tile dimensions" -#endif -#endif - -#if KNOB_SIMD_WIDTH == 8 -#define SIMD_TILE_X_DIM 4 -#define SIMD_TILE_Y_DIM 2 -#else -#error "Invalid simd width" -#endif - -#if ENABLE_AVX512_SIMD16 -#if KNOB_SIMD16_WIDTH == 16 -#define SIMD16_TILE_X_DIM 8 -#define SIMD16_TILE_Y_DIM 2 -#else -#error "Invalid simd width" -#endif -#endif - 
-/////////////////////////////////////////////////////////////////////////////// -// Optimization knobs -/////////////////////////////////////////////////////////////////////////////// -#define KNOB_USE_FAST_SRGB TRUE - -// enables cut-aware primitive assembler -#define KNOB_ENABLE_CUT_AWARE_PA TRUE - -// enables early rasterization (useful for small triangles) -#if !defined(KNOB_ENABLE_EARLY_RAST) -#define KNOB_ENABLE_EARLY_RAST 1 -#endif - -#if KNOB_ENABLE_EARLY_RAST -#define ER_SIMD_TILE_X_SHIFT 2 -#define ER_SIMD_TILE_Y_SHIFT 2 -#endif - -/////////////////////////////////////////////////////////////////////////////// -// Debug knobs -/////////////////////////////////////////////////////////////////////////////// -//#define KNOB_ENABLE_RDTSC - -// Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs. -#if !defined(KNOB_ENABLE_TOSS_POINTS) -#define KNOB_ENABLE_TOSS_POINTS 0 -#endif diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h deleted file mode 100644 index f8797a8f2bc..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h +++ /dev/null @@ -1,108 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file knobs_init.h - * - * @brief Dynamic Knobs Initialization for Core. - * - ******************************************************************************/ -#pragma once - -#include <core/knobs.h> -#include <stdlib.h> -#include <string.h> -#include <ctype.h> -#include <stdio.h> - -// Assume the type is compatible with a 32-bit integer -template <typename T> -static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue) -{ - uint32_t value = 0; - char* pStopped = nullptr; - value = strtoul(pOverride, &pStopped, 0); - if (pStopped != pOverride) - { - knobValue = static_cast<T>(value); - } -} - -static inline void ConvertEnvToKnob(const char* pOverride, bool& knobValue) -{ - size_t len = strlen(pOverride); - if (len == 1) - { - auto c = tolower(pOverride[0]); - if (c == 'y' || c == 't' || c == '1') - { - knobValue = true; - return; - } - if (c == 'n' || c == 'f' || c == '0') - { - knobValue = false; - return; - } - } - - // Try converting to a number and casting to bool - uint32_t value = 0; - char* pStopped = nullptr; - value = strtoul(pOverride, &pStopped, 0); - if (pStopped != pOverride) - { - knobValue = value != 0; - } -} - -static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue) -{ - float value = knobValue; - if (sscanf(pOverride, "%f", &value)) - { - knobValue = value; - } -} - -static inline void ConvertEnvToKnob(const char* pOverride, std::string& knobValue) -{ - knobValue = pOverride; -} - -template <typename T> -static inline void 
InitKnob(T& knob) -{ - // Read environment variables - const char* pOverride = getenv(knob.Name()); - - if (pOverride) - { - auto knobValue = knob.DefaultValue(); - ConvertEnvToKnob(pOverride, knobValue); - knob.Value(knobValue); - } - else - { - // Set default value - knob.Value(knob.DefaultValue()); - } -} diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h deleted file mode 100644 index 3b23974a7f4..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/multisample.h +++ /dev/null @@ -1,459 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file multisample.h - * - ******************************************************************************/ - -#pragma once - -#include "context.h" -#include "format_traits.h" - -////////////////////////////////////////////////////////////////////////// -/// @brief convenience typedef for testing for single sample case -typedef std::integral_constant<int, 1> SingleSampleT; - -INLINE -SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples) -{ - switch (numSamples) - { - case 1: - return SWR_MULTISAMPLE_1X; - case 2: - return SWR_MULTISAMPLE_2X; - case 4: - return SWR_MULTISAMPLE_4X; - case 8: - return SWR_MULTISAMPLE_8X; - case 16: - return SWR_MULTISAMPLE_16X; - default: - assert(0); - return SWR_MULTISAMPLE_1X; - } -} - -// hardcoded offsets based on Direct3d standard multisample positions -// 8 x 8 pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner -// coords are 0.8 fixed point offsets from (0, 0) -template <SWR_MULTISAMPLE_COUNT sampleCount, bool isCenter = false> -struct MultisampleTraits -{ - INLINE static float X(uint32_t sampleNum) = delete; - INLINE static float Y(uint32_t sampleNum) = delete; - INLINE static simdscalari FullSampleMask() = delete; - - static const uint32_t numSamples = 0; -}; - -template <> -struct MultisampleTraits<SWR_MULTISAMPLE_1X, false> -{ - INLINE static float X(uint32_t sampleNum) { return samplePosX[sampleNum]; }; - INLINE static float Y(uint32_t sampleNum) { return samplePosY[sampleNum]; }; - INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); }; - - static const uint32_t numSamples = 1; - static const uint32_t numCoverageSamples = 1; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X; - static constexpr uint32_t samplePosXi[1] = {0x80}; - static constexpr uint32_t samplePosYi[1] = {0x80}; - static constexpr float samplePosX[1] = {0.5f}; - static constexpr float samplePosY[1] = {0.5f}; -}; - -template <> -struct MultisampleTraits<SWR_MULTISAMPLE_1X, true> -{ 
- INLINE static float X(uint32_t sampleNum) { return 0.5f; }; - INLINE static float Y(uint32_t sampleNum) { return 0.5f; }; - INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); }; - - static const uint32_t numSamples = 1; - static const uint32_t numCoverageSamples = 1; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X; - static constexpr uint32_t samplePosXi[1] = {0x80}; - static constexpr uint32_t samplePosYi[1] = {0x80}; - static constexpr float samplePosX[1] = {0.5f}; - static constexpr float samplePosY[1] = {0.5f}; -}; - -template <> -struct MultisampleTraits<SWR_MULTISAMPLE_2X, false> -{ - INLINE static float X(uint32_t sampleNum) - { - SWR_ASSERT(sampleNum < numSamples); - return samplePosX[sampleNum]; - }; - INLINE static float Y(uint32_t sampleNum) - { - SWR_ASSERT(sampleNum < numSamples); - return samplePosY[sampleNum]; - }; - INLINE static simdscalari FullSampleMask() - { - static const simdscalari mask = _simd_set1_epi32(0x3); - return mask; - } - - static const uint32_t numSamples = 2; - static const uint32_t numCoverageSamples = 2; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X; - static constexpr uint32_t samplePosXi[2] = {0xC0, 0x40}; - static constexpr uint32_t samplePosYi[2] = {0xC0, 0x40}; - static constexpr float samplePosX[2] = {0.75f, 0.25f}; - static constexpr float samplePosY[2] = {0.75f, 0.25f}; -}; - -template <> -struct MultisampleTraits<SWR_MULTISAMPLE_2X, true> -{ - INLINE static float X(uint32_t sampleNum) { return 0.5f; }; - INLINE static float Y(uint32_t sampleNum) { return 0.5f; }; - INLINE static simdscalari FullSampleMask() - { - static const simdscalari mask = _simd_set1_epi32(0x3); - return mask; - } - static const uint32_t numSamples = 2; - static const uint32_t numCoverageSamples = 1; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X; - static constexpr uint32_t samplePosXi[2] = {0x80, 0x80}; - static constexpr uint32_t samplePosYi[2] = 
{0x80, 0x80}; - static constexpr float samplePosX[2] = {0.5f, 0.5f}; - static constexpr float samplePosY[2] = {0.5f, 0.5f}; -}; - -template <> -struct MultisampleTraits<SWR_MULTISAMPLE_4X, false> -{ - INLINE static float X(uint32_t sampleNum) - { - SWR_ASSERT(sampleNum < numSamples); - return samplePosX[sampleNum]; - }; - INLINE static float Y(uint32_t sampleNum) - { - SWR_ASSERT(sampleNum < numSamples); - return samplePosY[sampleNum]; - }; - INLINE static simdscalari FullSampleMask() - { - static const simdscalari mask = _simd_set1_epi32(0xF); - return mask; - } - - static const uint32_t numSamples = 4; - static const uint32_t numCoverageSamples = 4; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X; - static constexpr uint32_t samplePosXi[4] = {0x60, 0xE0, 0x20, 0xA0}; - static constexpr uint32_t samplePosYi[4] = {0x20, 0x60, 0xA0, 0xE0}; - static constexpr float samplePosX[4] = {0.375f, 0.875f, 0.125f, 0.625f}; - static constexpr float samplePosY[4] = {0.125f, 0.375f, 0.625f, 0.875f}; -}; - -template <> -struct MultisampleTraits<SWR_MULTISAMPLE_4X, true> -{ - INLINE static float X(uint32_t sampleNum) { return 0.5f; }; - INLINE static float Y(uint32_t sampleNum) { return 0.5f; }; - INLINE static simdscalari FullSampleMask() - { - static const simdscalari mask = _simd_set1_epi32(0xF); - return mask; - } - - static const uint32_t numSamples = 4; - static const uint32_t numCoverageSamples = 1; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X; - static constexpr uint32_t samplePosXi[4] = {0x80, 0x80, 0x80, 0x80}; - static constexpr uint32_t samplePosYi[4] = {0x80, 0x80, 0x80, 0x80}; - static constexpr float samplePosX[4] = {0.5f, 0.5f, 0.5f, 0.5f}; - static constexpr float samplePosY[4] = {0.5f, 0.5f, 0.5f, 0.5f}; -}; - -template <> -struct MultisampleTraits<SWR_MULTISAMPLE_8X, false> -{ - INLINE static float X(uint32_t sampleNum) - { - SWR_ASSERT(sampleNum < numSamples); - return samplePosX[sampleNum]; - }; - INLINE static 
float Y(uint32_t sampleNum) - { - SWR_ASSERT(sampleNum < numSamples); - return samplePosY[sampleNum]; - }; - INLINE static simdscalari FullSampleMask() - { - static const simdscalari mask = _simd_set1_epi32(0xFF); - return mask; - } - - static const uint32_t numSamples = 8; - static const uint32_t numCoverageSamples = 8; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X; - static constexpr uint32_t samplePosXi[8] = {0x90, 0x70, 0xD0, 0x50, 0x30, 0x10, 0xB0, 0xF0}; - static constexpr uint32_t samplePosYi[8] = {0x50, 0xB0, 0x90, 0x30, 0xD0, 0x70, 0xF0, 0x10}; - static constexpr float samplePosX[8] = { - 0.5625f, 0.4375f, 0.8125f, 0.3125f, 0.1875f, 0.0625f, 0.6875f, 0.9375f}; - static constexpr float samplePosY[8] = { - 0.3125f, 0.6875f, 0.5625f, 0.1875f, 0.8125f, 0.4375f, 0.9375f, 0.0625f}; -}; - -template <> -struct MultisampleTraits<SWR_MULTISAMPLE_8X, true> -{ - INLINE static float X(uint32_t sampleNum) { return 0.5f; }; - INLINE static float Y(uint32_t sampleNum) { return 0.5f; }; - INLINE static simdscalari FullSampleMask() - { - static const simdscalari mask = _simd_set1_epi32(0xFF); - return mask; - } - static const uint32_t numSamples = 8; - static const uint32_t numCoverageSamples = 1; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X; - static constexpr uint32_t samplePosXi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - static constexpr uint32_t samplePosYi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - static constexpr float samplePosX[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; - static constexpr float samplePosY[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; -}; - -template <> -struct MultisampleTraits<SWR_MULTISAMPLE_16X, false> -{ - INLINE static float X(uint32_t sampleNum) - { - SWR_ASSERT(sampleNum < numSamples); - return samplePosX[sampleNum]; - }; - INLINE static float Y(uint32_t sampleNum) - { - SWR_ASSERT(sampleNum < numSamples); - return samplePosY[sampleNum]; - }; - 
INLINE static simdscalari FullSampleMask() - { - static const simdscalari mask = _simd_set1_epi32(0xFFFF); - return mask; - } - - static const uint32_t numSamples = 16; - static const uint32_t numCoverageSamples = 16; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X; - static constexpr uint32_t samplePosXi[16] = {0x90, - 0x70, - 0x50, - 0xC0, - 0x30, - 0xA0, - 0xD0, - 0xB0, - 0x60, - 0x80, - 0x40, - 0x20, - 0x00, - 0xF0, - 0xE0, - 0x10}; - static constexpr uint32_t samplePosYi[16] = {0x90, - 0x50, - 0xA0, - 0x70, - 0x60, - 0xD0, - 0xB0, - 0x30, - 0xE0, - 0x10, - 0x20, - 0xC0, - 0x80, - 0x40, - 0xF0, - 0x00}; - static constexpr float samplePosX[16] = {0.5625f, - 0.4375f, - 0.3125f, - 0.7500f, - 0.1875f, - 0.6250f, - 0.8125f, - 0.6875f, - 0.3750f, - 0.5000f, - 0.2500f, - 0.1250f, - 0.0000f, - 0.9375f, - 0.8750f, - 0.0625f}; - static constexpr float samplePosY[16] = {0.5625f, - 0.3125f, - 0.6250f, - 0.4375f, - 0.3750f, - 0.8125f, - 0.6875f, - 0.1875f, - 0.8750f, - 0.0625f, - 0.1250f, - 0.7500f, - 0.5000f, - 0.2500f, - 0.9375f, - 0.0000f}; -}; - -template <> -struct MultisampleTraits<SWR_MULTISAMPLE_16X, true> -{ - INLINE static float X(uint32_t sampleNum) { return 0.5f; }; - INLINE static float Y(uint32_t sampleNum) { return 0.5f; }; - INLINE static simdscalari FullSampleMask() - { - static const simdscalari mask = _simd_set1_epi32(0xFFFF); - return mask; - } - static const uint32_t numSamples = 16; - static const uint32_t numCoverageSamples = 1; - static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X; - static constexpr uint32_t samplePosXi[16] = {0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80}; - static constexpr uint32_t samplePosYi[16] = {0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80, - 0x80}; - static constexpr float samplePosX[16] = {0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, 
- 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f}; - static constexpr float samplePosY[16] = {0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f, - 0.5f}; -}; - -INLINE -bool isNonStandardPattern(const SWR_MULTISAMPLE_COUNT sampleCount, - const SWR_MULTISAMPLE_POS& samplePos) -{ - // detect if we're using standard or center sample patterns - const uint32_t *standardPosX, *standardPosY; - switch (sampleCount) - { - case SWR_MULTISAMPLE_1X: - standardPosX = MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosXi; - standardPosY = MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosYi; - break; - case SWR_MULTISAMPLE_2X: - standardPosX = MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosXi; - standardPosY = MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosYi; - break; - case SWR_MULTISAMPLE_4X: - standardPosX = MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosXi; - standardPosY = MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosYi; - break; - case SWR_MULTISAMPLE_8X: - standardPosX = MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosXi; - standardPosY = MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosYi; - break; - case SWR_MULTISAMPLE_16X: - standardPosX = MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosXi; - standardPosY = MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosYi; - break; - default: - break; - } - - // scan sample pattern for standard or center - uint32_t numSamples = GetNumSamples(sampleCount); - bool bIsStandard = true; - if (numSamples > 1) - { - for (uint32_t i = 0; i < numSamples; i++) - { - bIsStandard = - (standardPosX[i] == samplePos.Xi(i)) || (standardPosY[i] == samplePos.Yi(i)); - if (!bIsStandard) - break; - } - } - return !bIsStandard; -} diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h deleted file mode 100644 index adfc1414bae..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ /dev/null @@ 
-1,1676 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file pa.h - * - * @brief Definitions for primitive assembly. - * N primitives are assembled at a time, where N is the SIMD width. - * A state machine, that is specific for a given topology, drives the - * assembly of vertices into triangles. 
- * - ******************************************************************************/ -#pragma once - -#include "frontend.h" - -struct PA_STATE -{ -#if USE_SIMD16_FRONTEND - enum - { - SIMD_WIDTH = KNOB_SIMD16_WIDTH, - SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2, - SIMD_WIDTH_LOG2 = 4 - }; - - typedef simd16mask SIMDMASK; - - typedef simd16scalar SIMDSCALAR; - typedef simd16vector SIMDVECTOR; - typedef simd16vertex SIMDVERTEX; - - typedef simd16scalari SIMDSCALARI; - -#else - enum - { - SIMD_WIDTH = KNOB_SIMD_WIDTH, - SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2, - SIMD_WIDTH_LOG2 = 3 - }; - - typedef simdmask SIMDMASK; - - typedef simdscalar SIMDSCALAR; - typedef simdvector SIMDVECTOR; - typedef simdvertex SIMDVERTEX; - - typedef simdscalari SIMDSCALARI; - -#endif - DRAW_CONTEXT* pDC{nullptr}; // draw context - uint8_t* pStreamBase{nullptr}; // vertex stream - uint32_t streamSizeInVerts{0}; // total size of the input stream in verts - uint32_t vertexStride{0}; // stride of a vertex in simdvector units - - // The topology the binner will use. In some cases the FE changes the topology from the api - // state. 
- PRIMITIVE_TOPOLOGY binTopology{TOP_UNKNOWN}; - -#if ENABLE_AVX512_SIMD16 - bool useAlternateOffset{false}; -#endif - - bool viewportArrayActive{false}; - bool rtArrayActive{false}; - uint32_t numVertsPerPrim{0}; - - PA_STATE() {} - PA_STATE(DRAW_CONTEXT* in_pDC, - uint8_t* in_pStreamBase, - uint32_t in_streamSizeInVerts, - uint32_t in_vertexStride, - uint32_t in_numVertsPerPrim) : - pDC(in_pDC), - pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), - vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim) - { - } - - virtual bool HasWork() = 0; - virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0; -#if ENABLE_AVX512_SIMD16 - virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0; -#endif - virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0; -#if ENABLE_AVX512_SIMD16 - virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0; -#endif - virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0; - virtual bool NextPrim() = 0; - virtual SIMDVERTEX& GetNextVsOutput() = 0; - virtual bool GetNextStreamOutput() = 0; - virtual SIMDMASK& GetNextVsIndices() = 0; - virtual uint32_t NumPrims() = 0; - virtual void Reset() = 0; - virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0; -}; - -// The Optimized PA is a state machine that assembles triangles from vertex shader simd -// output. Here is the sequence -// 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd). -// 2. Execute PA function to assemble and bin triangles. -// a. The PA function is a set of functions that collectively make up the -// state machine for a given topology. -// 1. We use a state index to track which PA function to call. -// b. Often the PA function needs to 2 simd vertices in order to assemble the next triangle. -// 1. We call this the current and previous simd vertex. -// 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. 
In -// order to assemble the second triangle, for a triangle list, we'll need the -// last vertex from the previous simd and the first 2 vertices from the current -// simd. -// 3. At times the PA can assemble multiple triangles from the 2 simd vertices. -// -// This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without -// cuts -struct PA_STATE_OPT : public PA_STATE -{ - uint32_t numPrims{0}; // Total number of primitives for draw. - uint32_t numPrimsComplete{0}; // Total number of complete primitives. - - uint32_t numSimdPrims{0}; // Number of prims in current simd. - - uint32_t cur{0}; // index to current VS output. - uint32_t prev{0}; // index to prev VS output. Not really needed in the state. - const uint32_t first{0}; // index to first VS output. Used for tri fan and line loop. - - uint32_t counter{0}; // state counter - bool reset{false}; // reset state - - uint32_t primIDIncr{0}; // how much to increment for each vector (typically vector / {1, 2}) - SIMDSCALARI primID; - - typedef bool (*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -#if ENABLE_AVX512_SIMD16 - typedef bool (*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -#endif - typedef void (*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, - uint32_t slot, - uint32_t primIndex, - simd4scalar verts[]); - - PFN_PA_FUNC pfnPaFunc{nullptr}; // PA state machine function for assembling 4 triangles. -#if ENABLE_AVX512_SIMD16 - PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{nullptr}; -#endif - PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ - nullptr}; // PA state machine function for assembling single triangle. 
- PFN_PA_FUNC pfnPaFuncReset{nullptr}; // initial state to set on reset -#if ENABLE_AVX512_SIMD16 - PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{nullptr}; -#endif - - // state used to advance the PA when Next is called - PFN_PA_FUNC pfnPaNextFunc{nullptr}; -#if ENABLE_AVX512_SIMD16 - PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{nullptr}; -#endif - uint32_t nextNumSimdPrims{0}; - uint32_t nextNumPrimsIncrement{0}; - bool nextReset{false}; - bool isStreaming{false}; - - SIMDMASK junkIndices{0}; // temporary index store for unused virtual function - - PA_STATE_OPT() {} - PA_STATE_OPT(DRAW_CONTEXT* pDC, - uint32_t numPrims, - uint8_t* pStream, - uint32_t streamSizeInVerts, - uint32_t vertexStride, - bool in_isStreaming, - uint32_t numVertsPerPrim, - PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN); - - bool HasWork() { return (this->numPrimsComplete < this->numPrims) ? true : false; } - - simdvector& GetSimdVector(uint32_t index, uint32_t slot) - { - SWR_ASSERT(slot < vertexStride); - uint32_t offset = index * vertexStride + slot; - simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset]; - return vertexSlot; - } - -#if ENABLE_AVX512_SIMD16 - simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) - { - SWR_ASSERT(slot < vertexStride); - uint32_t offset = index * vertexStride + slot; - simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset]; - return vertexSlot; - } - -#endif - // Assembles 4 triangles. Each simdvector is a single vertex from 4 - // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle. - bool Assemble(uint32_t slot, simdvector verts[]) { return this->pfnPaFunc(*this, slot, verts); } - -#if ENABLE_AVX512_SIMD16 - bool Assemble(uint32_t slot, simd16vector verts[]) - { - return this->pfnPaFunc_simd16(*this, slot, verts); - } - -#endif - // Assembles 1 primitive. Each simdscalar is a vertex (xyzw). 
- void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) - { - return this->pfnPaSingleFunc(*this, slot, primIndex, verts); - } - - bool NextPrim() - { - this->pfnPaFunc = this->pfnPaNextFunc; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16; -#endif - this->numSimdPrims = this->nextNumSimdPrims; - this->numPrimsComplete += this->nextNumPrimsIncrement; - this->reset = this->nextReset; - - if (this->isStreaming) - { - this->reset = false; - } - - bool morePrims = false; - - if (this->numSimdPrims > 0) - { - morePrims = true; - this->numSimdPrims--; - } - else - { - this->counter = (this->reset) ? 0 : (this->counter + 1); - this->reset = false; - } - - if (!HasWork()) - { - morePrims = false; // no more to do - } - - return morePrims; - } - - SIMDVERTEX& GetNextVsOutput() - { - const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH; - - // increment cur and prev indices - if (counter < numSimdVerts) - { - // prev undefined for first state - prev = cur; - cur = counter; - } - else - { - // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in - // the buffer - uint32_t temp = prev; - - prev = cur; - cur = temp; - } - - SWR_ASSERT(cur < numSimdVerts); - SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride]; - - return *(SIMDVERTEX*)pVertex; - } - - SIMDMASK& GetNextVsIndices() - { - // unused in optimized PA, pass tmp buffer back - return junkIndices; - } - - bool GetNextStreamOutput() - { - this->prev = this->cur; - this->cur = this->counter; - - return HasWork(); - } - - uint32_t NumPrims() - { - return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) - ? 
(SIMD_WIDTH - - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) - : SIMD_WIDTH; - } - - void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, - PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, - uint32_t numSimdPrims = 0, - uint32_t numPrimsIncrement = 0, - bool reset = false) - { - this->pfnPaNextFunc = pfnPaNextFunc; - this->nextNumSimdPrims = numSimdPrims; - this->nextNumPrimsIncrement = numPrimsIncrement; - this->nextReset = reset; - - this->pfnPaSingleFunc = pfnPaNextSingleFunc; - } - -#if ENABLE_AVX512_SIMD16 - void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16, - PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, - PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, - uint32_t numSimdPrims = 0, - uint32_t numPrimsIncrement = 0, - bool reset = false) - { - this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16; - this->pfnPaNextFunc = pfnPaNextFunc; - this->nextNumSimdPrims = numSimdPrims; - this->nextNumPrimsIncrement = numPrimsIncrement; - this->nextReset = reset; - - this->pfnPaSingleFunc = pfnPaNextSingleFunc; - } - -#endif - void Reset() - { -#if ENABLE_AVX512_SIMD16 - useAlternateOffset = false; - -#endif - this->pfnPaFunc = this->pfnPaFuncReset; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16; -#endif - this->numPrimsComplete = 0; - this->numSimdPrims = 0; - this->cur = 0; - this->prev = 0; - this->counter = 0; - this->reset = false; - } - - SIMDSCALARI GetPrimID(uint32_t startID) - { -#if USE_SIMD16_FRONTEND - return _simd16_add_epi32( - this->primID, - _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH))); -#else - return _simd_add_epi32( - this->primID, - _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH))); -#endif - } -}; - -// helper C wrappers to avoid having to rewrite all the PA topology state functions -INLINE void SetNextPaState(PA_STATE_OPT& pa, - PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, - 
PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, - uint32_t numSimdPrims = 0, - uint32_t numPrimsIncrement = 0, - bool reset = false) -{ - return pa.SetNextState( - pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); -} - -#if ENABLE_AVX512_SIMD16 -INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, - PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16, - PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, - PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, - uint32_t numSimdPrims = 0, - uint32_t numPrimsIncrement = 0, - bool reset = false) -{ - return pa.SetNextState_simd16(pfnPaNextFunc_simd16, - pfnPaNextFunc, - pfnPaNextSingleFunc, - numSimdPrims, - numPrimsIncrement, - reset); -} - -#endif -INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot) -{ - return pa.GetSimdVector(index, slot); -} - -#if ENABLE_AVX512_SIMD16 -INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot) -{ - return pa.GetSimdVector_simd16(index, slot); -} - -#endif -// Cut-aware primitive assembler. 
-struct PA_STATE_CUT : public PA_STATE -{ - SIMDMASK* pCutIndices{nullptr}; // cut indices buffer, 1 bit per vertex - uint32_t numVerts{0}; // number of vertices available in buffer store - uint32_t numAttribs{0}; // number of attributes - int32_t numRemainingVerts{0}; // number of verts remaining to be assembled - uint32_t numVertsToAssemble{0}; // total number of verts to assemble for the draw -#if ENABLE_AVX512_SIMD16 - OSALIGNSIMD16(uint32_t) - indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather -#else - OSALIGNSIMD(uint32_t) - indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather -#endif - SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd - uint32_t numPrimsAssembled{0}; // number of primitives that are fully assembled - uint32_t headVertex{0}; // current unused vertex slot in vertex buffer store - uint32_t tailVertex{0}; // beginning vertex currently assembling - uint32_t curVertex{0}; // current unprocessed vertex - uint32_t startPrimId{0}; // starting prim id - SIMDSCALARI vPrimId; // vector of prim ID - bool needOffsets{false}; // need to compute gather offsets for current SIMD - uint32_t vertsPerPrim{0}; - bool processCutVerts{ - false}; // vertex indices with cuts should be processed as normal, otherwise they - // are ignored. 
Fetch shader sends invalid verts on cuts that should be ignored - // while the GS sends valid verts for every index - - simdvector junkVector; // junk simdvector for unimplemented API -#if ENABLE_AVX512_SIMD16 - simd16vector junkVector_simd16; // junk simd16vector for unimplemented API -#endif - - // Topology state tracking - uint32_t vert[MAX_NUM_VERTS_PER_PRIM]; - uint32_t curIndex{0}; - bool reverseWinding{false}; // indicates reverse winding for strips - int32_t adjExtraVert{0}; // extra vert uses for tristrip w/ adj - - typedef void (PA_STATE_CUT::*PFN_PA_FUNC)(uint32_t vert, bool finish); - PFN_PA_FUNC pfnPa{nullptr}; // per-topology function that processes a single vert - - PA_STATE_CUT() {} - PA_STATE_CUT(DRAW_CONTEXT* pDC, - uint8_t* in_pStream, - uint32_t in_streamSizeInVerts, - uint32_t in_vertexStride, - SIMDMASK* in_pIndices, - uint32_t in_numVerts, - uint32_t in_numAttribs, - PRIMITIVE_TOPOLOGY topo, - bool in_processCutVerts, - uint32_t in_numVertsPerPrim) : - PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim) - { - numVerts = in_streamSizeInVerts; - numAttribs = in_numAttribs; - binTopology = topo; - needOffsets = false; - processCutVerts = in_processCutVerts; - - numVertsToAssemble = numRemainingVerts = in_numVerts; - numPrimsAssembled = 0; - headVertex = tailVertex = curVertex = 0; - - curIndex = 0; - pCutIndices = in_pIndices; - memset(indices, 0, sizeof(indices)); -#if USE_SIMD16_FRONTEND - vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); -#else - vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); -#endif - reverseWinding = false; - adjExtraVert = -1; - - bool gsEnabled = pDC->pState->state.gsState.gsEnable; - vertsPerPrim = NumVertsPerPrim(topo, gsEnabled); - - switch (topo) - { - case TOP_TRIANGLE_LIST: - pfnPa = &PA_STATE_CUT::ProcessVertTriList; - break; - case TOP_TRI_LIST_ADJ: - pfnPa = gsEnabled ? 
&PA_STATE_CUT::ProcessVertTriListAdj - : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; - break; - case TOP_TRIANGLE_STRIP: - pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; - break; - case TOP_TRI_STRIP_ADJ: - if (gsEnabled) - { - pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>; - } - else - { - pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>; - } - break; - - case TOP_POINT_LIST: - pfnPa = &PA_STATE_CUT::ProcessVertPointList; - break; - case TOP_LINE_LIST: - pfnPa = &PA_STATE_CUT::ProcessVertLineList; - break; - case TOP_LINE_LIST_ADJ: - pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj - : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; - break; - case TOP_LINE_STRIP: - pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; - break; - case TOP_LISTSTRIP_ADJ: - pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj - : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; - break; - case TOP_RECT_LIST: - pfnPa = &PA_STATE_CUT::ProcessVertRectList; - break; - default: - assert(0 && "Unimplemented topology"); - } - } - - SIMDVERTEX& GetNextVsOutput() - { - uint32_t vertexIndex = this->headVertex / SIMD_WIDTH; - this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts; - this->needOffsets = true; - SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride]; - - return *(SIMDVERTEX*)pVertex; - } - - SIMDMASK& GetNextVsIndices() - { - uint32_t vertexIndex = this->headVertex / SIMD_WIDTH; - SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex; - return *pCurCutIndex; - } - - simdvector& GetSimdVector(uint32_t index, uint32_t slot) - { - // unused - SWR_ASSERT(0 && "Not implemented"); - return junkVector; - } - -#if ENABLE_AVX512_SIMD16 - simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) - { - // unused - SWR_ASSERT(0 && "Not implemented"); - return junkVector_simd16; - } - -#endif - bool GetNextStreamOutput() - { - this->headVertex += SIMD_WIDTH; - this->needOffsets = true; - return HasWork(); - } - - SIMDSCALARI GetPrimID(uint32_t 
startID) - { -#if USE_SIMD16_FRONTEND - return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId); -#else - return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId); -#endif - } - - void Reset() - { -#if ENABLE_AVX512_SIMD16 - useAlternateOffset = false; - -#endif - this->numRemainingVerts = this->numVertsToAssemble; - this->numPrimsAssembled = 0; - this->curIndex = 0; - this->curVertex = 0; - this->tailVertex = 0; - this->headVertex = 0; - this->reverseWinding = false; - this->adjExtraVert = -1; -#if USE_SIMD16_FRONTEND - this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); -#else - this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); -#endif - } - - bool HasWork() { return this->numRemainingVerts > 0 || this->adjExtraVert != -1; } - - bool IsVertexStoreFull() - { - return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex; - } - - void RestartTopology() - { - this->curIndex = 0; - this->reverseWinding = false; - this->adjExtraVert = -1; - } - - bool IsCutIndex(uint32_t vertex) - { - uint32_t vertexIndex = vertex / SIMD_WIDTH; - uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1); - return CheckBit(this->pCutIndices[vertexIndex], vertexOffset); - } - - // iterates across the unprocessed verts until we hit the end or we - // have assembled SIMD prims - void ProcessVerts() - { - while (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0 && - this->curVertex != this->headVertex) - { - // if cut index, restart topology - if (IsCutIndex(this->curVertex)) - { - if (this->processCutVerts) - { - (this->*pfnPa)(this->curVertex, false); - } - // finish off tri strip w/ adj before restarting topo - if (this->adjExtraVert != -1) - { - (this->*pfnPa)(this->curVertex, true); - } - RestartTopology(); - } - else - { - (this->*pfnPa)(this->curVertex, false); - } - - this->curVertex++; - if (this->curVertex >= this->numVerts) - { - this->curVertex = 0; - } - this->numRemainingVerts--; - } 
- - // special case last primitive for tri strip w/ adj - if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && - this->adjExtraVert != -1) - { - (this->*pfnPa)(this->curVertex, true); - } - } - - void Advance() - { - // done with current batch - // advance tail to the current unsubmitted vertex - this->tailVertex = this->curVertex; - this->numPrimsAssembled = 0; -#if USE_SIMD16_FRONTEND - this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH)); -#else - this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH)); -#endif - } - - bool NextPrim() - { - // if we've assembled enough prims, we can advance to the next set of verts - if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0) - { - Advance(); - } - return false; - } - - void ComputeOffsets() - { - for (uint32_t v = 0; v < this->vertsPerPrim; ++v) - { - uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR); - SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0]; - - // step to simdvertex batch - const uint32_t simdShift = SIMD_WIDTH_LOG2; -#if USE_SIMD16_FRONTEND - SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift); - this->vOffsets[v] = - _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes)); -#else - SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift); - this->vOffsets[v] = - _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes)); -#endif - - // step to index - const uint32_t simdMask = SIMD_WIDTH - 1; -#if USE_SIMD16_FRONTEND - SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask)); - this->vOffsets[v] = _simd16_add_epi32( - this->vOffsets[v], - _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float)))); -#else - SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask)); - this->vOffsets[v] = - _simd_add_epi32(this->vOffsets[v], - _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float)))); 
-#endif - } - } - - bool Assemble(uint32_t slot, simdvector* verts) - { - // process any outstanding verts - ProcessVerts(); - - // return false if we don't have enough prims assembled - if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0) - { - return false; - } - - // cache off gather offsets given the current SIMD set of indices the first time we get an - // assemble - if (this->needOffsets) - { - ComputeOffsets(); - this->needOffsets = false; - } - - for (uint32_t v = 0; v < this->vertsPerPrim; ++v) - { - SIMDSCALARI offsets = this->vOffsets[v]; - - // step to attribute -#if USE_SIMD16_FRONTEND - offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR))); -#else - offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR))); -#endif - - float* pBase = (float*)this->pStreamBase; - for (uint32_t c = 0; c < 4; ++c) - { -#if USE_SIMD16_FRONTEND - simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1); - - // Assigning to a temporary first to avoid an MSVC 2017 compiler bug - simdscalar t = - useAlternateOffset ? 
_simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0); - verts[v].v[c] = t; -#else - verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1); -#endif - - // move base to next component - pBase += SIMD_WIDTH; - } - } - - // compute the implied 4th vertex, v3 - if (this->binTopology == TOP_RECT_LIST) - { - for (uint32_t c = 0; c < 4; ++c) - { - // v1, v3 = v1 + v2 - v0, v2 - // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2] - simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]); - temp = _simd16_sub_ps(temp, verts[1].v[c]); - temp = _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010 - verts[1].v[c] = _simd16_extract_ps(temp, 0); - } - } - - return true; - } - -#if ENABLE_AVX512_SIMD16 - bool Assemble(uint32_t slot, simd16vector verts[]) - { - // process any outstanding verts - ProcessVerts(); - - // return false if we don't have enough prims assembled - if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0) - { - return false; - } - - // cache off gather offsets given the current SIMD set of indices the first time we get an - // assemble - if (this->needOffsets) - { - ComputeOffsets(); - this->needOffsets = false; - } - - for (uint32_t v = 0; v < this->vertsPerPrim; ++v) - { - SIMDSCALARI offsets = this->vOffsets[v]; - - // step to attribute -#if USE_SIMD16_FRONTEND - offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR))); -#else - offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector))); -#endif - - float* pBase = (float*)this->pStreamBase; - for (uint32_t c = 0; c < 4; ++c) - { -#if USE_SIMD16_FRONTEND - verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1); -#else - verts[v].v[c] = _simd16_insert_ps( - _simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0); -#endif - - // move base to next component - pBase += SIMD_WIDTH; - } - } - - // compute the implied 4th vertex, v3 - if (this->binTopology == TOP_RECT_LIST) - { - for 
(uint32_t c = 0; c < 4; ++c) - { - // v1, v3 = v1 + v2 - v0, v2 - // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2] - simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]); - temp = _simd16_sub_ps(temp, verts[1].v[c]); - verts[1].v[c] = - _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010 - } - } - - return true; - } - -#endif - void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3]) - { - // move to slot - for (uint32_t v = 0; v < this->vertsPerPrim; ++v) - { - uint32_t* pOffset = (uint32_t*)&this->vOffsets[v]; -#if USE_SIMD16_FRONTEND - uint32_t offset = - useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex]; -#else - uint32_t offset = pOffset[triIndex]; -#endif - offset += sizeof(SIMDVECTOR) * slot; - float* pVert = (float*)&tri[v]; - for (uint32_t c = 0; c < 4; ++c) - { - float* pComponent = (float*)(this->pStreamBase + offset); - pVert[c] = *pComponent; - offset += SIMD_WIDTH * sizeof(float); - } - } - - // compute the implied 4th vertex, v3 - if ((this->binTopology == TOP_RECT_LIST) && (triIndex % 2 == 1)) - { - // v1, v3 = v1 + v2 - v0, v2 - // v1 stored in tri[0], v0 stored in tri[1], v2 stored in tri[2] - float* pVert0 = (float*)&tri[1]; - float* pVert1 = (float*)&tri[0]; - float* pVert2 = (float*)&tri[2]; - float* pVert3 = (float*)&tri[1]; - for (uint32_t c = 0; c < 4; ++c) - { - pVert3[c] = pVert1[c] + pVert2[c] - pVert0[c]; - } - } - } - - uint32_t NumPrims() { return this->numPrimsAssembled; } - - // Per-topology functions - void ProcessVertTriStrip(uint32_t index, bool finish) - { - this->vert[this->curIndex] = index; - this->curIndex++; - if (this->curIndex == 3) - { - // assembled enough verts for prim, add to gather indices - this->indices[0][this->numPrimsAssembled] = this->vert[0]; - if (reverseWinding) - { - this->indices[1][this->numPrimsAssembled] = this->vert[2]; - this->indices[2][this->numPrimsAssembled] = this->vert[1]; - } - else - { - 
this->indices[1][this->numPrimsAssembled] = this->vert[1]; - this->indices[2][this->numPrimsAssembled] = this->vert[2]; - } - - // increment numPrimsAssembled - this->numPrimsAssembled++; - - // set up next prim state - this->vert[0] = this->vert[1]; - this->vert[1] = this->vert[2]; - this->curIndex = 2; - this->reverseWinding ^= 1; - } - } - - template <bool gsEnabled> - void AssembleTriStripAdj() - { - if (!gsEnabled) - { - this->vert[1] = this->vert[2]; - this->vert[2] = this->vert[4]; - - this->indices[0][this->numPrimsAssembled] = this->vert[0]; - this->indices[1][this->numPrimsAssembled] = this->vert[1]; - this->indices[2][this->numPrimsAssembled] = this->vert[2]; - - this->vert[4] = this->vert[2]; - this->vert[2] = this->vert[1]; - } - else - { - this->indices[0][this->numPrimsAssembled] = this->vert[0]; - this->indices[1][this->numPrimsAssembled] = this->vert[1]; - this->indices[2][this->numPrimsAssembled] = this->vert[2]; - this->indices[3][this->numPrimsAssembled] = this->vert[3]; - this->indices[4][this->numPrimsAssembled] = this->vert[4]; - this->indices[5][this->numPrimsAssembled] = this->vert[5]; - } - this->numPrimsAssembled++; - } - - template <bool gsEnabled> - void ProcessVertTriStripAdj(uint32_t index, bool finish) - { - // handle last primitive of tristrip - if (finish && this->adjExtraVert != -1) - { - this->vert[3] = this->adjExtraVert; - AssembleTriStripAdj<gsEnabled>(); - this->adjExtraVert = -1; - return; - } - - switch (this->curIndex) - { - case 0: - case 1: - case 2: - case 4: - this->vert[this->curIndex] = index; - this->curIndex++; - break; - case 3: - this->vert[5] = index; - this->curIndex++; - break; - case 5: - if (this->adjExtraVert == -1) - { - this->adjExtraVert = index; - } - else - { - this->vert[3] = index; - if (!gsEnabled) - { - AssembleTriStripAdj<gsEnabled>(); - - uint32_t nextTri[6]; - if (this->reverseWinding) - { - nextTri[0] = this->vert[4]; - nextTri[1] = this->vert[0]; - nextTri[2] = this->vert[2]; - nextTri[4] = 
this->vert[3]; - nextTri[5] = this->adjExtraVert; - } - else - { - nextTri[0] = this->vert[2]; - nextTri[1] = this->adjExtraVert; - nextTri[2] = this->vert[3]; - nextTri[4] = this->vert[4]; - nextTri[5] = this->vert[0]; - } - for (uint32_t i = 0; i < 6; ++i) - { - this->vert[i] = nextTri[i]; - } - - this->adjExtraVert = -1; - this->reverseWinding ^= 1; - } - else - { - this->curIndex++; - } - } - break; - case 6: - SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!"); - AssembleTriStripAdj<gsEnabled>(); - - uint32_t nextTri[6]; - if (this->reverseWinding) - { - nextTri[0] = this->vert[4]; - nextTri[1] = this->vert[0]; - nextTri[2] = this->vert[2]; - nextTri[4] = this->vert[3]; - nextTri[5] = this->adjExtraVert; - } - else - { - nextTri[0] = this->vert[2]; - nextTri[1] = this->adjExtraVert; - nextTri[2] = this->vert[3]; - nextTri[4] = this->vert[4]; - nextTri[5] = this->vert[0]; - } - for (uint32_t i = 0; i < 6; ++i) - { - this->vert[i] = nextTri[i]; - } - this->reverseWinding ^= 1; - this->adjExtraVert = index; - this->curIndex--; - break; - } - } - - void ProcessVertTriList(uint32_t index, bool finish) - { - this->vert[this->curIndex] = index; - this->curIndex++; - if (this->curIndex == 3) - { - // assembled enough verts for prim, add to gather indices - this->indices[0][this->numPrimsAssembled] = this->vert[0]; - this->indices[1][this->numPrimsAssembled] = this->vert[1]; - this->indices[2][this->numPrimsAssembled] = this->vert[2]; - - // increment numPrimsAssembled - this->numPrimsAssembled++; - - // set up next prim state - this->curIndex = 0; - } - } - - void ProcessVertTriListAdj(uint32_t index, bool finish) - { - this->vert[this->curIndex] = index; - this->curIndex++; - if (this->curIndex == 6) - { - // assembled enough verts for prim, add to gather indices - this->indices[0][this->numPrimsAssembled] = this->vert[0]; - this->indices[1][this->numPrimsAssembled] = this->vert[1]; - this->indices[2][this->numPrimsAssembled] = this->vert[2]; - 
this->indices[3][this->numPrimsAssembled] = this->vert[3]; - this->indices[4][this->numPrimsAssembled] = this->vert[4]; - this->indices[5][this->numPrimsAssembled] = this->vert[5]; - - // increment numPrimsAssembled - this->numPrimsAssembled++; - - // set up next prim state - this->curIndex = 0; - } - } - - void ProcessVertTriListAdjNoGs(uint32_t index, bool finish) - { - this->vert[this->curIndex] = index; - this->curIndex++; - if (this->curIndex == 6) - { - // assembled enough verts for prim, add to gather indices - this->indices[0][this->numPrimsAssembled] = this->vert[0]; - this->indices[1][this->numPrimsAssembled] = this->vert[2]; - this->indices[2][this->numPrimsAssembled] = this->vert[4]; - - // increment numPrimsAssembled - this->numPrimsAssembled++; - - // set up next prim state - this->curIndex = 0; - } - } - - void ProcessVertLineList(uint32_t index, bool finish) - { - this->vert[this->curIndex] = index; - this->curIndex++; - if (this->curIndex == 2) - { - this->indices[0][this->numPrimsAssembled] = this->vert[0]; - this->indices[1][this->numPrimsAssembled] = this->vert[1]; - - this->numPrimsAssembled++; - this->curIndex = 0; - } - } - - void ProcessVertLineStrip(uint32_t index, bool finish) - { - this->vert[this->curIndex] = index; - this->curIndex++; - if (this->curIndex == 2) - { - // assembled enough verts for prim, add to gather indices - this->indices[0][this->numPrimsAssembled] = this->vert[0]; - this->indices[1][this->numPrimsAssembled] = this->vert[1]; - - // increment numPrimsAssembled - this->numPrimsAssembled++; - - // set up next prim state - this->vert[0] = this->vert[1]; - this->curIndex = 1; - } - } - - void ProcessVertLineStripAdj(uint32_t index, bool finish) - { - this->vert[this->curIndex] = index; - this->curIndex++; - if (this->curIndex == 4) - { - // assembled enough verts for prim, add to gather indices - this->indices[0][this->numPrimsAssembled] = this->vert[0]; - this->indices[1][this->numPrimsAssembled] = this->vert[1]; - 
this->indices[2][this->numPrimsAssembled] = this->vert[2]; - this->indices[3][this->numPrimsAssembled] = this->vert[3]; - - // increment numPrimsAssembled - this->numPrimsAssembled++; - - // set up next prim state - this->vert[0] = this->vert[1]; - this->vert[1] = this->vert[2]; - this->vert[2] = this->vert[3]; - this->curIndex = 3; - } - } - - void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish) - { - this->vert[this->curIndex] = index; - this->curIndex++; - if (this->curIndex == 4) - { - // assembled enough verts for prim, add to gather indices - this->indices[0][this->numPrimsAssembled] = this->vert[1]; - this->indices[1][this->numPrimsAssembled] = this->vert[2]; - - // increment numPrimsAssembled - this->numPrimsAssembled++; - - // set up next prim state - this->vert[0] = this->vert[1]; - this->vert[1] = this->vert[2]; - this->vert[2] = this->vert[3]; - this->curIndex = 3; - } - } - - void ProcessVertLineListAdj(uint32_t index, bool finish) - { - this->vert[this->curIndex] = index; - this->curIndex++; - if (this->curIndex == 4) - { - this->indices[0][this->numPrimsAssembled] = this->vert[0]; - this->indices[1][this->numPrimsAssembled] = this->vert[1]; - this->indices[2][this->numPrimsAssembled] = this->vert[2]; - this->indices[3][this->numPrimsAssembled] = this->vert[3]; - - this->numPrimsAssembled++; - this->curIndex = 0; - } - } - - void ProcessVertLineListAdjNoGs(uint32_t index, bool finish) - { - this->vert[this->curIndex] = index; - this->curIndex++; - if (this->curIndex == 4) - { - this->indices[0][this->numPrimsAssembled] = this->vert[1]; - this->indices[1][this->numPrimsAssembled] = this->vert[2]; - - this->numPrimsAssembled++; - this->curIndex = 0; - } - } - - void ProcessVertPointList(uint32_t index, bool finish) - { - this->vert[this->curIndex] = index; - this->curIndex++; - if (this->curIndex == 1) - { - this->indices[0][this->numPrimsAssembled] = this->vert[0]; - this->numPrimsAssembled++; - this->curIndex = 0; - } - } - - void 
ProcessVertRectList(uint32_t index, bool finish) - { - this->vert[this->curIndex] = index; - this->curIndex++; - if (this->curIndex == 3) - { - // assembled enough verts for prim, add to gather indices - this->indices[0][this->numPrimsAssembled] = this->vert[0]; - this->indices[1][this->numPrimsAssembled] = this->vert[1]; - this->indices[2][this->numPrimsAssembled] = this->vert[2]; - - // second triangle in the rectangle - // v1, v3 = v1 + v2 - v0, v2 - this->indices[0][this->numPrimsAssembled + 1] = this->vert[1]; - this->indices[1][this->numPrimsAssembled + 1] = this->vert[0]; - this->indices[2][this->numPrimsAssembled + 1] = this->vert[2]; - - // increment numPrimsAssembled - this->numPrimsAssembled += 2; - - // set up next prim state - this->curIndex = 0; - } - } -}; - -// Primitive Assembly for data output from the DomainShader. -struct PA_TESS : PA_STATE -{ - PA_TESS(DRAW_CONTEXT* in_pDC, - const SIMDSCALAR* in_pVertData, - uint32_t in_attributeStrideInVectors, - uint32_t in_vertexStride, - uint32_t in_numAttributes, - uint32_t* (&in_ppIndices)[3], - uint32_t in_numPrims, - PRIMITIVE_TOPOLOGY in_binTopology, - uint32_t numVertsPerPrim, - bool SOA = true) : - - PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim), - m_pVertexData(in_pVertData), m_attributeStrideInVectors(in_attributeStrideInVectors), - m_numAttributes(in_numAttributes), m_numPrims(in_numPrims), m_SOA(SOA) - { -#if USE_SIMD16_FRONTEND - m_vPrimId = _simd16_setzero_si(); -#else - m_vPrimId = _simd_setzero_si(); -#endif - binTopology = in_binTopology; - m_ppIndices[0] = in_ppIndices[0]; - m_ppIndices[1] = in_ppIndices[1]; - m_ppIndices[2] = in_ppIndices[2]; - - switch (binTopology) - { - case TOP_POINT_LIST: - m_numVertsPerPrim = 1; - break; - - case TOP_LINE_LIST: - m_numVertsPerPrim = 2; - break; - - case TOP_TRIANGLE_LIST: - m_numVertsPerPrim = 3; - break; - - default: - SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__); - break; - } - } - - bool HasWork() { 
return m_numPrims != 0; } - - simdvector& GetSimdVector(uint32_t index, uint32_t slot) - { - SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__); - return junkVector; - } - -#if ENABLE_AVX512_SIMD16 - simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) - { - SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__); - return junkVector_simd16; - } - -#endif - static SIMDSCALARI GenPrimMask(uint32_t numPrims) - { - SWR_ASSERT(numPrims <= SIMD_WIDTH); -#if USE_SIMD16_FRONTEND - static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - - return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]); -#else - static const OSALIGNLINE(int32_t) - maskGen[SIMD_WIDTH * 2] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}; - - return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]); -#endif - } - - bool Assemble(uint32_t slot, simdvector verts[]) - { - SWR_ASSERT(slot < m_numAttributes); - - uint32_t numPrimsToAssemble = PA_TESS::NumPrims(); - if (0 == numPrimsToAssemble) - { - return false; - } - - SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble); - - const float* pBaseAttrib; - if (m_SOA) - { - pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; - } - else - { - const float* pVertData = (const float*)m_pVertexData; - pBaseAttrib = pVertData + slot * 4; - } - - for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) - { -#if USE_SIMD16_FRONTEND - SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]); -#else - SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]); -#endif - - const float* pBase = pBaseAttrib; - for (uint32_t c = 0; c < 4; ++c) - { -#if USE_SIMD16_FRONTEND - simd16scalar temp = - _simd16_mask_i32gather_ps(_simd16_setzero_ps(), - pBase, - indices, - _simd16_castsi_ps(mask), - 4 /* gcc doesn't like sizeof(float) */); - - verts[i].v[c] 
= - useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0); -#else - verts[i].v[c] = _simd_mask_i32gather_ps(_simd_setzero_ps(), - pBase, - indices, - _simd_castsi_ps(mask), - 4); // gcc doesn't like sizeof(float) -#endif - if (m_SOA) - { - pBase += m_attributeStrideInVectors * SIMD_WIDTH; - } - else - { - pBase += sizeof(float); - } - } - } - - return true; - } - -#if ENABLE_AVX512_SIMD16 - bool Assemble(uint32_t slot, simd16vector verts[]) - { - SWR_ASSERT(slot < m_numAttributes); - - uint32_t numPrimsToAssemble = PA_TESS::NumPrims(); - if (0 == numPrimsToAssemble) - { - return false; - } - - SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble); - - const float* pBaseAttrib; - if (m_SOA) - { - pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; - } - else - { - const float* pVertData = (const float*)m_pVertexData; - pBaseAttrib = pVertData + slot * 4; - } - - for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) - { -#if USE_SIMD16_FRONTEND - SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]); - if (!m_SOA) - { - indices = _simd16_mullo_epi32(indices, _simd16_set1_epi32(vertexStride / 4)); - } -#else - SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]); -#endif - - const float* pBase = pBaseAttrib; - for (uint32_t c = 0; c < 4; ++c) - { -#if USE_SIMD16_FRONTEND - verts[i].v[c] = _simd16_mask_i32gather_ps(_simd16_setzero_ps(), - pBase, - indices, - _simd16_castsi_ps(mask), - 4 /* gcc doesn't like sizeof(float) */); -#else - simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), - pBase, - indices, - _simd_castsi_ps(mask), - 4 /* gcc doesn't like sizeof(float) */); - verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); -#endif - if (m_SOA) - { - pBase += m_attributeStrideInVectors * SIMD_WIDTH; - } - else - { - pBase++; - } - } - } - - return true; - } - -#endif - void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) - { - 
SWR_ASSERT(slot < m_numAttributes); - - - SWR_ASSERT(primIndex < PA_TESS::NumPrims()); - - const float* pVertDataBase; - if (m_SOA) - { - pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; - } - else - { - const float* pVertData = (const float*)m_pVertexData; - pVertDataBase = pVertData + slot * 4; - }; - for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) - { -#if USE_SIMD16_FRONTEND - uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] - : m_ppIndices[i][primIndex]; - if (!m_SOA) - { - index *= (vertexStride / 4); - } -#else - uint32_t index = m_ppIndices[i][primIndex]; -#endif - const float* pVertData = pVertDataBase; - float* pVert = (float*)&verts[i]; - - for (uint32_t c = 0; c < 4; ++c) - { - pVert[c] = pVertData[index]; - if (m_SOA) - { - pVertData += m_attributeStrideInVectors * SIMD_WIDTH; - } - else - { - pVertData++; - } - } - - } - } - - bool NextPrim() - { - uint32_t numPrims = PA_TESS::NumPrims(); - m_numPrims -= numPrims; - m_ppIndices[0] += numPrims; - m_ppIndices[1] += numPrims; - m_ppIndices[2] += numPrims; - - return HasWork(); - } - - SIMDVERTEX& GetNextVsOutput() - { - SWR_NOT_IMPL; - return junkVertex; - } - - bool GetNextStreamOutput() - { - SWR_NOT_IMPL; - return false; - } - - SIMDMASK& GetNextVsIndices() - { - SWR_NOT_IMPL; - return junkIndices; - } - - uint32_t NumPrims() { return std::min<uint32_t>(m_numPrims, SIMD_WIDTH); } - - void Reset() { SWR_NOT_IMPL; } - - SIMDSCALARI GetPrimID(uint32_t startID) - { -#if USE_SIMD16_FRONTEND - return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId); -#else - return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId); -#endif - } - -private: - const SIMDSCALAR* m_pVertexData = nullptr; - uint32_t m_attributeStrideInVectors = 0; - uint32_t m_numAttributes = 0; - uint32_t m_numPrims = 0; - uint32_t* m_ppIndices[3]; - - uint32_t m_numVertsPerPrim = 0; - - SIMDSCALARI m_vPrimId; - - simdvector junkVector; // junk simdvector for 
unimplemented API -#if ENABLE_AVX512_SIMD16 - simd16vector junkVector_simd16; // junk simd16vector for unimplemented API -#endif - SIMDVERTEX junkVertex; // junk SIMDVERTEX for unimplemented API - SIMDMASK junkIndices; // temporary index store for unused virtual function - - bool m_SOA; -}; - -// Primitive Assembler factory class, responsible for creating and initializing the correct -// assembler based on state. -template <typename IsIndexedT, typename IsCutIndexEnabledT> -struct PA_FACTORY -{ - PA_FACTORY(DRAW_CONTEXT* pDC, - PRIMITIVE_TOPOLOGY in_topo, - uint32_t numVerts, - PA_STATE::SIMDVERTEX* pVertexStore, - uint32_t vertexStoreSize, - uint32_t vertexStride, - uint32_t numVertsPerPrim) : - topo(in_topo) - { -#if KNOB_ENABLE_CUT_AWARE_PA == TRUE - const API_STATE& state = GetApiState(pDC); - if ((IsIndexedT::value && IsCutIndexEnabledT::value && - (topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || topo == TOP_LINE_LIST || - topo == TOP_LINE_STRIP || topo == TOP_TRIANGLE_LIST)) || - - // non-indexed draws with adjacency topologies must use cut-aware PA until we add - // support for them in the optimized PA - (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || - topo == TOP_TRI_STRIP_ADJ)) - { - memset(&indexStore, 0, sizeof(indexStore)); - uint32_t numAttribs = state.feNumAttributes; - - new (&this->paCut) PA_STATE_CUT(pDC, - reinterpret_cast<uint8_t*>(pVertexStore), - vertexStoreSize * PA_STATE::SIMD_WIDTH, - vertexStride, - &this->indexStore[0], - numVerts, - numAttribs, - state.topology, - false, - numVertsPerPrim); - cutPA = true; - } - else -#endif - { - uint32_t numPrims = GetNumPrims(in_topo, numVerts); - new (&this->paOpt) PA_STATE_OPT(pDC, - numPrims, - reinterpret_cast<uint8_t*>(pVertexStore), - vertexStoreSize * PA_STATE::SIMD_WIDTH, - vertexStride, - false, - numVertsPerPrim); - cutPA = false; - } - } - - PA_STATE& GetPA() - { -#if KNOB_ENABLE_CUT_AWARE_PA == TRUE - if (cutPA) - { - return this->paCut; - } - 
else -#endif - { - return this->paOpt; - } - } - - PA_STATE_OPT paOpt; - PA_STATE_CUT paCut; - - bool cutPA{false}; - - PRIMITIVE_TOPOLOGY topo{TOP_UNKNOWN}; - - PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM]; -}; diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp deleted file mode 100644 index 25d7156ac63..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp +++ /dev/null @@ -1,3141 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file pa_avx.cpp - * - * @brief AVX implementation for primitive assembly. - * N primitives are assembled at a time, where N is the SIMD width. 
- * A state machine, that is specific for a given topology, drives the - * assembly of vertices into triangles. - * - ******************************************************************************/ -#include "context.h" -#include "pa.h" -#include "frontend.h" - -#if (KNOB_SIMD_WIDTH == 8) - -INLINE simd4scalar swizzleLane0(const simdscalar& x, - const simdscalar& y, - const simdscalar& z, - const simdscalar& w) -{ - simdscalar tmp0 = _mm256_unpacklo_ps(x, z); - simdscalar tmp1 = _mm256_unpacklo_ps(y, w); - return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); -} - -INLINE simd4scalar swizzleLane1(const simdscalar& x, - const simdscalar& y, - const simdscalar& z, - const simdscalar& w) -{ - simdscalar tmp0 = _mm256_unpacklo_ps(x, z); - simdscalar tmp1 = _mm256_unpacklo_ps(y, w); - return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); -} - -INLINE simd4scalar swizzleLane2(const simdscalar& x, - const simdscalar& y, - const simdscalar& z, - const simdscalar& w) -{ - simdscalar tmp0 = _mm256_unpackhi_ps(x, z); - simdscalar tmp1 = _mm256_unpackhi_ps(y, w); - return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); -} - -INLINE simd4scalar swizzleLane3(const simdscalar& x, - const simdscalar& y, - const simdscalar& z, - const simdscalar& w) -{ - simdscalar tmp0 = _mm256_unpackhi_ps(x, z); - simdscalar tmp1 = _mm256_unpackhi_ps(y, w); - return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); -} - -INLINE simd4scalar swizzleLane4(const simdscalar& x, - const simdscalar& y, - const simdscalar& z, - const simdscalar& w) -{ - simdscalar tmp0 = _mm256_unpacklo_ps(x, z); - simdscalar tmp1 = _mm256_unpacklo_ps(y, w); - return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); -} - -INLINE simd4scalar swizzleLane5(const simdscalar& x, - const simdscalar& y, - const simdscalar& z, - const simdscalar& w) -{ - simdscalar tmp0 = _mm256_unpacklo_ps(x, z); - simdscalar tmp1 = _mm256_unpacklo_ps(y, w); - return 
_mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); -} - -INLINE simd4scalar swizzleLane6(const simdscalar& x, - const simdscalar& y, - const simdscalar& z, - const simdscalar& w) -{ - simdscalar tmp0 = _mm256_unpackhi_ps(x, z); - simdscalar tmp1 = _mm256_unpackhi_ps(y, w); - return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); -} - -INLINE simd4scalar swizzleLane7(const simdscalar& x, - const simdscalar& y, - const simdscalar& z, - const simdscalar& w) -{ - simdscalar tmp0 = _mm256_unpackhi_ps(x, z); - simdscalar tmp1 = _mm256_unpackhi_ps(y, w); - return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); -} - -INLINE simd4scalar swizzleLane0(const simdvector& v) -{ - return swizzleLane0(v.x, v.y, v.z, v.w); -} - -INLINE simd4scalar swizzleLane1(const simdvector& v) -{ - return swizzleLane1(v.x, v.y, v.z, v.w); -} - -INLINE simd4scalar swizzleLane2(const simdvector& v) -{ - return swizzleLane2(v.x, v.y, v.z, v.w); -} - -INLINE simd4scalar swizzleLane3(const simdvector& v) -{ - return swizzleLane3(v.x, v.y, v.z, v.w); -} - -INLINE simd4scalar swizzleLane4(const simdvector& v) -{ - return swizzleLane4(v.x, v.y, v.z, v.w); -} - -INLINE simd4scalar swizzleLane5(const simdvector& v) -{ - return swizzleLane5(v.x, v.y, v.z, v.w); -} - -INLINE simd4scalar swizzleLane6(const simdvector& v) -{ - return swizzleLane6(v.x, v.y, v.z, v.w); -} - -INLINE simd4scalar swizzleLane7(const simdvector& v) -{ - return swizzleLane7(v.x, v.y, v.z, v.w); -} - -INLINE simd4scalar swizzleLaneN(const simdvector& v, int lane) -{ - switch (lane) - { - case 0: - return swizzleLane0(v); - case 1: - return swizzleLane1(v); - case 2: - return swizzleLane2(v); - case 3: - return swizzleLane3(v); - case 4: - return swizzleLane4(v); - case 5: - return swizzleLane5(v); - case 6: - return swizzleLane6(v); - case 7: - return swizzleLane7(v); - default: - return _mm_setzero_ps(); - } -} - -#if ENABLE_AVX512_SIMD16 -INLINE simd4scalar swizzleLane0(const simd16vector& v) -{ - return 
swizzleLane0(_simd16_extract_ps(v.x, 0), - _simd16_extract_ps(v.y, 0), - _simd16_extract_ps(v.z, 0), - _simd16_extract_ps(v.w, 0)); -} - -INLINE simd4scalar swizzleLane1(const simd16vector& v) -{ - return swizzleLane1(_simd16_extract_ps(v.x, 0), - _simd16_extract_ps(v.y, 0), - _simd16_extract_ps(v.z, 0), - _simd16_extract_ps(v.w, 0)); -} - -INLINE simd4scalar swizzleLane2(const simd16vector& v) -{ - return swizzleLane2(_simd16_extract_ps(v.x, 0), - _simd16_extract_ps(v.y, 0), - _simd16_extract_ps(v.z, 0), - _simd16_extract_ps(v.w, 0)); -} - -INLINE simd4scalar swizzleLane3(const simd16vector& v) -{ - return swizzleLane3(_simd16_extract_ps(v.x, 0), - _simd16_extract_ps(v.y, 0), - _simd16_extract_ps(v.z, 0), - _simd16_extract_ps(v.w, 0)); -} - -INLINE simd4scalar swizzleLane4(const simd16vector& v) -{ - return swizzleLane4(_simd16_extract_ps(v.x, 0), - _simd16_extract_ps(v.y, 0), - _simd16_extract_ps(v.z, 0), - _simd16_extract_ps(v.w, 0)); -} - -INLINE simd4scalar swizzleLane5(const simd16vector& v) -{ - return swizzleLane5(_simd16_extract_ps(v.x, 0), - _simd16_extract_ps(v.y, 0), - _simd16_extract_ps(v.z, 0), - _simd16_extract_ps(v.w, 0)); -} - -INLINE simd4scalar swizzleLane6(const simd16vector& v) -{ - return swizzleLane6(_simd16_extract_ps(v.x, 0), - _simd16_extract_ps(v.y, 0), - _simd16_extract_ps(v.z, 0), - _simd16_extract_ps(v.w, 0)); -} - -INLINE simd4scalar swizzleLane7(const simd16vector& v) -{ - return swizzleLane7(_simd16_extract_ps(v.x, 0), - _simd16_extract_ps(v.y, 0), - _simd16_extract_ps(v.z, 0), - _simd16_extract_ps(v.w, 0)); -} - -INLINE simd4scalar swizzleLane8(const simd16vector& v) -{ - return swizzleLane0(_simd16_extract_ps(v.x, 1), - _simd16_extract_ps(v.y, 1), - _simd16_extract_ps(v.z, 1), - _simd16_extract_ps(v.w, 1)); -} - -INLINE simd4scalar swizzleLane9(const simd16vector& v) -{ - return swizzleLane1(_simd16_extract_ps(v.x, 1), - _simd16_extract_ps(v.y, 1), - _simd16_extract_ps(v.z, 1), - _simd16_extract_ps(v.w, 1)); -} - -INLINE 
simd4scalar swizzleLaneA(const simd16vector& v) -{ - return swizzleLane2(_simd16_extract_ps(v.x, 1), - _simd16_extract_ps(v.y, 1), - _simd16_extract_ps(v.z, 1), - _simd16_extract_ps(v.w, 1)); -} - -INLINE simd4scalar swizzleLaneB(const simd16vector& v) -{ - return swizzleLane3(_simd16_extract_ps(v.x, 1), - _simd16_extract_ps(v.y, 1), - _simd16_extract_ps(v.z, 1), - _simd16_extract_ps(v.w, 1)); -} - -INLINE simd4scalar swizzleLaneC(const simd16vector& v) -{ - return swizzleLane4(_simd16_extract_ps(v.x, 1), - _simd16_extract_ps(v.y, 1), - _simd16_extract_ps(v.z, 1), - _simd16_extract_ps(v.w, 1)); -} - -INLINE simd4scalar swizzleLaneD(const simd16vector& v) -{ - return swizzleLane5(_simd16_extract_ps(v.x, 1), - _simd16_extract_ps(v.y, 1), - _simd16_extract_ps(v.z, 1), - _simd16_extract_ps(v.w, 1)); -} - -INLINE simd4scalar swizzleLaneE(const simd16vector& v) -{ - return swizzleLane6(_simd16_extract_ps(v.x, 1), - _simd16_extract_ps(v.y, 1), - _simd16_extract_ps(v.z, 1), - _simd16_extract_ps(v.w, 1)); -} - -INLINE simd4scalar swizzleLaneF(const simd16vector& v) -{ - return swizzleLane7(_simd16_extract_ps(v.x, 1), - _simd16_extract_ps(v.y, 1), - _simd16_extract_ps(v.z, 1), - _simd16_extract_ps(v.w, 1)); -} - -INLINE simd4scalar swizzleLaneN(const simd16vector& v, int lane) -{ - switch (lane) - { - case 0: - return swizzleLane0(v); - case 1: - return swizzleLane1(v); - case 2: - return swizzleLane2(v); - case 3: - return swizzleLane3(v); - case 4: - return swizzleLane4(v); - case 5: - return swizzleLane5(v); - case 6: - return swizzleLane6(v); - case 7: - return swizzleLane7(v); - case 8: - return swizzleLane8(v); - case 9: - return swizzleLane9(v); - case 10: - return swizzleLaneA(v); - case 11: - return swizzleLaneB(v); - case 12: - return swizzleLaneC(v); - case 13: - return swizzleLaneD(v); - case 14: - return swizzleLaneE(v); - case 15: - return swizzleLaneF(v); - default: - return _mm_setzero_ps(); - } -} - -#endif -bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, 
simdvector verts[]); -bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -#if ENABLE_AVX512_SIMD16 -bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -#endif -void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]); - -bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -#if ENABLE_AVX512_SIMD16 -bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -#endif -void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]); - -bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -#if ENABLE_AVX512_SIMD16 -bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -#endif -void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]); - -bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -#if ENABLE_AVX512_SIMD16 -bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -#endif -void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]); - -bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -#if 
ENABLE_AVX512_SIMD16 -bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -#endif -void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]); - -bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -#if ENABLE_AVX512_SIMD16 -bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -#endif -void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]); - -bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -#if ENABLE_AVX512_SIMD16 -bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -#endif -void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]); - -bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -#if ENABLE_AVX512_SIMD16 -bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -#endif -void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]); - -bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -#if ENABLE_AVX512_SIMD16 -bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); -#endif -void PaRectListSingle0(PA_STATE_OPT& pa, 
uint32_t slot, uint32_t primIndex, simd4scalar verts[]); - -template <uint32_t TotalControlPoints> -void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) -{ - // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output - // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute. - // Each attribute has 4 components. - - /// @todo Optimize this - -#if USE_SIMD16_FRONTEND - if (pa.useAlternateOffset) - { - primIndex += KNOB_SIMD_WIDTH; - } - -#endif - float* pOutVec = (float*)verts; - - for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) - { - uint32_t input_cp = primIndex * TotalControlPoints + cp; -#if USE_SIMD16_FRONTEND - uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH; - uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH; - -#else - uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; - uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; - -#endif - // Loop over all components of the attribute - for (uint32_t i = 0; i < 4; ++i) - { -#if USE_SIMD16_FRONTEND - const float* pInputVec = - (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]); -#else - const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); -#endif - pOutVec[cp * 4 + i] = pInputVec[input_lane]; - } - } -} - -template <uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1> -static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ - SetNextPaState(pa, - PaPatchList<TotalControlPoints, CurrentControlPoints + 1>, - PaPatchListSingle<TotalControlPoints>); - - return false; -} - -template <uint32_t TotalControlPoints> -static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ - // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output - // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute. - // Each attribute has 4 components. 
- - /// @todo Optimize this - -#if USE_SIMD16_FRONTEND - uint32_t lane_offset = 0; - - if (pa.useAlternateOffset) - { - lane_offset = KNOB_SIMD_WIDTH; - } - -#endif - // Loop over all components of the attribute - for (uint32_t i = 0; i < 4; ++i) - { - for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) - { - float vec[KNOB_SIMD_WIDTH]; - for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane) - { -#if USE_SIMD16_FRONTEND - uint32_t input_cp = (lane + lane_offset) * TotalControlPoints + cp; - uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH; - uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH; - - const float* pInputVec = - (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]); -#else - uint32_t input_cp = lane * TotalControlPoints + cp; - uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; - uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; - - const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); -#endif - vec[lane] = pInputVec[input_lane]; - } - verts[cp][i] = _simd_loadu_ps(vec); - } - } - - SetNextPaState(pa, - PaPatchList<TotalControlPoints>, - PaPatchListSingle<TotalControlPoints>, - 0, - PA_STATE_OPT::SIMD_WIDTH, - true); - - return true; -} - -#if ENABLE_AVX512_SIMD16 -template <uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1> -static bool PaPatchList_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - SetNextPaState_simd16(pa, - PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>, - PaPatchList<TotalControlPoints, CurrentControlPoints + 1>, - PaPatchListSingle<TotalControlPoints>); - - return false; -} - -template <uint32_t TotalControlPoints> -static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output - // KNOB_SIMD16_WIDTH * 1 patch. This function is called once per attribute. - // Each attribute has 4 components. 
- - /// @todo Optimize this - - // Loop over all components of the attribute - for (uint32_t i = 0; i < 4; ++i) - { - for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) - { - float vec[KNOB_SIMD16_WIDTH]; - for (uint32_t lane = 0; lane < KNOB_SIMD16_WIDTH; ++lane) - { - uint32_t input_cp = lane * TotalControlPoints + cp; - uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH; - uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH; - - const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); - vec[lane] = pInputVec[input_lane]; - } - verts[cp][i] = _simd16_loadu_ps(vec); - } - } - - SetNextPaState_simd16(pa, - PaPatchList_simd16<TotalControlPoints>, - PaPatchList<TotalControlPoints>, - PaPatchListSingle<TotalControlPoints>, - 0, - PA_STATE_OPT::SIMD_WIDTH, - true); - - return true; -} - -#endif -#define PA_PATCH_LIST_TERMINATOR(N) \ - template <> \ - bool PaPatchList<N, N>(PA_STATE_OPT & pa, uint32_t slot, simdvector verts[]) \ - { \ - return PaPatchListTerm<N>(pa, slot, verts); \ - } -PA_PATCH_LIST_TERMINATOR(1) -PA_PATCH_LIST_TERMINATOR(2) -PA_PATCH_LIST_TERMINATOR(3) -PA_PATCH_LIST_TERMINATOR(4) -PA_PATCH_LIST_TERMINATOR(5) -PA_PATCH_LIST_TERMINATOR(6) -PA_PATCH_LIST_TERMINATOR(7) -PA_PATCH_LIST_TERMINATOR(8) -PA_PATCH_LIST_TERMINATOR(9) -PA_PATCH_LIST_TERMINATOR(10) -PA_PATCH_LIST_TERMINATOR(11) -PA_PATCH_LIST_TERMINATOR(12) -PA_PATCH_LIST_TERMINATOR(13) -PA_PATCH_LIST_TERMINATOR(14) -PA_PATCH_LIST_TERMINATOR(15) -PA_PATCH_LIST_TERMINATOR(16) -PA_PATCH_LIST_TERMINATOR(17) -PA_PATCH_LIST_TERMINATOR(18) -PA_PATCH_LIST_TERMINATOR(19) -PA_PATCH_LIST_TERMINATOR(20) -PA_PATCH_LIST_TERMINATOR(21) -PA_PATCH_LIST_TERMINATOR(22) -PA_PATCH_LIST_TERMINATOR(23) -PA_PATCH_LIST_TERMINATOR(24) -PA_PATCH_LIST_TERMINATOR(25) -PA_PATCH_LIST_TERMINATOR(26) -PA_PATCH_LIST_TERMINATOR(27) -PA_PATCH_LIST_TERMINATOR(28) -PA_PATCH_LIST_TERMINATOR(29) -PA_PATCH_LIST_TERMINATOR(30) -PA_PATCH_LIST_TERMINATOR(31) -PA_PATCH_LIST_TERMINATOR(32) -#undef 
PA_PATCH_LIST_TERMINATOR - -#if ENABLE_AVX512_SIMD16 -#define PA_PATCH_LIST_TERMINATOR_SIMD16(N) \ - template <> \ - bool PaPatchList_simd16<N, N>(PA_STATE_OPT & pa, uint32_t slot, simd16vector verts[]) \ - { \ - return PaPatchListTerm_simd16<N>(pa, slot, verts); \ - } -PA_PATCH_LIST_TERMINATOR_SIMD16(1) -PA_PATCH_LIST_TERMINATOR_SIMD16(2) -PA_PATCH_LIST_TERMINATOR_SIMD16(3) -PA_PATCH_LIST_TERMINATOR_SIMD16(4) -PA_PATCH_LIST_TERMINATOR_SIMD16(5) -PA_PATCH_LIST_TERMINATOR_SIMD16(6) -PA_PATCH_LIST_TERMINATOR_SIMD16(7) -PA_PATCH_LIST_TERMINATOR_SIMD16(8) -PA_PATCH_LIST_TERMINATOR_SIMD16(9) -PA_PATCH_LIST_TERMINATOR_SIMD16(10) -PA_PATCH_LIST_TERMINATOR_SIMD16(11) -PA_PATCH_LIST_TERMINATOR_SIMD16(12) -PA_PATCH_LIST_TERMINATOR_SIMD16(13) -PA_PATCH_LIST_TERMINATOR_SIMD16(14) -PA_PATCH_LIST_TERMINATOR_SIMD16(15) -PA_PATCH_LIST_TERMINATOR_SIMD16(16) -PA_PATCH_LIST_TERMINATOR_SIMD16(17) -PA_PATCH_LIST_TERMINATOR_SIMD16(18) -PA_PATCH_LIST_TERMINATOR_SIMD16(19) -PA_PATCH_LIST_TERMINATOR_SIMD16(20) -PA_PATCH_LIST_TERMINATOR_SIMD16(21) -PA_PATCH_LIST_TERMINATOR_SIMD16(22) -PA_PATCH_LIST_TERMINATOR_SIMD16(23) -PA_PATCH_LIST_TERMINATOR_SIMD16(24) -PA_PATCH_LIST_TERMINATOR_SIMD16(25) -PA_PATCH_LIST_TERMINATOR_SIMD16(26) -PA_PATCH_LIST_TERMINATOR_SIMD16(27) -PA_PATCH_LIST_TERMINATOR_SIMD16(28) -PA_PATCH_LIST_TERMINATOR_SIMD16(29) -PA_PATCH_LIST_TERMINATOR_SIMD16(30) -PA_PATCH_LIST_TERMINATOR_SIMD16(31) -PA_PATCH_LIST_TERMINATOR_SIMD16(32) -#undef PA_PATCH_LIST_TERMINATOR_SIMD16 - -#endif -bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ - SetNextPaState(pa, PaTriList1, PaTriListSingle0); - return false; // Not enough vertices to assemble 4 or 8 triangles. -} - -bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ - SetNextPaState(pa, PaTriList2, PaTriListSingle0); - return false; // Not enough vertices to assemble 8 triangles. 
-} - -bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ -#if KNOB_ARCH == KNOB_ARCH_AVX -#if USE_SIMD16_FRONTEND - simdvector a; - simdvector b; - simdvector c; - - if (!pa.useAlternateOffset) - { - const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(a_16[i], 0); - b[i] = _simd16_extract_ps(a_16[i], 1); - c[i] = _simd16_extract_ps(b_16[i], 0); - } - } - else - { - const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); - const simd16vector& c_16 = PaGetSimdVector_simd16(pa, 2, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(b_16[i], 1); - b[i] = _simd16_extract_ps(c_16[i], 0); - c[i] = _simd16_extract_ps(c_16[i], 1); - } - } - -#else - simdvector& a = PaGetSimdVector(pa, 0, slot); - simdvector& b = PaGetSimdVector(pa, 1, slot); - simdvector& c = PaGetSimdVector(pa, 2, slot); - -#endif - simdscalar s; - - // Tri Pattern - provoking vertex is always v0 - // v0 -> 0 3 6 9 12 15 18 21 - // v1 -> 1 4 7 10 13 16 19 22 - // v2 -> 2 5 8 11 14 17 20 23 - - for (int i = 0; i < 4; ++i) - { - simdvector& v0 = verts[0]; - v0[i] = _simd_blend_ps(a[i], b[i], 0x92); - v0[i] = _simd_blend_ps(v0[i], c[i], 0x24); - v0[i] = _simd_permute_ps_i(v0[i], 0x6C); - s = _simd_permute2f128_ps(v0[i], v0[i], 0x21); - v0[i] = _simd_blend_ps(v0[i], s, 0x44); - - simdvector& v1 = verts[1]; - v1[i] = _simd_blend_ps(a[i], b[i], 0x24); - v1[i] = _simd_blend_ps(v1[i], c[i], 0x49); - v1[i] = _simd_permute_ps_i(v1[i], 0xB1); - s = _simd_permute2f128_ps(v1[i], v1[i], 0x21); - v1[i] = _simd_blend_ps(v1[i], s, 0x66); - - simdvector& v2 = verts[2]; - v2[i] = _simd_blend_ps(a[i], b[i], 0x49); - v2[i] = _simd_blend_ps(v2[i], c[i], 0x92); - v2[i] = _simd_permute_ps_i(v2[i], 0xC6); - s = _simd_permute2f128_ps(v2[i], v2[i], 0x21); - v2[i] = _simd_blend_ps(v2[i], s, 0x22); - } - -#elif KNOB_ARCH >= 
KNOB_ARCH_AVX2 - const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0); - const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1); - const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2); - -#if USE_SIMD16_FRONTEND - simdvector a; - simdvector b; - simdvector c; - - if (!pa.useAlternateOffset) - { - const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(a_16[i], 0); - b[i] = _simd16_extract_ps(a_16[i], 1); - c[i] = _simd16_extract_ps(b_16[i], 0); - } - } - else - { - const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); - const simd16vector& c_16 = PaGetSimdVector_simd16(pa, 2, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(b_16[i], 1); - b[i] = _simd16_extract_ps(c_16[i], 0); - c[i] = _simd16_extract_ps(c_16[i], 1); - } - } - -#else - const simdvector& a = PaGetSimdVector(pa, 0, slot); - const simdvector& b = PaGetSimdVector(pa, 1, slot); - const simdvector& c = PaGetSimdVector(pa, 2, slot); - -#endif - // v0 -> a0 a3 a6 b1 b4 b7 c2 c5 - // v1 -> a1 a4 a7 b2 b5 c0 c3 c6 - // v2 -> a2 a5 b0 b3 b6 c1 c4 c7 - - simdvector& v0 = verts[0]; - simdvector& v1 = verts[1]; - simdvector& v2 = verts[2]; - - // for simd x, y, z, and w - for (int i = 0; i < 4; ++i) - { - simdscalar temp0 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24); - simdscalar temp1 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49); - simdscalar temp2 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92); - - v0[i] = _simd_permute_ps(temp0, perm0); - v1[i] = _simd_permute_ps(temp1, perm1); - v2[i] = _simd_permute_ps(temp2, perm2); - } - -#endif - SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); - return true; -} - -#if ENABLE_AVX512_SIMD16 -bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector 
verts[]) -{ - SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriList1, PaTriListSingle0); - return false; // Not enough vertices to assemble 16 triangles -} - -bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriList2, PaTriListSingle0); - return false; // Not enough vertices to assemble 16 triangles -} - -bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - // clang-format off - -#if KNOB_ARCH >= KNOB_ARCH_AVX2 - const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0); - const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1); - const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2); -#else // KNOB_ARCH == KNOB_ARCH_AVX - simd16scalar perm0 = _simd16_setzero_ps(); - simd16scalar perm1 = _simd16_setzero_ps(); - simd16scalar perm2 = _simd16_setzero_ps(); -#endif - - const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot); - const simd16vector& c = PaGetSimdVector_simd16(pa, 2, slot); - - const simd16mask mask0 = 0x4924; - const simd16mask mask1 = 0x2492; - const simd16mask mask2 = 0x9249; - - // v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD - // v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE - // v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF - - simd16vector& v0 = verts[0]; - simd16vector& v1 = verts[1]; - simd16vector& v2 = verts[2]; - - // for simd16 x, y, z, and w - for (int i = 0; i < 4; i += 1) - { - simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i])); - simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i])); - simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float*>(&c[i])); - - simd16scalar temp0 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask0), tempc, mask1); - 
simd16scalar temp1 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask2), tempc, mask0); - simd16scalar temp2 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask1), tempc, mask2); - -#if KNOB_ARCH >= KNOB_ARCH_AVX2 - v0[i] = _simd16_permute_ps(temp0, perm0); - v1[i] = _simd16_permute_ps(temp1, perm1); - v2[i] = _simd16_permute_ps(temp2, perm2); -#else // #if KNOB_ARCH == KNOB_ARCH_AVX - - // the general permutes (above) are prohibitively slow to emulate on AVX (its scalar code) - - temp0 = _simd16_permute_ps_i(temp0, 0x6C); // (0, 3, 2, 1) => 00 11 01 10 => 0x6C - perm0 = _simd16_permute2f128_ps(temp0, temp0, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1 - temp0 = _simd16_blend_ps(temp0, perm0, 0x4444); // 0010 0010 0010 0010 - perm0 = _simd16_permute2f128_ps(temp0, temp0, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E - v0[i] = _simd16_blend_ps(temp0, perm0, 0x3838); // 0001 1100 0001 1100 - - temp1 = _simd16_permute_ps_i(temp1, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1 - perm1 = _simd16_permute2f128_ps(temp1, temp1, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1 - temp1 = _simd16_blend_ps(temp1, perm1, 0x6666); // 0010 0010 0010 0010 - perm1 = _simd16_permute2f128_ps(temp1, temp1, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E - v1[i] = _simd16_blend_ps(temp1, perm1, 0x1818); // 0001 1000 0001 1000 - - temp2 = _simd16_permute_ps_i(temp2, 0xC6); // (2, 1, 0, 3) => 01 10 00 11 => 0xC6 - perm2 = _simd16_permute2f128_ps(temp2, temp2, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1 - temp2 = _simd16_blend_ps(temp2, perm2, 0x2222); // 0100 0100 0100 0100 - perm2 = _simd16_permute2f128_ps(temp2, temp2, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E - v2[i] = _simd16_blend_ps(temp2, perm2, 0x1C1C); // 0011 1000 0011 1000 -#endif - } - - SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); - return true; - - // clang-format on -} - -#endif -void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t 
primIndex, simd4scalar verts[]) -{ -#if USE_SIMD16_FRONTEND - const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot); - const simd16vector& c = PaGetSimdVector_simd16(pa, 2, slot); - - if (pa.useAlternateOffset) - { - primIndex += KNOB_SIMD_WIDTH; - } - - // v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD - // v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE - // v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF - - switch (primIndex) - { - case 0: - verts[0] = swizzleLane0(a); - verts[1] = swizzleLane1(a); - verts[2] = swizzleLane2(a); - break; - case 1: - verts[0] = swizzleLane3(a); - verts[1] = swizzleLane4(a); - verts[2] = swizzleLane5(a); - break; - case 2: - verts[0] = swizzleLane6(a); - verts[1] = swizzleLane7(a); - verts[2] = swizzleLane8(a); - break; - case 3: - verts[0] = swizzleLane9(a); - verts[1] = swizzleLaneA(a); - verts[2] = swizzleLaneB(a); - break; - case 4: - verts[0] = swizzleLaneC(a); - verts[1] = swizzleLaneD(a); - verts[2] = swizzleLaneE(a); - break; - case 5: - verts[0] = swizzleLaneF(a); - verts[1] = swizzleLane0(b); - verts[2] = swizzleLane1(b); - break; - case 6: - verts[0] = swizzleLane2(b); - verts[1] = swizzleLane3(b); - verts[2] = swizzleLane4(b); - break; - case 7: - verts[0] = swizzleLane5(b); - verts[1] = swizzleLane6(b); - verts[2] = swizzleLane7(b); - break; - case 8: - verts[0] = swizzleLane8(b); - verts[1] = swizzleLane9(b); - verts[2] = swizzleLaneA(b); - break; - case 9: - verts[0] = swizzleLaneB(b); - verts[1] = swizzleLaneC(b); - verts[2] = swizzleLaneD(b); - break; - case 10: - verts[0] = swizzleLaneE(b); - verts[1] = swizzleLaneF(b); - verts[2] = swizzleLane0(c); - break; - case 11: - verts[0] = swizzleLane1(c); - verts[1] = swizzleLane2(c); - verts[2] = swizzleLane3(c); - break; - case 12: - verts[0] = swizzleLane4(c); - verts[1] = swizzleLane5(c); - verts[2] = swizzleLane6(c); - break; - case 13: - verts[0] = swizzleLane7(c); - verts[1] = 
swizzleLane8(c); - verts[2] = swizzleLane9(c); - break; - case 14: - verts[0] = swizzleLaneA(c); - verts[1] = swizzleLaneB(c); - verts[2] = swizzleLaneC(c); - break; - case 15: - verts[0] = swizzleLaneD(c); - verts[1] = swizzleLaneE(c); - verts[2] = swizzleLaneF(c); - break; - }; -#else - // We have 12 simdscalars contained within 3 simdvectors which - // hold at least 8 triangles worth of data. We want to assemble a single - // triangle with data in horizontal form. - - const simdvector& a = PaGetSimdVector(pa, 0, slot); - const simdvector& b = PaGetSimdVector(pa, 1, slot); - const simdvector& c = PaGetSimdVector(pa, 2, slot); - - // Convert from vertical to horizontal. - // Tri Pattern - provoking vertex is always v0 - // v0 -> 0 3 6 9 12 15 18 21 - // v1 -> 1 4 7 10 13 16 19 22 - // v2 -> 2 5 8 11 14 17 20 23 - - switch (primIndex) - { - case 0: - verts[0] = swizzleLane0(a); - verts[1] = swizzleLane1(a); - verts[2] = swizzleLane2(a); - break; - case 1: - verts[0] = swizzleLane3(a); - verts[1] = swizzleLane4(a); - verts[2] = swizzleLane5(a); - break; - case 2: - verts[0] = swizzleLane6(a); - verts[1] = swizzleLane7(a); - verts[2] = swizzleLane0(b); - break; - case 3: - verts[0] = swizzleLane1(b); - verts[1] = swizzleLane2(b); - verts[2] = swizzleLane3(b); - break; - case 4: - verts[0] = swizzleLane4(b); - verts[1] = swizzleLane5(b); - verts[2] = swizzleLane6(b); - break; - case 5: - verts[0] = swizzleLane7(b); - verts[1] = swizzleLane0(c); - verts[2] = swizzleLane1(c); - break; - case 6: - verts[0] = swizzleLane2(c); - verts[1] = swizzleLane3(c); - verts[2] = swizzleLane4(c); - break; - case 7: - verts[0] = swizzleLane5(c); - verts[1] = swizzleLane6(c); - verts[2] = swizzleLane7(c); - break; - }; -#endif -} - -bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ - SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0); - return false; // Not enough vertices to assemble 8 triangles. 
-} - -bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ -#if USE_SIMD16_FRONTEND - simdvector a; - simdvector b; - - if (!pa.useAlternateOffset) - { - const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(a_16[i], 0); - b[i] = _simd16_extract_ps(a_16[i], 1); - } - } - else - { - const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(b_16[i], 0); - b[i] = _simd16_extract_ps(b_16[i], 1); - } - } - -#else - simdvector& a = PaGetSimdVector(pa, pa.prev, slot); - simdvector& b = PaGetSimdVector(pa, pa.cur, slot); - -#endif - simdscalar s; - - for (int i = 0; i < 4; ++i) - { - simdscalar a0 = a[i]; - simdscalar b0 = b[i]; - - // Tri Pattern - provoking vertex is always v0 - // v0 -> 01234567 - // v1 -> 13355779 - // v2 -> 22446688 - simdvector& v0 = verts[0]; - v0[i] = a0; - - // s -> 4567891011 - s = _simd_permute2f128_ps(a0, b0, 0x21); - // s -> 23456789 - s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2)); - - simdvector& v1 = verts[1]; - // v1 -> 13355779 - v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1)); - - simdvector& v2 = verts[2]; - // v2 -> 22446688 - v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2)); - } - - SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); - return true; -} - -#if ENABLE_AVX512_SIMD16 -bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0); - return false; // Not enough vertices to assemble 16 triangles. 
-} - -bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - // clang-format off - - const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot); - const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot); - - const simd16mask mask0 = 0xF000; - - // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF - // v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1 - // v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0 - - simd16vector& v0 = verts[0]; - simd16vector& v1 = verts[1]; - simd16vector& v2 = verts[2]; - - // for simd16 x, y, z, and w - for (int i = 0; i < 4; i += 1) - { - simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i])); - simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i])); - - simd16scalar perm0 = _simd16_permute2f128_ps(tempa, tempa, 0x39); // (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3 - simd16scalar perm1 = _simd16_permute2f128_ps(tempb, tempb, 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3 - - simd16scalar blend = _simd16_blend_ps(perm0, perm1, mask0); // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3 - simd16scalar shuff = _simd16_shuffle_ps(tempa, blend, _MM_SHUFFLE(1, 0, 3, 2)); // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 - - v0[i] = tempa; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF - v1[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1 - v2[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(2, 2, 2, 2)); // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0 - } - - SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); - return true; - - // clang-format on -} - -#endif -void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) -{ -#if USE_SIMD16_FRONTEND - const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, 
slot); - const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot); - - if (pa.useAlternateOffset) - { - primIndex += KNOB_SIMD_WIDTH; - } - - // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF - // v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1 - // v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0 - - switch (primIndex) - { - case 0: - verts[0] = swizzleLane0(a); - verts[1] = swizzleLane1(a); - verts[2] = swizzleLane2(a); - break; - case 1: - verts[0] = swizzleLane1(a); - verts[1] = swizzleLane3(a); - verts[2] = swizzleLane2(a); - break; - case 2: - verts[0] = swizzleLane2(a); - verts[1] = swizzleLane3(a); - verts[2] = swizzleLane4(a); - break; - case 3: - verts[0] = swizzleLane3(a); - verts[1] = swizzleLane5(a); - verts[2] = swizzleLane4(a); - break; - case 4: - verts[0] = swizzleLane4(a); - verts[1] = swizzleLane5(a); - verts[2] = swizzleLane6(a); - break; - case 5: - verts[0] = swizzleLane5(a); - verts[1] = swizzleLane7(a); - verts[2] = swizzleLane6(a); - break; - case 6: - verts[0] = swizzleLane6(a); - verts[1] = swizzleLane7(a); - verts[2] = swizzleLane8(a); - break; - case 7: - verts[0] = swizzleLane7(a); - verts[1] = swizzleLane9(a); - verts[2] = swizzleLane8(a); - break; - case 8: - verts[0] = swizzleLane8(a); - verts[1] = swizzleLane9(a); - verts[2] = swizzleLaneA(a); - break; - case 9: - verts[0] = swizzleLane9(a); - verts[1] = swizzleLaneB(a); - verts[2] = swizzleLaneA(a); - break; - case 10: - verts[0] = swizzleLaneA(a); - verts[1] = swizzleLaneB(a); - verts[2] = swizzleLaneC(a); - break; - case 11: - verts[0] = swizzleLaneB(a); - verts[1] = swizzleLaneD(a); - verts[2] = swizzleLaneC(a); - break; - case 12: - verts[0] = swizzleLaneC(a); - verts[1] = swizzleLaneD(a); - verts[2] = swizzleLaneE(a); - break; - case 13: - verts[0] = swizzleLaneD(a); - verts[1] = swizzleLaneF(a); - verts[2] = swizzleLaneE(a); - break; - case 14: - verts[0] = swizzleLaneE(a); - verts[1] = swizzleLaneF(a); - verts[2] = swizzleLane0(b); - break; - case 
15: - verts[0] = swizzleLaneF(a); - verts[1] = swizzleLane1(b); - verts[2] = swizzleLane0(b); - break; - }; -#else - const simdvector& a = PaGetSimdVector(pa, pa.prev, slot); - const simdvector& b = PaGetSimdVector(pa, pa.cur, slot); - - // Convert from vertical to horizontal. - // Tri Pattern - provoking vertex is always v0 - // v0 -> 01234567 - // v1 -> 13355779 - // v2 -> 22446688 - - switch (primIndex) - { - case 0: - verts[0] = swizzleLane0(a); - verts[1] = swizzleLane1(a); - verts[2] = swizzleLane2(a); - break; - case 1: - verts[0] = swizzleLane1(a); - verts[1] = swizzleLane3(a); - verts[2] = swizzleLane2(a); - break; - case 2: - verts[0] = swizzleLane2(a); - verts[1] = swizzleLane3(a); - verts[2] = swizzleLane4(a); - break; - case 3: - verts[0] = swizzleLane3(a); - verts[1] = swizzleLane5(a); - verts[2] = swizzleLane4(a); - break; - case 4: - verts[0] = swizzleLane4(a); - verts[1] = swizzleLane5(a); - verts[2] = swizzleLane6(a); - break; - case 5: - verts[0] = swizzleLane5(a); - verts[1] = swizzleLane7(a); - verts[2] = swizzleLane6(a); - break; - case 6: - verts[0] = swizzleLane6(a); - verts[1] = swizzleLane7(a); - verts[2] = swizzleLane0(b); - break; - case 7: - verts[0] = swizzleLane7(a); - verts[1] = swizzleLane1(b); - verts[2] = swizzleLane0(b); - break; - }; -#endif -} - -bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ - SetNextPaState(pa, PaTriFan1, PaTriFanSingle0); - return false; // Not enough vertices to assemble 8 triangles. 
-} - -bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ -#if USE_SIMD16_FRONTEND - simdvector leadVert; - simdvector a; - simdvector b; - - const simd16vector& leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot); - - if (!pa.useAlternateOffset) - { - const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0); - - a[i] = _simd16_extract_ps(a_16[i], 0); - b[i] = _simd16_extract_ps(a_16[i], 1); - } - } - else - { - const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0); - - a[i] = _simd16_extract_ps(b_16[i], 0); - b[i] = _simd16_extract_ps(b_16[i], 1); - } - } - -#else - const simdvector& leadVert = PaGetSimdVector(pa, pa.first, slot); - const simdvector& a = PaGetSimdVector(pa, pa.prev, slot); - const simdvector& b = PaGetSimdVector(pa, pa.cur, slot); - -#endif - simdscalar s; - - // need to fill vectors 1/2 with new verts, and v0 with anchor vert. - for (int i = 0; i < 4; ++i) - { - simdscalar a0 = a[i]; - simdscalar b0 = b[i]; - - simdscalar comp = leadVert[i]; - - simdvector& v0 = verts[0]; - v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0)); - v0[i] = _simd_permute2f128_ps(v0[i], comp, 0x00); - - simdvector& v2 = verts[2]; - s = _simd_permute2f128_ps(a0, b0, 0x21); - v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2)); - - simdvector& v1 = verts[1]; - v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1)); - } - - SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); - return true; -} - -#if ENABLE_AVX512_SIMD16 -bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0); - return false; // Not enough vertices to assemble 16 triangles. 
-} - -bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - // clang-format off - - const simd16vector& a = PaGetSimdVector_simd16(pa, pa.first, slot); - const simd16vector& b = PaGetSimdVector_simd16(pa, pa.prev, slot); - const simd16vector& c = PaGetSimdVector_simd16(pa, pa.cur, slot); - - const simd16mask mask0 = 0xF000; - - // v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 - // v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 - // v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 - - simd16vector& v0 = verts[0]; - simd16vector& v1 = verts[1]; - simd16vector& v2 = verts[2]; - - // for simd16 x, y, z, and w - for (uint32_t i = 0; i < 4; i += 1) - { - simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i])); - simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i])); - simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float*>(&c[i])); - - simd16scalar shuff = _simd16_shuffle_ps(tempa, tempa, _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 a0 a0 a4 a4 a4 a4 a0 a0 a0 a0 a4 a4 a4 a4 - - v0[i] = _simd16_permute2f128_ps(shuff, shuff, 0x00); // a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 - - simd16scalar temp0 = _simd16_permute2f128_ps(tempb, tempb, 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3 - simd16scalar temp1 = _simd16_permute2f128_ps(tempc, tempc, 0x39); // (0 3 2 1) = 00 11 10 01 // c4 c5 c6 c7 c8 c9 cA cB cC cD cE cF c0 c1 c2 c3 - - simd16scalar blend = _simd16_blend_ps(temp0, temp1, mask0); // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 c2 c3 - - simd16scalar temp2 = _simd16_shuffle_ps(tempb, blend, _MM_SHUFFLE(1, 0, 3, 2)); // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 - - v1[i] = _simd16_shuffle_ps(tempb, temp2, _MM_SHUFFLE(2, 1, 2, 1)); // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 - v2[i] = temp2; // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 - } - - SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 
0, PA_STATE_OPT::SIMD_WIDTH); - return true; - - // clang-format on -} - -#endif -void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) -{ -#if USE_SIMD16_FRONTEND - const simd16vector& a = PaGetSimdVector_simd16(pa, pa.first, slot); - const simd16vector& b = PaGetSimdVector_simd16(pa, pa.prev, slot); - const simd16vector& c = PaGetSimdVector_simd16(pa, pa.cur, slot); - - if (pa.useAlternateOffset) - { - primIndex += KNOB_SIMD_WIDTH; - } - - // v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 - // v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 - // v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 - - // vert 0 from leading vertex - verts[0] = swizzleLane0(a); - - // vert 1 - if (primIndex < 15) - { - verts[1] = swizzleLaneN(b, primIndex + 1); - } - else - { - verts[1] = swizzleLane0(c); - } - - // vert 2 - if (primIndex < 14) - { - verts[2] = swizzleLaneN(b, primIndex + 2); - } - else - { - verts[2] = swizzleLaneN(c, primIndex - 14); - } -#else - const simdvector& a = PaGetSimdVector(pa, pa.first, slot); - const simdvector& b = PaGetSimdVector(pa, pa.prev, slot); - const simdvector& c = PaGetSimdVector(pa, pa.cur, slot); - - // vert 0 from leading vertex - verts[0] = swizzleLane0(a); - - // vert 1 - if (primIndex < 7) - { - verts[1] = swizzleLaneN(b, primIndex + 1); - } - else - { - verts[1] = swizzleLane0(c); - } - - // vert 2 - if (primIndex < 6) - { - verts[2] = swizzleLaneN(b, primIndex + 2); - } - else - { - verts[2] = swizzleLaneN(c, primIndex - 6); - } -#endif -} - -bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ - SetNextPaState(pa, PaQuadList1, PaQuadListSingle0); - return false; // Not enough vertices to assemble 8 triangles. 
-} - -bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ -#if USE_SIMD16_FRONTEND - simdvector a; - simdvector b; - - if (!pa.useAlternateOffset) - { - const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(a_16[i], 0); - b[i] = _simd16_extract_ps(a_16[i], 1); - } - } - else - { - const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(b_16[i], 0); - b[i] = _simd16_extract_ps(b_16[i], 1); - } - } - -#else - simdvector& a = PaGetSimdVector(pa, 0, slot); - simdvector& b = PaGetSimdVector(pa, 1, slot); - -#endif - simdscalar s1, s2; - - for (int i = 0; i < 4; ++i) - { - simdscalar a0 = a[i]; - simdscalar b0 = b[i]; - - s1 = _mm256_permute2f128_ps(a0, b0, 0x20); - s2 = _mm256_permute2f128_ps(a0, b0, 0x31); - - simdvector& v0 = verts[0]; - v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0)); - - simdvector& v1 = verts[1]; - v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1)); - - simdvector& v2 = verts[2]; - v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2)); - } - - SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); - return true; -} - -#if ENABLE_AVX512_SIMD16 -bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - SetNextPaState_simd16(pa, PaQuadList1_simd16, PaQuadList1, PaQuadListSingle0); - return false; // Not enough vertices to assemble 16 triangles. 
-} - -bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - // clang-format off - - const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot); - - // v0 -> a0 a0 a4 a4 a8 a8 aC aC b0 b0 b0 b0 b0 b0 bC bC - // v1 -> a1 a2 a5 a6 a9 aA aD aE b1 b2 b5 b6 b9 bA bD bE - // v2 -> a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF - - simd16vector& v0 = verts[0]; - simd16vector& v1 = verts[1]; - simd16vector& v2 = verts[2]; - - // for simd16 x, y, z, and w - for (uint32_t i = 0; i < 4; i += 1) - { - simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i])); - simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i])); - - simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88); // (2 0 2 0) = 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB - simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD); // (3 1 3 1) = 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF - - v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC - v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 1, 2, 1)); // a1 a2 a5 a6 a9 aA aD aE b1 b2 b6 b6 b9 bA bD bE - v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2)); // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF - } - - SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); - return true; - - // clang-format on -} - -#endif -void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) -{ -#if USE_SIMD16_FRONTEND - const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot); - - if (pa.useAlternateOffset) - { - primIndex += KNOB_SIMD_WIDTH; - } - - switch (primIndex) - { - case 0: - // triangle 0 - 0 1 2 - verts[0] = swizzleLane0(a); - 
verts[1] = swizzleLane1(a); - verts[2] = swizzleLane2(a); - break; - case 1: - // triangle 1 - 0 2 3 - verts[0] = swizzleLane0(a); - verts[1] = swizzleLane2(a); - verts[2] = swizzleLane3(a); - break; - case 2: - // triangle 2 - 4 5 6 - verts[0] = swizzleLane4(a); - verts[1] = swizzleLane5(a); - verts[2] = swizzleLane6(a); - break; - case 3: - // triangle 3 - 4 6 7 - verts[0] = swizzleLane4(a); - verts[1] = swizzleLane6(a); - verts[2] = swizzleLane7(a); - break; - case 4: - // triangle 4 - 8 9 A - verts[0] = swizzleLane8(a); - verts[1] = swizzleLane9(a); - verts[2] = swizzleLaneA(a); - break; - case 5: - // triangle 5 - 8 A B - verts[0] = swizzleLane8(a); - verts[1] = swizzleLaneA(a); - verts[2] = swizzleLaneB(a); - break; - case 6: - // triangle 6 - C D E - verts[0] = swizzleLaneC(a); - verts[1] = swizzleLaneD(a); - verts[2] = swizzleLaneE(a); - break; - case 7: - // triangle 7 - C E F - verts[0] = swizzleLaneC(a); - verts[1] = swizzleLaneE(a); - verts[2] = swizzleLaneF(a); - break; - case 8: - // triangle 0 - 0 1 2 - verts[0] = swizzleLane0(b); - verts[1] = swizzleLane1(b); - verts[2] = swizzleLane2(b); - break; - case 9: - // triangle 1 - 0 2 3 - verts[0] = swizzleLane0(b); - verts[1] = swizzleLane2(b); - verts[2] = swizzleLane3(b); - break; - case 10: - // triangle 2 - 4 5 6 - verts[0] = swizzleLane4(b); - verts[1] = swizzleLane5(b); - verts[2] = swizzleLane6(b); - break; - case 11: - // triangle 3 - 4 6 7 - verts[0] = swizzleLane4(b); - verts[1] = swizzleLane6(b); - verts[2] = swizzleLane7(b); - break; - case 12: - // triangle 4 - 8 9 A - verts[0] = swizzleLane8(b); - verts[1] = swizzleLane9(b); - verts[2] = swizzleLaneA(b); - break; - case 13: - // triangle 5 - 8 A B - verts[0] = swizzleLane8(b); - verts[1] = swizzleLaneA(b); - verts[2] = swizzleLaneB(b); - break; - case 14: - // triangle 6 - C D E - verts[0] = swizzleLaneC(b); - verts[1] = swizzleLaneD(b); - verts[2] = swizzleLaneE(b); - break; - case 15: - // triangle 7 - C E F - verts[0] = swizzleLaneC(b); 
- verts[1] = swizzleLaneE(b); - verts[2] = swizzleLaneF(b); - break; - } -#else - const simdvector& a = PaGetSimdVector(pa, 0, slot); - const simdvector& b = PaGetSimdVector(pa, 1, slot); - - switch (primIndex) - { - case 0: - // triangle 0 - 0 1 2 - verts[0] = swizzleLane0(a); - verts[1] = swizzleLane1(a); - verts[2] = swizzleLane2(a); - break; - case 1: - // triangle 1 - 0 2 3 - verts[0] = swizzleLane0(a); - verts[1] = swizzleLane2(a); - verts[2] = swizzleLane3(a); - break; - case 2: - // triangle 2 - 4 5 6 - verts[0] = swizzleLane4(a); - verts[1] = swizzleLane5(a); - verts[2] = swizzleLane6(a); - break; - case 3: - // triangle 3 - 4 6 7 - verts[0] = swizzleLane4(a); - verts[1] = swizzleLane6(a); - verts[2] = swizzleLane7(a); - break; - case 4: - // triangle 4 - 8 9 10 (0 1 2) - verts[0] = swizzleLane0(b); - verts[1] = swizzleLane1(b); - verts[2] = swizzleLane2(b); - break; - case 5: - // triangle 1 - 0 2 3 - verts[0] = swizzleLane0(b); - verts[1] = swizzleLane2(b); - verts[2] = swizzleLane3(b); - break; - case 6: - // triangle 2 - 4 5 6 - verts[0] = swizzleLane4(b); - verts[1] = swizzleLane5(b); - verts[2] = swizzleLane6(b); - break; - case 7: - // triangle 3 - 4 6 7 - verts[0] = swizzleLane4(b); - verts[1] = swizzleLane6(b); - verts[2] = swizzleLane7(b); - break; - } -#endif -} - -bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ - SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0); - return false; -} - -bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ - PaLineStrip1(pa, slot, verts); - - if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1) - { - // loop reconnect now - const int lane = pa.numPrims - pa.numPrimsComplete - 1; - -#if USE_SIMD16_FRONTEND - simdvector first; - - const simd16vector& first_16 = PaGetSimdVector_simd16(pa, pa.first, slot); - - if (!pa.useAlternateOffset) - { - for (uint32_t i = 0; i < 4; i += 1) - { - first[i] = _simd16_extract_ps(first_16[i], 0); - } - } - else - { - for (uint32_t 
i = 0; i < 4; i += 1) - { - first[i] = _simd16_extract_ps(first_16[i], 1); - } - } - -#else - simdvector& first = PaGetSimdVector(pa, pa.first, slot); - -#endif - for (int i = 0; i < 4; i++) - { - float* firstVtx = (float*)&(first[i]); - float* targetVtx = (float*)&(verts[1][i]); - targetVtx[lane] = firstVtx[0]; - } - } - - SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); - return true; -} - -#if ENABLE_AVX512_SIMD16 -bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0); - return false; -} - -bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - PaLineStrip1_simd16(pa, slot, verts); - - if (pa.numPrimsComplete + KNOB_SIMD16_WIDTH > pa.numPrims - 1) - { - // loop reconnect now - const int lane = pa.numPrims - pa.numPrimsComplete - 1; - - const simd16vector& first = PaGetSimdVector_simd16(pa, pa.first, slot); - - for (int i = 0; i < 4; i++) - { - float* firstVtx = (float*)&(first[i]); - float* targetVtx = (float*)&(verts[1][i]); - targetVtx[lane] = firstVtx[0]; - } - } - - SetNextPaState_simd16( - pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); - return true; -} - -#endif -void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) -{ - PaLineStripSingle0(pa, slot, primIndex, verts); - - if (pa.numPrimsComplete + primIndex == pa.numPrims - 1) - { -#if USE_SIMD16_FRONTEND - const simd16vector& first = PaGetSimdVector_simd16(pa, pa.first, slot); - - verts[1] = swizzleLane0(first); -#else - const simdvector& first = PaGetSimdVector(pa, pa.first, slot); - - verts[1] = swizzleLane0(first); -#endif - } -} - -bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ - SetNextPaState(pa, PaLineList1, PaLineListSingle0); - return false; // Not enough vertices to assemble 8 lines -} - -bool PaLineList1(PA_STATE_OPT& pa, 
uint32_t slot, simdvector verts[]) -{ -#if USE_SIMD16_FRONTEND - simdvector a; - simdvector b; - - if (!pa.useAlternateOffset) - { - const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(a_16[i], 0); - b[i] = _simd16_extract_ps(a_16[i], 1); - } - } - else - { - const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(b_16[i], 0); - b[i] = _simd16_extract_ps(b_16[i], 1); - } - } - -#else - simdvector& a = PaGetSimdVector(pa, 0, slot); - simdvector& b = PaGetSimdVector(pa, 1, slot); - -#endif - /// @todo: verify provoking vertex is correct - // Line list 0 1 2 3 4 5 6 7 - // 8 9 10 11 12 13 14 15 - - // shuffle: - // 0 2 4 6 8 10 12 14 - // 1 3 5 7 9 11 13 15 - - for (uint32_t i = 0; i < 4; ++i) - { - // 0 1 2 3 8 9 10 11 - __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20); - // 4 5 6 7 12 13 14 15 - __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31); - - // 0 2 4 6 8 10 12 14 - verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0)); - // 1 3 5 7 9 11 13 15 - verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1)); - } - - SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); - return true; -} - -#if ENABLE_AVX512_SIMD16 -bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - SetNextPaState_simd16(pa, PaLineList1_simd16, PaLineList1, PaLineListSingle0); - return false; // Not enough vertices to assemble 16 lines -} - -bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - // clang-format off - - const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot); - - // v0 -> a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE - // v1 -> a1 a3 a5 a7 a9 aB aD aF b1 b3 b4 b7 b9 bB bD bF - 
- simd16vector& v0 = verts[0]; - simd16vector& v1 = verts[1]; - - // for simd16 x, y, z, and w - for (int i = 0; i < 4; i += 1) - { - simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i])); - simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i])); - - simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88); // (2 0 2 0) 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b9 b9 bA bB - simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD); // (3 1 3 1) 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF - - v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE - v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF - } - - SetNextPaState_simd16(pa, PaLineList0_simd16, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); - return true; - - // clang-format on -} - -#endif -void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) -{ -#if USE_SIMD16_FRONTEND - const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); - const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot); - - if (pa.useAlternateOffset) - { - primIndex += KNOB_SIMD_WIDTH; - } - - switch (primIndex) - { - case 0: - verts[0] = swizzleLane0(a); - verts[1] = swizzleLane1(a); - break; - case 1: - verts[0] = swizzleLane2(a); - verts[1] = swizzleLane3(a); - break; - case 2: - verts[0] = swizzleLane4(a); - verts[1] = swizzleLane5(a); - break; - case 3: - verts[0] = swizzleLane6(a); - verts[1] = swizzleLane7(a); - break; - case 4: - verts[0] = swizzleLane8(a); - verts[1] = swizzleLane9(a); - break; - case 5: - verts[0] = swizzleLaneA(a); - verts[1] = swizzleLaneB(a); - break; - case 6: - verts[0] = swizzleLaneC(a); - verts[1] = swizzleLaneD(a); - break; - case 7: - verts[0] = swizzleLaneE(a); - verts[1] = swizzleLaneF(a); - break; - case 8: - verts[0] 
= swizzleLane0(b); - verts[1] = swizzleLane1(b); - break; - case 9: - verts[0] = swizzleLane2(b); - verts[1] = swizzleLane3(b); - break; - case 10: - verts[0] = swizzleLane4(b); - verts[1] = swizzleLane5(b); - break; - case 11: - verts[0] = swizzleLane6(b); - verts[1] = swizzleLane7(b); - break; - case 12: - verts[0] = swizzleLane8(b); - verts[1] = swizzleLane9(b); - break; - case 13: - verts[0] = swizzleLaneA(b); - verts[1] = swizzleLaneB(b); - break; - case 14: - verts[0] = swizzleLaneC(b); - verts[1] = swizzleLaneD(b); - break; - case 15: - verts[0] = swizzleLaneE(b); - verts[1] = swizzleLaneF(b); - break; - } -#else - const simdvector& a = PaGetSimdVector(pa, 0, slot); - const simdvector& b = PaGetSimdVector(pa, 1, slot); - - switch (primIndex) - { - case 0: - verts[0] = swizzleLane0(a); - verts[1] = swizzleLane1(a); - break; - case 1: - verts[0] = swizzleLane2(a); - verts[1] = swizzleLane3(a); - break; - case 2: - verts[0] = swizzleLane4(a); - verts[1] = swizzleLane5(a); - break; - case 3: - verts[0] = swizzleLane6(a); - verts[1] = swizzleLane7(a); - break; - case 4: - verts[0] = swizzleLane0(b); - verts[1] = swizzleLane1(b); - break; - case 5: - verts[0] = swizzleLane2(b); - verts[1] = swizzleLane3(b); - break; - case 6: - verts[0] = swizzleLane4(b); - verts[1] = swizzleLane5(b); - break; - case 7: - verts[0] = swizzleLane6(b); - verts[1] = swizzleLane7(b); - break; - } -#endif -} - -bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ - SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0); - return false; // Not enough vertices to assemble 8 lines -} - -bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ -#if USE_SIMD16_FRONTEND - simdvector a; - simdvector b; - - if (!pa.useAlternateOffset) - { - const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(a_16[i], 0); - b[i] = _simd16_extract_ps(a_16[i], 1); - } - } - else - { - const 
simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(b_16[i], 0); - b[i] = _simd16_extract_ps(b_16[i], 1); - } - } - -#else - simdvector& a = PaGetSimdVector(pa, pa.prev, slot); - simdvector& b = PaGetSimdVector(pa, pa.cur, slot); - -#endif - /// @todo: verify provoking vertex is correct - // Line list 0 1 2 3 4 5 6 7 - // 8 9 10 11 12 13 14 15 - - // shuffle: - // 0 1 2 3 4 5 6 7 - // 1 2 3 4 5 6 7 8 - - verts[0] = a; - - for (uint32_t i = 0; i < 4; ++i) - { - // 1 2 3 x 5 6 7 x - __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1) - // 4 5 6 7 8 9 10 11 - __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21); - - // x x x 4 x x x 8 - __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low (0 0 0 0) - - verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88); - } - - SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); - return true; -} - -#if ENABLE_AVX512_SIMD16 -bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0); - return false; // Not enough vertices to assemble 16 lines -} - -bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - // clang-format off - - const simd16scalari perm = _simd16_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - - const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot); - const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot); - - const simd16mask mask0 = 0x0001; - - // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF - // v1 -> a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 - - simd16vector& v0 = verts[0]; - simd16vector& v1 = verts[1]; - - v0 = a; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF - - // for simd16 x, y, z, and w - for (int i = 0; i < 4; i += 1) - { - 
simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i])); - simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i])); - - simd16scalar temp = _simd16_blend_ps(tempa, tempb, mask0); // b0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF - - v1[i] = _simd16_permute_ps(temp, perm); // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 - } - - SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); - return true; - - // clang-format on -} - -#endif -void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) -{ -#if USE_SIMD16_FRONTEND - const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot); - const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot); - - if (pa.useAlternateOffset) - { - primIndex += KNOB_SIMD_WIDTH; - } - - switch (primIndex) - { - case 0: - verts[0] = swizzleLane0(a); - verts[1] = swizzleLane1(a); - break; - case 1: - verts[0] = swizzleLane1(a); - verts[1] = swizzleLane2(a); - break; - case 2: - verts[0] = swizzleLane2(a); - verts[1] = swizzleLane3(a); - break; - case 3: - verts[0] = swizzleLane3(a); - verts[1] = swizzleLane4(a); - break; - case 4: - verts[0] = swizzleLane4(a); - verts[1] = swizzleLane5(a); - break; - case 5: - verts[0] = swizzleLane5(a); - verts[1] = swizzleLane6(a); - break; - case 6: - verts[0] = swizzleLane6(a); - verts[1] = swizzleLane7(a); - break; - case 7: - verts[0] = swizzleLane7(a); - verts[1] = swizzleLane8(a); - break; - case 8: - verts[0] = swizzleLane8(a); - verts[1] = swizzleLane9(a); - break; - case 9: - verts[0] = swizzleLane9(a); - verts[1] = swizzleLaneA(a); - break; - case 10: - verts[0] = swizzleLaneA(a); - verts[1] = swizzleLaneB(a); - break; - case 11: - verts[0] = swizzleLaneB(a); - verts[1] = swizzleLaneC(a); - break; - case 12: - verts[0] = swizzleLaneC(a); - verts[1] = swizzleLaneD(a); - break; - case 13: - verts[0] = swizzleLaneD(a); - verts[1] = swizzleLaneE(a); 
- break; - case 14: - verts[0] = swizzleLaneE(a); - verts[1] = swizzleLaneF(a); - break; - case 15: - verts[0] = swizzleLaneF(a); - verts[1] = swizzleLane0(b); - break; - } -#else - const simdvector& a = PaGetSimdVector(pa, pa.prev, slot); - const simdvector& b = PaGetSimdVector(pa, pa.cur, slot); - - switch (primIndex) - { - case 0: - verts[0] = swizzleLane0(a); - verts[1] = swizzleLane1(a); - break; - case 1: - verts[0] = swizzleLane1(a); - verts[1] = swizzleLane2(a); - break; - case 2: - verts[0] = swizzleLane2(a); - verts[1] = swizzleLane3(a); - break; - case 3: - verts[0] = swizzleLane3(a); - verts[1] = swizzleLane4(a); - break; - case 4: - verts[0] = swizzleLane4(a); - verts[1] = swizzleLane5(a); - break; - case 5: - verts[0] = swizzleLane5(a); - verts[1] = swizzleLane6(a); - break; - case 6: - verts[0] = swizzleLane6(a); - verts[1] = swizzleLane7(a); - break; - case 7: - verts[0] = swizzleLane7(a); - verts[1] = swizzleLane0(b); - break; - } -#endif -} - -bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ -#if USE_SIMD16_FRONTEND - simdvector a; - - const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); - - if (!pa.useAlternateOffset) - { - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(a_16[i], 0); - } - } - else - { - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(a_16[i], 1); - } - } - -#else - simdvector& a = PaGetSimdVector(pa, 0, slot); - -#endif - verts[0] = a; // points only have 1 vertex. - - SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); - return true; -} - -#if ENABLE_AVX512_SIMD16 -bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - simd16vector& a = PaGetSimdVector_simd16(pa, pa.cur, slot); - - verts[0] = a; // points only have 1 vertex. 
- - SetNextPaState_simd16( - pa, PaPoints0_simd16, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); - return true; -} - -#endif -void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) -{ -#if USE_SIMD16_FRONTEND - const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot); - - if (pa.useAlternateOffset) - { - primIndex += KNOB_SIMD_WIDTH; - } - - verts[0] = swizzleLaneN(a, primIndex); -#else - const simdvector& a = PaGetSimdVector(pa, 0, slot); - - verts[0] = swizzleLaneN(a, primIndex); -#endif -} - -////////////////////////////////////////////////////////////////////////// -/// @brief State 1 for RECT_LIST topology. -/// There is not enough to assemble 8 triangles. -bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ - SetNextPaState(pa, PaRectList1, PaRectListSingle0); - return false; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief State 1 for RECT_LIST topology. -/// Rect lists has the following format. -/// w x y z -/// v2 o---o v5 o---o v8 o---o v11 o---o -/// | \ | | \ | | \ | | \ | -/// v1 o---o v4 o---o v7 o---o v10 o---o -/// v0 v3 v6 v9 -/// -/// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied. -/// -/// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2 -/// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5 -/// etc. -/// -/// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2 -/// where v0 contains all the first vertices for 8 triangles. -/// -/// Result: -/// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 } -/// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 } -/// verts[2] = { v2, w, v5, x, v8, y, v11, z } -/// -/// @param pa - State for PA state machine. -/// @param slot - Index into VS output which is either a position (slot 0) or attribute. -/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, -/// etc. 
-bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -{ -// SIMD vectors a and b are the last two vertical outputs from the vertex shader. -#if USE_SIMD16_FRONTEND - simdvector a; - simdvector b; - - if (!pa.useAlternateOffset) - { - const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(a_16[i], 0); - b[i] = _simd16_extract_ps(a_16[i], 1); - } - } - else - { - const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(b_16[i], 0); - b[i] = _simd16_extract_ps(b_16[i], 1); - ; - } - } - -#else - simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 } - simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 } - -#endif - __m256 tmp0, tmp1, tmp2; - - // Loop over each component in the simdvector. - for (int i = 0; i < 4; ++i) - { - simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } - tmp0 = _mm256_permute2f128_ps( - b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } - v0[i] = _mm256_blend_ps( - a[i], - tmp0, - 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. - tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } - v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } - v0[i] = - _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } - - /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'. - /// AVX2 should make this much cheaper. 
- simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } - v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } - tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } - tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } - tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, *, *, *, * } - v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } - v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } - v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } - - // verts[2] = { v2, w, v5, x, v8, y, v11, z } - simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } - v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } - tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } - v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0); - - // Need to compute 4th implied vertex for the rectangle. - tmp2 = _mm256_sub_ps(v0[i], v1[i]); - tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * } - tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } - v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } - } - - SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); - return true; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief State 2 for RECT_LIST topology. -/// Not implemented unless there is a use case for more then 8 rects. -/// @param pa - State for PA state machine. -/// @param slot - Index into VS output which is either a position (slot 0) or attribute. -/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, -/// etc. 
bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
{
    // This state is not implemented: flag it and rewind the state machine to
    // PaRectList0.
    SWR_INVALID("Is rect list used for anything other then clears?");
    SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
    return true;
}

#if ENABLE_AVX512_SIMD16
//////////////////////////////////////////////////////////////////////////
/// @brief State 0 for RECT_LIST topology (simd16 variant).
///        There is not enough to assemble 8 triangles yet, so this only
///        advances the state machine to PaRectList1_simd16 and reports that
///        no primitives were produced (returns false).
/// @param pa - State for PA state machine.
/// @param slot - Unused in this state.
/// @param verts - Unused in this state.
bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
    SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0);
    return false;
}

//////////////////////////////////////////////////////////////////////////
/// @brief State 1 for RECT_LIST topology.
///   Rect lists have the following format.
///             w          x          y           z
///      v2 o---o   v5 o---o   v8 o---o   v11 o---o
///         | \ |      | \ |      | \ |       | \ |
///      v1 o---o   v4 o---o   v7 o---o   v10 o---o
///            v0         v3         v6          v9
///
///   Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
///
///   tri0 = { v0, v1, v2 }  tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
///   tri2 = { v3, v4, v5 }  tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
///   etc.
///
///   PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
///   where v0 contains all the first vertices for 8 triangles.
///
///     Result:
///      verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
///      verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
///      verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
///
/// @param pa - State for PA state machine.
/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
/// etc.
-bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - // clang-format off - - simdvector a; - simdvector b; - - if (!pa.useAlternateOffset) - { - const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7, - // v8, v9, v10, v11, v12, v13, v14, v15 } - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(a_16[i], 0); - b[i] = _simd16_extract_ps(a_16[i], 1); - } - } - else - { - const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); // b[] = { v16...but not used by this implementation.. } - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(b_16[i], 0); - b[i] = _simd16_extract_ps(b_16[i], 1); - } - } - - simd16vector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } - simd16vector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } - simd16vector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } - - // Loop over each component in the simdvector. - for (int i = 0; i < 4; i += 1) - { - simdscalar v0_lo; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } - simdscalar v1_lo; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } - simdscalar v2_lo; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } - - __m256 tmp0, tmp1, tmp2; - - tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } - v0_lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. - tmp1 = _mm256_permute_ps(v0_lo, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } - v0_lo = _mm256_permute_ps(v0_lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } - v0_lo = _mm256_blend_ps(tmp1, v0_lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } - - /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'. - /// AVX2 should make this much cheaper. 
- v1_lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } - tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } - tmp2 = _mm256_blend_ps(v1_lo, tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } - tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, *, *, *, * } - v1_lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } - v1_lo = _mm256_blend_ps(tmp2, v1_lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } - v1_lo = _mm256_blend_ps(v1_lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } - - // verts[2] = { v2, w, v5, x, v8, y, v11, z } - v2_lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } - tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } - v2_lo = _mm256_blend_ps(tmp1, v2_lo, 0xF0); - - // Need to compute 4th implied vertex for the rectangle. - tmp2 = _mm256_sub_ps(v0_lo, v1_lo); - tmp2 = _mm256_add_ps(tmp2, v2_lo); // tmp2 = { w, *, x, *, y, *, z, * } - tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } - v2_lo = _mm256_blend_ps(v2_lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } - - v0[i] = _simd16_insert_ps(_simd16_setzero_ps(), v0_lo, 0); - v1[i] = _simd16_insert_ps(_simd16_setzero_ps(), v1_lo, 0); - v2[i] = _simd16_insert_ps(_simd16_setzero_ps(), v2_lo, 0); - } - - SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); - return true; - - // clang-format on -} - -////////////////////////////////////////////////////////////////////////// -/// @brief State 2 for RECT_LIST topology. -/// Not implemented unless there is a use case for more then 8 rects. -/// @param pa - State for PA state machine. -/// @param slot - Index into VS output which is either a position (slot 0) or attribute. -/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, -/// etc. 
-bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) -{ - SWR_INVALID("Is rect list used for anything other then clears?"); - SetNextPaState_simd16( - pa, PaRectList0_simd16, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); - return true; -} - -#endif -////////////////////////////////////////////////////////////////////////// -/// @brief This procedure is called by the Binner to assemble the attributes. -/// Unlike position, which is stored vertically, the attributes are -/// stored horizontally. The outputs from the VS, labeled as 'a' and -/// 'b' are vertical. This function needs to transpose the lanes -/// containing the vertical attribute data into horizontal form. -/// @param pa - State for PA state machine. -/// @param slot - Index into VS output for a given attribute. -/// @param primIndex - Binner processes each triangle individually. -/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, -/// etc. -void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]) -{ -// We have 12 simdscalars contained within 3 simdvectors which -// hold at least 8 triangles worth of data. We want to assemble a single -// triangle with data in horizontal form. -#if USE_SIMD16_FRONTEND - simdvector a; - simdvector b; - - if (!pa.useAlternateOffset) - { - const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(a_16[i], 0); - b[i] = _simd16_extract_ps(a_16[i], 1); - } - } - else - { - const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i] = _simd16_extract_ps(b_16[i], 0); - b[i] = _simd16_extract_ps(b_16[i], 1); - ; - } - } - -#else - simdvector& a = PaGetSimdVector(pa, 0, slot); - -#endif - // Convert from vertical to horizontal. 
- switch (primIndex) - { - case 0: - verts[0] = swizzleLane0(a); - verts[1] = swizzleLane1(a); - verts[2] = swizzleLane2(a); - break; - case 1: - verts[0] = swizzleLane0(a); - verts[1] = swizzleLane2(a); - verts[2] = _mm_blend_ps(verts[0], verts[1], 0xA); - break; - case 2: - case 3: - case 4: - case 5: - case 6: - case 7: - SWR_INVALID("Invalid primIndex: %d", primIndex); - break; - }; -} - -PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT* in_pDC, - uint32_t in_numPrims, - uint8_t* pStream, - uint32_t in_streamSizeInVerts, - uint32_t in_vertexStride, - bool in_isStreaming, - uint32_t numVertsPerPrim, - PRIMITIVE_TOPOLOGY topo) : - PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride, numVertsPerPrim), - numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), cur(0), prev(0), first(0), - counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming) -{ - const API_STATE& state = GetApiState(pDC); - - this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo; - -#if ENABLE_AVX512_SIMD16 - pfnPaFunc_simd16 = nullptr; - -#endif - switch (this->binTopology) - { - case TOP_TRIANGLE_LIST: - this->pfnPaFunc = PaTriList0; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaTriList0_simd16; -#endif - break; - case TOP_TRIANGLE_STRIP: - this->pfnPaFunc = PaTriStrip0; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaTriStrip0_simd16; -#endif - break; - case TOP_TRIANGLE_FAN: - this->pfnPaFunc = PaTriFan0; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaTriFan0_simd16; -#endif - break; - case TOP_QUAD_LIST: - this->pfnPaFunc = PaQuadList0; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaQuadList0_simd16; -#endif - this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles - break; - case TOP_QUAD_STRIP: - // quad strip pattern when decomposed into triangles is the same as verts strips - this->pfnPaFunc = PaTriStrip0; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaTriStrip0_simd16; -#endif - this->numPrims 
= in_numPrims * 2; // Convert quad primitives into triangles - break; - case TOP_LINE_LIST: - this->pfnPaFunc = PaLineList0; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaLineList0_simd16; -#endif - this->numPrims = in_numPrims; - break; - case TOP_LINE_STRIP: - this->pfnPaFunc = PaLineStrip0; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaLineStrip0_simd16; -#endif - this->numPrims = in_numPrims; - break; - case TOP_LINE_LOOP: - this->pfnPaFunc = PaLineLoop0; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaLineLoop0_simd16; -#endif - this->numPrims = in_numPrims; - break; - case TOP_POINT_LIST: - this->pfnPaFunc = PaPoints0; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPoints0_simd16; -#endif - this->numPrims = in_numPrims; - break; - case TOP_RECT_LIST: - this->pfnPaFunc = PaRectList0; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaRectList0_simd16; -#endif - this->numPrims = in_numPrims * 2; - break; - - case TOP_PATCHLIST_1: - this->pfnPaFunc = PaPatchList<1>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<1>; -#endif - break; - case TOP_PATCHLIST_2: - this->pfnPaFunc = PaPatchList<2>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<2>; -#endif - break; - case TOP_PATCHLIST_3: - this->pfnPaFunc = PaPatchList<3>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<3>; -#endif - break; - case TOP_PATCHLIST_4: - this->pfnPaFunc = PaPatchList<4>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<4>; -#endif - break; - case TOP_PATCHLIST_5: - this->pfnPaFunc = PaPatchList<5>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<5>; -#endif - break; - case TOP_PATCHLIST_6: - this->pfnPaFunc = PaPatchList<6>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<6>; -#endif - break; - case TOP_PATCHLIST_7: - this->pfnPaFunc = PaPatchList<7>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = 
PaPatchList_simd16<7>; -#endif - break; - case TOP_PATCHLIST_8: - this->pfnPaFunc = PaPatchList<8>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<8>; -#endif - break; - case TOP_PATCHLIST_9: - this->pfnPaFunc = PaPatchList<9>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<9>; -#endif - break; - case TOP_PATCHLIST_10: - this->pfnPaFunc = PaPatchList<10>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<10>; -#endif - break; - case TOP_PATCHLIST_11: - this->pfnPaFunc = PaPatchList<11>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<11>; -#endif - break; - case TOP_PATCHLIST_12: - this->pfnPaFunc = PaPatchList<12>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<12>; -#endif - break; - case TOP_PATCHLIST_13: - this->pfnPaFunc = PaPatchList<13>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<13>; -#endif - break; - case TOP_PATCHLIST_14: - this->pfnPaFunc = PaPatchList<14>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<14>; -#endif - break; - case TOP_PATCHLIST_15: - this->pfnPaFunc = PaPatchList<15>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<15>; -#endif - break; - case TOP_PATCHLIST_16: - this->pfnPaFunc = PaPatchList<16>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<16>; -#endif - break; - case TOP_PATCHLIST_17: - this->pfnPaFunc = PaPatchList<17>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<17>; -#endif - break; - case TOP_PATCHLIST_18: - this->pfnPaFunc = PaPatchList<18>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<18>; -#endif - break; - case TOP_PATCHLIST_19: - this->pfnPaFunc = PaPatchList<19>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<19>; -#endif - break; - case TOP_PATCHLIST_20: - this->pfnPaFunc = PaPatchList<20>; -#if ENABLE_AVX512_SIMD16 - 
this->pfnPaFunc_simd16 = PaPatchList_simd16<20>; -#endif - break; - case TOP_PATCHLIST_21: - this->pfnPaFunc = PaPatchList<21>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<21>; -#endif - break; - case TOP_PATCHLIST_22: - this->pfnPaFunc = PaPatchList<22>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<22>; -#endif - break; - case TOP_PATCHLIST_23: - this->pfnPaFunc = PaPatchList<23>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<23>; -#endif - break; - case TOP_PATCHLIST_24: - this->pfnPaFunc = PaPatchList<24>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<24>; -#endif - break; - case TOP_PATCHLIST_25: - this->pfnPaFunc = PaPatchList<25>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<25>; -#endif - break; - case TOP_PATCHLIST_26: - this->pfnPaFunc = PaPatchList<26>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<26>; -#endif - break; - case TOP_PATCHLIST_27: - this->pfnPaFunc = PaPatchList<27>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<27>; -#endif - break; - case TOP_PATCHLIST_28: - this->pfnPaFunc = PaPatchList<28>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<28>; -#endif - break; - case TOP_PATCHLIST_29: - this->pfnPaFunc = PaPatchList<29>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<29>; -#endif - break; - case TOP_PATCHLIST_30: - this->pfnPaFunc = PaPatchList<30>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<30>; -#endif - break; - case TOP_PATCHLIST_31: - this->pfnPaFunc = PaPatchList<31>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<31>; -#endif - break; - case TOP_PATCHLIST_32: - this->pfnPaFunc = PaPatchList<32>; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFunc_simd16 = PaPatchList_simd16<32>; -#endif - break; - - default: - SWR_INVALID("Invalid topology: %d", 
this->binTopology); - break; - }; - - this->pfnPaFuncReset = this->pfnPaFunc; -#if ENABLE_AVX512_SIMD16 - this->pfnPaFuncReset_simd16 = this->pfnPaFunc_simd16; -#endif - -#if USE_SIMD16_FRONTEND - simd16scalari id16 = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - simd16scalari id82 = _simd16_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); - -#else - simdscalari id8 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - simdscalari id4 = _simd_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); - -#endif - switch (this->binTopology) - { - case TOP_TRIANGLE_LIST: - case TOP_TRIANGLE_STRIP: - case TOP_TRIANGLE_FAN: - case TOP_LINE_STRIP: - case TOP_LINE_LIST: - case TOP_LINE_LOOP: -#if USE_SIMD16_FRONTEND - this->primIDIncr = 16; - this->primID = id16; -#else - this->primIDIncr = 8; - this->primID = id8; -#endif - break; - case TOP_QUAD_LIST: - case TOP_QUAD_STRIP: - case TOP_RECT_LIST: -#if USE_SIMD16_FRONTEND - this->primIDIncr = 8; - this->primID = id82; -#else - this->primIDIncr = 4; - this->primID = id4; -#endif - break; - case TOP_POINT_LIST: -#if USE_SIMD16_FRONTEND - this->primIDIncr = 16; - this->primID = id16; -#else - this->primIDIncr = 8; - this->primID = id8; -#endif - break; - case TOP_PATCHLIST_1: - case TOP_PATCHLIST_2: - case TOP_PATCHLIST_3: - case TOP_PATCHLIST_4: - case TOP_PATCHLIST_5: - case TOP_PATCHLIST_6: - case TOP_PATCHLIST_7: - case TOP_PATCHLIST_8: - case TOP_PATCHLIST_9: - case TOP_PATCHLIST_10: - case TOP_PATCHLIST_11: - case TOP_PATCHLIST_12: - case TOP_PATCHLIST_13: - case TOP_PATCHLIST_14: - case TOP_PATCHLIST_15: - case TOP_PATCHLIST_16: - case TOP_PATCHLIST_17: - case TOP_PATCHLIST_18: - case TOP_PATCHLIST_19: - case TOP_PATCHLIST_20: - case TOP_PATCHLIST_21: - case TOP_PATCHLIST_22: - case TOP_PATCHLIST_23: - case TOP_PATCHLIST_24: - case TOP_PATCHLIST_25: - case TOP_PATCHLIST_26: - case TOP_PATCHLIST_27: - case TOP_PATCHLIST_28: - case TOP_PATCHLIST_29: - case TOP_PATCHLIST_30: - case TOP_PATCHLIST_31: - case 
TOP_PATCHLIST_32: - // Always run KNOB_SIMD_WIDTH number of patches at a time. -#if USE_SIMD16_FRONTEND - this->primIDIncr = 16; - this->primID = id16; -#else - this->primIDIncr = 8; - this->primID = id8; -#endif - break; - - default: - SWR_INVALID("Invalid topology: %d", this->binTopology); - break; - }; -} -#endif diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp deleted file mode 100644 index c14cd56e52e..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ /dev/null @@ -1,473 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file rasterizer.cpp - * - * @brief Implementation for the rasterizer. 
- * - ******************************************************************************/ - -#include <vector> -#include <algorithm> - -#include "rasterizer.h" -#include "backends/gen_rasterizer.hpp" -#include "rdtsc_core.h" -#include "backend.h" -#include "utils.h" -#include "frontend.h" -#include "tilemgr.h" -#include "memory/tilingtraits.h" -#include "rasterizer_impl.h" - -PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT] - [STATE_VALID_TRI_EDGE_COUNT][2]; - -void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData) -{ - const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pData); -#if KNOB_ENABLE_TOSS_POINTS - if (KNOB_TOSS_BIN_TRIS) - { - return; - } -#endif - - // bloat line to two tris and call the triangle rasterizer twice - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeLine, pDC->drawId); - - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - - // macrotile dimensioning - uint32_t macroX, macroY; - MacroTileMgr::getTileIndices(macroTile, macroX, macroY); - int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; - int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; - int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; - int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; - - const SWR_RECT& scissorInFixedPoint = - state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; - - // create a copy of the triangle buffer to write our adjusted vertices to - OSALIGNSIMD(float) newTriBuffer[4 * 4]; - TRIANGLE_WORK_DESC newWorkDesc = workDesc; - newWorkDesc.pTriBuffer = &newTriBuffer[0]; - - // create a copy of the attrib buffer to write our adjusted attribs to - OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS]; - newWorkDesc.pAttribs = &newAttribBuffer[0]; - - const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f); - const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, 
-0.5f); - - __m128 vX, vY, vZ, vRecipW; - - vX = _mm_load_ps(workDesc.pTriBuffer); - vY = _mm_load_ps(workDesc.pTriBuffer + 4); - vZ = _mm_load_ps(workDesc.pTriBuffer + 8); - vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); - - // triangle 0 - // v0,v1 -> v0,v0,v1 - __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0)); - __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0)); - __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0)); - __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0)); - - __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth); - __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0); - if (workDesc.triFlags.yMajor) - { - vXa = _mm_add_ps(vAdjust, vXa); - } - else - { - vYa = _mm_add_ps(vAdjust, vYa); - } - - // Store triangle description for rasterizer - _mm_store_ps((float*)&newTriBuffer[0], vXa); - _mm_store_ps((float*)&newTriBuffer[4], vYa); - _mm_store_ps((float*)&newTriBuffer[8], vZa); - _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); - - // binner bins 3 edges for lines as v0, v1, v1 - // tri0 needs v0, v0, v1 - for (uint32_t a = 0; a < workDesc.numAttribs; ++a) - { - __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]); - __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]); - - _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0); - _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0); - _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1); - } - - // Store user clip distances for triangle 0 - float newClipBuffer[3 * 8]; - uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask); - if (numClipDist) - { - newWorkDesc.pUserClipBuffer = newClipBuffer; - - float* pOldBuffer = workDesc.pUserClipBuffer; - float* pNewBuffer = newClipBuffer; - for (uint32_t i = 0; i < numClipDist; ++i) - { - // read barycentric coeffs from binner - float a = *(pOldBuffer++); - float b = *(pOldBuffer++); - - // reconstruct original clip distance at 
vertices - float c0 = a + b; - float c1 = b; - - // construct triangle barycentrics - *(pNewBuffer++) = c0 - c1; - *(pNewBuffer++) = c0 - c1; - *(pNewBuffer++) = c1; - } - } - - // setup triangle rasterizer function - PFN_WORK_FUNC pfnTriRast; - // conservative rast not supported for points/lines - pfnTriRast = GetRasterizerFunc(rastState.sampleCount, - rastState.bIsCenterPattern, - false, - SWR_INPUT_COVERAGE_NONE, - EdgeValToEdgeState(ALL_EDGES_VALID), - (pDC->pState->state.scissorsTileAligned == false)); - - // make sure this macrotile intersects the triangle - __m128i vXai = fpToFixedPoint(vXa); - __m128i vYai = fpToFixedPoint(vYa); - OSALIGNSIMD(SWR_RECT) bboxA; - calcBoundingBoxInt(vXai, vYai, bboxA); - - if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax || - bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin || - bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax || - bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin)) - { - // rasterize triangle - pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); - } - - // triangle 1 - // v0,v1 -> v1,v1,v0 - vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1)); - vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1)); - vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1)); - vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1)); - - vAdjust = _mm_mul_ps(vLineWidth, vBloat1); - if (workDesc.triFlags.yMajor) - { - vXa = _mm_add_ps(vAdjust, vXa); - } - else - { - vYa = _mm_add_ps(vAdjust, vYa); - } - - // Store triangle description for rasterizer - _mm_store_ps((float*)&newTriBuffer[0], vXa); - _mm_store_ps((float*)&newTriBuffer[4], vYa); - _mm_store_ps((float*)&newTriBuffer[8], vZa); - _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); - - // binner bins 3 edges for lines as v0, v1, v1 - // tri1 needs v1, v1, v0 - for (uint32_t a = 0; a < workDesc.numAttribs; ++a) - { - __m128 vAttrib0 = 
_mm_load_ps(&workDesc.pAttribs[a * 12 + 0]); - __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]); - - _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1); - _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1); - _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0); - } - - // store user clip distance for triangle 1 - if (numClipDist) - { - float* pOldBuffer = workDesc.pUserClipBuffer; - float* pNewBuffer = newClipBuffer; - for (uint32_t i = 0; i < numClipDist; ++i) - { - // read barycentric coeffs from binner - float a = *(pOldBuffer++); - float b = *(pOldBuffer++); - - // reconstruct original clip distance at vertices - float c0 = a + b; - float c1 = b; - - // construct triangle barycentrics - *(pNewBuffer++) = c1 - c0; - *(pNewBuffer++) = c1 - c0; - *(pNewBuffer++) = c0; - } - } - - vXai = fpToFixedPoint(vXa); - vYai = fpToFixedPoint(vYa); - calcBoundingBoxInt(vXai, vYai, bboxA); - - if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax || - bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin || - bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax || - bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin)) - { - // rasterize triangle - pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); - } - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeLine, 1); -} - -void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData) -{ -#if KNOB_ENABLE_TOSS_POINTS - if (KNOB_TOSS_BIN_TRIS) - { - return; - } -#endif - - const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; - const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; - - // map x,y relative offsets from start of raster tile to bit position in - // coverage mask for the point - static const uint32_t coverageMap[8][8] = {{0, 1, 4, 5, 8, 9, 12, 13}, - {2, 3, 6, 7, 10, 11, 14, 15}, - {16, 17, 20, 21, 24, 25, 28, 29}, - {18, 19, 22, 
23, 26, 27, 30, 31}, - {32, 33, 36, 37, 40, 41, 44, 45}, - {34, 35, 38, 39, 42, 43, 46, 47}, - {48, 49, 52, 53, 56, 57, 60, 61}, - {50, 51, 54, 55, 58, 59, 62, 63}}; - - OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc = {}; - - // pull point information from triangle buffer - // @todo use structs for readability - uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer; - uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1); - float z = *(workDesc.pTriBuffer + 2); - - // construct triangle descriptor for point - // no interpolation, set up i,j for constant interpolation of z and attribs - // @todo implement an optimized backend that doesn't require triangle information - - // compute coverage mask from x,y packed into the coverageMask flag - // mask indices by the maximum valid index for x/y of coveragemap. - uint32_t tX = workDesc.triFlags.coverageMask & 0x7; - uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7; - for (uint32_t i = 0; i < _countof(triDesc.coverageMask); ++i) - { - triDesc.coverageMask[i] = 1ULL << coverageMap[tY][tX]; - } - triDesc.anyCoveredSamples = triDesc.coverageMask[0]; - triDesc.innerCoverageMask = triDesc.coverageMask[0]; - - // no persp divide needed for points - triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs; - triDesc.triFlags = workDesc.triFlags; - triDesc.recipDet = 1.0f; - triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f; - triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f; - triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f; - triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z; - - RenderOutputBuffers renderBuffers; - GetRenderHotTiles(pDC, - workerId, - macroTile, - tileAlignedX >> KNOB_TILE_X_DIM_SHIFT, - tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, - renderBuffers, - triDesc.triFlags.renderTargetArrayIndex); - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelBackend, pDC->drawId); - backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers); - 
RDTSC_END(pDC->pContext->pBucketMgr, BEPixelBackend, 0); -} - -void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData) -{ - const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; - const SWR_RASTSTATE& rastState = pDC->pState->state.rastState; - const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; - - bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0; - - // load point vertex - float x = *workDesc.pTriBuffer; - float y = *(workDesc.pTriBuffer + 1); - float z = *(workDesc.pTriBuffer + 2); - - // create a copy of the triangle buffer to write our adjusted vertices to - OSALIGNSIMD(float) newTriBuffer[4 * 4]; - TRIANGLE_WORK_DESC newWorkDesc = workDesc; - newWorkDesc.pTriBuffer = &newTriBuffer[0]; - - // create a copy of the attrib buffer to write our adjusted attribs to - OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS]; - newWorkDesc.pAttribs = &newAttribBuffer[0]; - - newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer; - newWorkDesc.numAttribs = workDesc.numAttribs; - newWorkDesc.triFlags = workDesc.triFlags; - - // construct two tris by bloating point by point size - float halfPointSize = workDesc.triFlags.pointSize * 0.5f; - float lowerX = x - halfPointSize; - float upperX = x + halfPointSize; - float lowerY = y - halfPointSize; - float upperY = y + halfPointSize; - - // tri 0 - float* pBuf = &newTriBuffer[0]; - *pBuf++ = lowerX; - *pBuf++ = lowerX; - *pBuf++ = upperX; - pBuf++; - *pBuf++ = lowerY; - *pBuf++ = upperY; - *pBuf++ = upperY; - pBuf++; - _mm_store_ps(pBuf, _mm_set1_ps(z)); - _mm_store_ps(pBuf += 4, _mm_set1_ps(1.0f)); - - // setup triangle rasterizer function - PFN_WORK_FUNC pfnTriRast; - // conservative rast not supported for points/lines - pfnTriRast = GetRasterizerFunc(rastState.sampleCount, - rastState.bIsCenterPattern, - false, - SWR_INPUT_COVERAGE_NONE, - EdgeValToEdgeState(ALL_EDGES_VALID), - 
(pDC->pState->state.scissorsTileAligned == false)); - - // overwrite texcoords for point sprites - if (isPointSpriteTexCoordEnabled) - { - // copy original attribs - memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float)); - newWorkDesc.pAttribs = &newAttribBuffer[0]; - - // overwrite texcoord for point sprites - uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; - unsigned long texCoordAttrib = 0; - - while (_BitScanForward(&texCoordAttrib, texCoordMask)) - { - texCoordMask &= ~(1 << texCoordAttrib); - __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib; - if (rastState.pointSpriteTopOrigin) - { - pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0); - pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0); - pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1); - } - else - { - pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0); - pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0); - pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1); - } - } - } - else - { - // no texcoord overwrite, can reuse the attrib buffer from frontend - newWorkDesc.pAttribs = workDesc.pAttribs; - } - - pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); - - // tri 1 - pBuf = &newTriBuffer[0]; - *pBuf++ = lowerX; - *pBuf++ = upperX; - *pBuf++ = upperX; - pBuf++; - *pBuf++ = lowerY; - *pBuf++ = upperY; - *pBuf++ = lowerY; - // z, w unchanged - - if (isPointSpriteTexCoordEnabled) - { - uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; - unsigned long texCoordAttrib = 0; - - while (_BitScanForward(&texCoordAttrib, texCoordMask)) - { - texCoordMask &= ~(1 << texCoordAttrib); - __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib; - if (rastState.pointSpriteTopOrigin) - { - pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0); - pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1); - pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1); - } - else - { - pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0); - pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1); - pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1); - } - } - } - - pfnTriRast(pDC, 
workerId, macroTile, (void*)&newWorkDesc); -} - -void InitRasterizerFunctions() -{ - InitRasterizerFuncs(); -} - -// Selector for correct templated RasterizeTriangle function -PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples, - bool IsCenter, - bool IsConservative, - SWR_INPUT_COVERAGE InputCoverage, - uint32_t EdgeEnable, - bool RasterizeScissorEdges) -{ - SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT); - SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT); - SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT); - - PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage] - [EdgeEnable][RasterizeScissorEdges]; - SWR_ASSERT(func); - - return func; -} diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h deleted file mode 100644 index f15cc193129..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h +++ /dev/null @@ -1,237 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file rasterizer.h - * - * @brief Definitions for the rasterizer. - * - ******************************************************************************/ -#pragma once - -#include "context.h" -#include <type_traits> -#include "conservativeRast.h" -#include "multisample.h" - -void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData); -void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData); -void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData); -void InitRasterizerFunctions(); - -INLINE -__m128i fpToFixedPoint(const __m128 vIn) -{ - __m128 vFixed = _mm_mul_ps(vIn, _mm_set1_ps(FIXED_POINT_SCALE)); - return _mm_cvtps_epi32(vFixed); -} - -enum TriEdgesStates -{ - STATE_NO_VALID_EDGES = 0, - STATE_E0_E1_VALID, - STATE_E0_E2_VALID, - STATE_E1_E2_VALID, - STATE_ALL_EDGES_VALID, - STATE_VALID_TRI_EDGE_COUNT, -}; - -enum TriEdgesValues -{ - NO_VALID_EDGES = 0, - E0_E1_VALID = 0x3, - E0_E2_VALID = 0x5, - E1_E2_VALID = 0x6, - ALL_EDGES_VALID = 0x7, - VALID_TRI_EDGE_COUNT, -}; - -// Selector for correct templated RasterizeTriangle function -PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples, - bool IsCenter, - bool IsConservative, - SWR_INPUT_COVERAGE InputCoverage, - uint32_t EdgeEnable, - bool RasterizeScissorEdges); - -////////////////////////////////////////////////////////////////////////// -/// @brief ValidTriEdges convenience typedefs used for templated 
function -/// specialization supported Fixed Point precisions -typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> AllEdgesValidT; -typedef std::integral_constant<uint32_t, E0_E1_VALID> E0E1ValidT; -typedef std::integral_constant<uint32_t, E0_E2_VALID> E0E2ValidT; -typedef std::integral_constant<uint32_t, E1_E2_VALID> E1E2ValidT; -typedef std::integral_constant<uint32_t, NO_VALID_EDGES> NoEdgesValidT; - -typedef std::integral_constant<uint32_t, STATE_ALL_EDGES_VALID> StateAllEdgesValidT; -typedef std::integral_constant<uint32_t, STATE_E0_E1_VALID> StateE0E1ValidT; -typedef std::integral_constant<uint32_t, STATE_E0_E2_VALID> StateE0E2ValidT; -typedef std::integral_constant<uint32_t, STATE_E1_E2_VALID> StateE1E2ValidT; -typedef std::integral_constant<uint32_t, STATE_NO_VALID_EDGES> StateNoEdgesValidT; - -// some specializations to convert from edge state to edge bitmask values -template <typename EdgeMask> -struct EdgeMaskVal -{ - static_assert(EdgeMask::value > STATE_ALL_EDGES_VALID, - "Primary EdgeMaskVal shouldn't be instantiated"); -}; - -template <> -struct EdgeMaskVal<StateAllEdgesValidT> -{ - typedef AllEdgesValidT T; -}; - -template <> -struct EdgeMaskVal<StateE0E1ValidT> -{ - typedef E0E1ValidT T; -}; - -template <> -struct EdgeMaskVal<StateE0E2ValidT> -{ - typedef E0E2ValidT T; -}; - -template <> -struct EdgeMaskVal<StateE1E2ValidT> -{ - typedef E1E2ValidT T; -}; - -template <> -struct EdgeMaskVal<StateNoEdgesValidT> -{ - typedef NoEdgesValidT T; -}; - -INLINE uint32_t EdgeValToEdgeState(uint32_t val) -{ - SWR_ASSERT(val < VALID_TRI_EDGE_COUNT, "Unexpected tri edge mask"); - static const uint32_t edgeValToEdgeState[VALID_TRI_EDGE_COUNT] = {0, 0, 0, 1, 0, 2, 3, 4}; - return edgeValToEdgeState[val]; -} - -////////////////////////////////////////////////////////////////////////// -/// @struct RasterScissorEdgesT -/// @brief Primary RasterScissorEdgesT templated struct that holds compile -/// time information about the number of edges needed to be 
rasterized, -/// If either the scissor rect or conservative rast is enabled, -/// the scissor test is enabled and the rasterizer will test -/// 3 triangle edges + 4 scissor edges for coverage. -/// @tparam RasterScissorEdgesT: number of multisamples -/// @tparam ConservativeT: is this a conservative rasterization -/// @tparam EdgeMaskT: Which edges are valid(not degenerate) -template <typename RasterScissorEdgesT, typename ConservativeT, typename EdgeMaskT> -struct RasterEdgeTraits -{ - typedef std::true_type RasterizeScissorEdgesT; - typedef std::integral_constant<uint32_t, 7> NumEdgesT; - // typedef std::integral_constant<uint32_t, EdgeMaskT::value> ValidEdgeMaskT; - typedef typename EdgeMaskVal<EdgeMaskT>::T ValidEdgeMaskT; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief specialization of RasterEdgeTraits. If neither scissor rect -/// nor conservative rast is enabled, only test 3 triangle edges -/// for coverage -template <typename EdgeMaskT> -struct RasterEdgeTraits<std::false_type, std::false_type, EdgeMaskT> -{ - typedef std::false_type RasterizeScissorEdgesT; - typedef std::integral_constant<uint32_t, 3> NumEdgesT; - // no need for degenerate edge masking in non-conservative case; rasterize all triangle edges - typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> ValidEdgeMaskT; -}; - -////////////////////////////////////////////////////////////////////////// -/// @struct RasterizerTraits -/// @brief templated struct that holds compile time information used -/// during rasterization. Inherits EdgeTraits and ConservativeRastBETraits. -/// @tparam NumSamplesT: number of multisamples -/// @tparam ConservativeT: is this a conservative rasterization -/// @tparam InputCoverageT: what type of input coverage is the PS expecting? -/// (only used with conservative rasterization) -/// @tparam RasterScissorEdgesT: do we need to rasterize with a scissor? 
-template <typename NumSamplesT, - typename CenterPatternT, - typename ConservativeT, - typename InputCoverageT, - typename EdgeEnableT, - typename RasterScissorEdgesT> -struct _RasterizerTraits : public ConservativeRastBETraits<ConservativeT, InputCoverageT>, - public RasterEdgeTraits<RasterScissorEdgesT, ConservativeT, EdgeEnableT> -{ - typedef MultisampleTraits<static_cast<SWR_MULTISAMPLE_COUNT>(NumSamplesT::value), - CenterPatternT::value> - MT; - - /// Fixed point precision the rasterizer is using - typedef FixedPointTraits<Fixed_16_8> PrecisionT; - /// Fixed point precision of the edge tests used during rasterization - typedef FixedPointTraits<Fixed_X_16> EdgePrecisionT; - - // If conservative rast or MSAA center pattern is enabled, only need a single sample coverage - // test, with the result copied to all samples - typedef std::integral_constant<int, ConservativeT::value ? 1 : MT::numCoverageSamples> - NumCoverageSamplesT; - - static_assert( - EdgePrecisionT::BitsT::value >= - ConservativeRastBETraits<ConservativeT, - InputCoverageT>::ConservativePrecisionT::BitsT::value, - "Rasterizer edge fixed point precision < required conservative rast precision"); - - /// constants used to offset between different types of raster tiles - static const int colorRasterTileStep{ - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)) * - MT::numSamples}; - static const int depthRasterTileStep{ - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)) * - MT::numSamples}; - static const int stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * - (FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)) * - MT::numSamples}; - static const int colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * - colorRasterTileStep}; - static const int depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * - depthRasterTileStep}; - static const int stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM 
/ KNOB_TILE_X_DIM) * - stencilRasterTileStep}; -}; - -template <uint32_t NumSamplesT, - uint32_t CenterPatternT, - uint32_t ConservativeT, - uint32_t InputCoverageT, - uint32_t EdgeEnableT, - uint32_t RasterScissorEdgesT> -struct RasterizerTraits final - : public _RasterizerTraits<std::integral_constant<uint32_t, NumSamplesT>, - std::integral_constant<bool, CenterPatternT != 0>, - std::integral_constant<bool, ConservativeT != 0>, - std::integral_constant<uint32_t, InputCoverageT>, - std::integral_constant<uint32_t, EdgeEnableT>, - std::integral_constant<bool, RasterScissorEdgesT != 0>> -{ -}; diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h deleted file mode 100644 index 2153fe653b1..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h +++ /dev/null @@ -1,1542 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file rasterizer.cpp - * - * @brief Implementation for the rasterizer. - * - ******************************************************************************/ - -#include <vector> -#include <algorithm> - -#include "rasterizer.h" -#include "rdtsc_core.h" -#include "backend.h" -#include "utils.h" -#include "frontend.h" -#include "tilemgr.h" -#include "memory/tilingtraits.h" - -extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT] - [STATE_VALID_TRI_EDGE_COUNT][2]; - -template <uint32_t numSamples = 1> -void GetRenderHotTiles(DRAW_CONTEXT* pDC, - uint32_t workerId, - uint32_t macroID, - uint32_t x, - uint32_t y, - RenderOutputBuffers& renderBuffers, - uint32_t renderTargetArrayIndex); -template <typename RT> -void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers); -template <typename RT> -void StepRasterTileY(uint32_t colorHotTileMask, - RenderOutputBuffers& buffers, - RenderOutputBuffers& startBufferRow); - -#define MASKTOVEC(i3, i2, i1, i0) \ - { \ - -i0, -i1, -i2, -i3 \ - } -static const __m256d gMaskToVecpd[] = { - MASKTOVEC(0, 0, 0, 0), - MASKTOVEC(0, 0, 0, 1), - MASKTOVEC(0, 0, 1, 0), - MASKTOVEC(0, 0, 1, 1), - MASKTOVEC(0, 1, 0, 0), - MASKTOVEC(0, 1, 0, 1), - MASKTOVEC(0, 1, 1, 0), - MASKTOVEC(0, 1, 1, 1), - MASKTOVEC(1, 0, 0, 0), - MASKTOVEC(1, 0, 0, 1), - MASKTOVEC(1, 0, 1, 0), - MASKTOVEC(1, 0, 1, 1), - MASKTOVEC(1, 1, 0, 0), - MASKTOVEC(1, 1, 0, 1), - MASKTOVEC(1, 1, 1, 0), - MASKTOVEC(1, 1, 1, 1), -}; - -struct POS -{ - int32_t x, y; -}; - -struct EDGE -{ - double a, b; // a, b edge coefficients in fix8 - double stepQuadX; // step to adjacent horizontal quad in fix16 - double stepQuadY; // step to adjacent vertical 
quad in fix16 - double stepRasterTileX; // step to adjacent horizontal raster tile in fix16 - double stepRasterTileY; // step to adjacent vertical raster tile in fix16 - - __m256d vQuadOffsets; // offsets for 4 samples of a quad - __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief rasterize a raster tile partially covered by the triangle -/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster -/// tile -/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C) -/// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad. -/// Used to step between quads when sweeping over the raster tile. -template <uint32_t NumEdges, typename EdgeMaskT> -INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT* pDC, - double startEdges[NumEdges], - EDGE* pRastEdges) -{ - uint64_t coverageMask = 0; - - __m256d vEdges[NumEdges]; - __m256d vStepX[NumEdges]; - __m256d vStepY[NumEdges]; - - for (uint32_t e = 0; e < NumEdges; ++e) - { - // Step to the pixel sample locations of the 1st quad - vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets); - - // compute step to next quad (mul by 2 in x and y direction) - vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX); - vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY); - } - - // fast unrolled version for 8x8 tile -#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8 - int edgeMask[NumEdges]; - uint64_t mask; - - auto eval_lambda = [&](int e) { edgeMask[e] = _mm256_movemask_pd(vEdges[e]); }; - auto update_lambda = [&](int e) { mask &= edgeMask[e]; }; - auto incx_lambda = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); }; - auto incy_lambda = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]); }; - auto decx_lambda = [&](int e) { vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]); }; - -// 
evaluate which pixels in the quad are covered -#define EVAL UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda); - - // update coverage mask - // if edge 0 is degenerate and will be skipped; init the mask -#define UPDATE_MASK(bit) \ - if (std::is_same<EdgeMaskT, E1E2ValidT>::value || \ - std::is_same<EdgeMaskT, NoEdgesValidT>::value) \ - { \ - mask = 0xf; \ - } \ - else \ - { \ - mask = edgeMask[0]; \ - } \ - UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \ - coverageMask |= (mask << bit); - - // step in the +x direction to the next quad -#define INCX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda); - - // step in the +y direction to the next quad -#define INCY UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda); - - // step in the -x direction to the next quad -#define DECX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda); - - // sweep 2x2 quad back and forth through the raster tile, - // computing coverage masks for the entire tile - - // raster tile - // 0 1 2 3 4 5 6 7 - // x x - // x x ------------------> - // x x | - // <-----------------x x V - // .. 
- - // row 0 - EVAL; - UPDATE_MASK(0); - INCX; - EVAL; - UPDATE_MASK(4); - INCX; - EVAL; - UPDATE_MASK(8); - INCX; - EVAL; - UPDATE_MASK(12); - INCY; - - // row 1 - EVAL; - UPDATE_MASK(28); - DECX; - EVAL; - UPDATE_MASK(24); - DECX; - EVAL; - UPDATE_MASK(20); - DECX; - EVAL; - UPDATE_MASK(16); - INCY; - - // row 2 - EVAL; - UPDATE_MASK(32); - INCX; - EVAL; - UPDATE_MASK(36); - INCX; - EVAL; - UPDATE_MASK(40); - INCX; - EVAL; - UPDATE_MASK(44); - INCY; - - // row 3 - EVAL; - UPDATE_MASK(60); - DECX; - EVAL; - UPDATE_MASK(56); - DECX; - EVAL; - UPDATE_MASK(52); - DECX; - EVAL; - UPDATE_MASK(48); -#else - uint32_t bit = 0; - for (uint32_t y = 0; y < KNOB_TILE_Y_DIM / 2; ++y) - { - __m256d vStartOfRowEdge[NumEdges]; - for (uint32_t e = 0; e < NumEdges; ++e) - { - vStartOfRowEdge[e] = vEdges[e]; - } - - for (uint32_t x = 0; x < KNOB_TILE_X_DIM / 2; ++x) - { - int edgeMask[NumEdges]; - for (uint32_t e = 0; e < NumEdges; ++e) - { - edgeMask[e] = _mm256_movemask_pd(vEdges[e]); - } - - uint64_t mask = edgeMask[0]; - for (uint32_t e = 1; e < NumEdges; ++e) - { - mask &= edgeMask[e]; - } - coverageMask |= (mask << bit); - - // step to the next pixel in the x - for (uint32_t e = 0; e < NumEdges; ++e) - { - vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); - } - bit += 4; - } - - // step to the next row - for (uint32_t e = 0; e < NumEdges; ++e) - { - vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]); - } - } -#endif - return coverageMask; -} -// Top left rule: -// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge -// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it -// is a 'left' edge Top left: a sample is in if it is a top or left edge. 
Out: !(horizontal && -// above) = !horizontal && below Out: !horizontal && left = !(!horizontal && left) = horizontal and -// right -INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d& vEdge) -{ - // if vA < 0, vC-- - // if vA == 0 && vB < 0, vC-- - - __m256d vEdgeOut = vEdge; - __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0)); - - // if vA < 0 (line is not horizontal and below) - int msk = _mm_movemask_ps(_mm_castsi128_ps(vA)); - - // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri) - __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128()); - int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp)); - msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB)); - - // if either of these are true and we're on the line (edge == 0), bump it outside the line - vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief calculates difference in precision between the result of manh -/// calculation and the edge precision, based on compile time trait values -template <typename RT> -constexpr int64_t ManhToEdgePrecisionAdjust() -{ - static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= - RT::EdgePrecisionT::BitsT::value, - "Inadequate precision of result of manh calculation "); - return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - - RT::EdgePrecisionT::BitsT::value); -} - -////////////////////////////////////////////////////////////////////////// -/// @struct adjustEdgeConservative -/// @brief Primary template definition used for partially specializing -/// the adjustEdgeConservative function. This struct should never -/// be instantiated. -/// @tparam RT: rasterizer traits -/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting? 
-template <typename RT, typename ConservativeEdgeOffsetT> -struct adjustEdgeConservative -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs calculations to adjust each edge of a triangle away - /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y - /// direction. - /// - /// Uncertainty regions arise from fixed point rounding, which - /// can snap a vertex +/- by min fixed point value. - /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners. - /// This allows the rasterizer to test for coverage only at the pixel center, - /// instead of having to test individual pixel corners for conservative coverage - INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge) - { - // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge - // away from the pixel center (in the direction of the edge normal A/B) - - // edge = Ax + Bx + C - (manh/e) - // manh = manhattan distance = abs(A) + abs(B) - // e = absolute rounding error from snapping from float to fixed point precision - - // 'fixed point' multiply (in double to be avx1 friendly) - // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example - __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), - vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi)); - __m256d manh = - _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)), - _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value))); - - static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= - RT::EdgePrecisionT::BitsT::value, - "Inadequate precision of result of manh calculation "); - - // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the - // same precision since we're doing fixed math in double format, multiply by multiples of - // 1/2 instead of a bit shift right - manh = 
_mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5)); - - // move the edge away from the pixel center by the required conservative precision + 1/2 - // pixel this allows the rasterizer to do a single conservative coverage test to see if the - // primitive intersects the pixel at all - vEdge = _mm256_sub_pd(vEdge, manh); - }; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief adjustEdgeConservative specialization where no edge offset is needed -template <typename RT> -struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>> -{ - INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge){}; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief calculates the distance a degenerate BBox needs to be adjusted -/// for conservative rast based on compile time trait values -template <typename RT> -constexpr int64_t ConservativeScissorOffset() -{ - static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, - "Rasterizer precision > conservative precision"); - // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox - // when calculating scissor edges - typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> - DegenerateEdgeOffsetT; - // 1/2 pixel edge offset + conservative offset - degenerateTriangle - return RT::ConservativeEdgeOffsetT::value - - (DegenerateEdgeOffsetT::value - << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value)); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Performs calculations to adjust each a vector of evaluated edges out -/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y -/// direction. 
-template <typename RT> -INLINE void adjustScissorEdge(const double a, const double b, __m256d& vEdge) -{ - int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b)); - int64_t manh = - ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> - ManhToEdgePrecisionAdjust<RT>(); - vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh)); -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Performs calculations to adjust each a scalar evaluated edge out -/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y -/// direction. -template <typename RT, typename OffsetT> -INLINE double adjustScalarEdge(const double a, const double b, const double Edge) -{ - int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b)); - int64_t manh = - ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>(); - return (Edge - manh); -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Perform any needed adjustments to evaluated triangle edges -template <typename RT, typename EdgeOffsetT> -struct adjustEdgesFix16 -{ - INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge) - { - static_assert( - std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value, - "Edge equation expected to be in x.16 fixed point"); - - static_assert(RT::IsConservativeT::value, - "Edge offset assumes conservative rasterization is enabled"); - - // need to apply any edge offsets before applying the top-left rule - adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge); - - adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Perform top left adjustments to evaluated triangle edges -template <typename RT> -struct adjustEdgesFix16<RT, 
std::integral_constant<int32_t, 0>> -{ - INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge) - { - adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); - } -}; - -// max(abs(dz/dx), abs(dz,dy) -INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc) -{ - /* - // evaluate i,j at (0,0) - float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; - float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; - - // evaluate i,j at (1,0) - float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; - float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; - - // compute dz/dx - float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2]; - float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2]; - float dzdx = abs(d10 - d00); - - // evaluate i,j at (0,1) - float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2]; - float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2]; - - float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2]; - float dzdy = abs(d01 - d00); - */ - - // optimized version of above - float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0])); - float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1])); - - return std::max(dzdx, dzdy); -} - -INLINE float -ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z) -{ - if (pState->depthFormat == R24_UNORM_X8_TYPELESS) - { - return (1.0f / (1 << 24)); - } - else if (pState->depthFormat == R16_UNORM) - { - return (1.0f / (1 << 16)); - } - else - { - SWR_ASSERT(pState->depthFormat == R32_FLOAT); - - // for f32 depth, factor = 2^(exponent(max(abs(z) - 23) - float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2]))); - uint32_t zMaxInt = *(uint32_t*)&zMax; - zMaxInt &= 0x7f800000; - zMax = *(float*)&zMaxInt; - - return zMax * (1.0f / (1 << 23)); - } -} - -INLINE float -ComputeDepthBias(const 
SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z) -{ - if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0) - { - return 0.0f; - } - - float scale = pState->slopeScaledDepthBias; - if (scale != 0.0f) - { - scale *= ComputeMaxDepthSlope(pTri); - } - - float bias = pState->depthBias; - if (!pState->depthBiasPreAdjusted) - { - bias *= ComputeBiasFactor(pState, pTri, z); - } - bias += scale; - - if (pState->depthBiasClamp > 0.0f) - { - bias = std::min(bias, pState->depthBiasClamp); - } - else if (pState->depthBiasClamp < 0.0f) - { - bias = std::max(bias, pState->depthBiasClamp); - } - - return bias; -} - -// Prevent DCE by writing coverage mask from rasterizer to volatile -#if KNOB_ENABLE_TOSS_POINTS -__declspec(thread) volatile uint64_t gToss; -#endif - -static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4; -// try to avoid _chkstk insertions; make this thread local -static THREAD -OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib]; - -INLINE -void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge) -{ - edge.a = a; - edge.b = b; - - // compute constant steps to adjacent quads - edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE)); - edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE)); - - // compute constant steps to adjacent raster tiles - edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE)); - edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE)); - - // compute quad offsets - const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0); - const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0); - - __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8); - __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8); - edge.vQuadOffsets = 
_mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16); - - // compute raster tile offsets - const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd( - (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0); - const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd( - (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, 0, 0); - - __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8); - __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8); - edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16); -} - -INLINE -void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge) -{ - ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Primary template definition used for partially specializing -/// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel -/// corner to sample position, and test for coverage -/// @tparam sampleCount: multisample count -template <typename NumSamplesT> -INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], - const __m256d* vEdgeFix16, - int32_t& mask0, - int32_t& mask1, - int32_t& mask2) -{ - __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; - // evaluate edge equations at the tile multisample bounding box - vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]); - vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]); - vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]); - mask0 = _mm256_movemask_pd(vSampleBboxTest0); - mask1 = _mm256_movemask_pd(vSampleBboxTest1); - mask2 = _mm256_movemask_pd(vSampleBboxTest2); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated -/// when only rasterizing a single coverage test point -template <> -INLINE void 
UpdateEdgeMasks<SingleSampleT>( - const __m256d (&)[3], const __m256d* vEdgeFix16, int32_t& mask0, int32_t& mask1, int32_t& mask2) -{ - mask0 = _mm256_movemask_pd(vEdgeFix16[0]); - mask1 = _mm256_movemask_pd(vEdgeFix16[1]); - mask2 = _mm256_movemask_pd(vEdgeFix16[2]); -} - -////////////////////////////////////////////////////////////////////////// -/// @struct ComputeScissorEdges -/// @brief Primary template definition. Allows the function to be generically -/// called. When paired with below specializations, will result in an empty -/// inlined function if scissor is not enabled -/// @tparam RasterScissorEdgesT: is scissor enabled? -/// @tparam IsConservativeT: is conservative rast enabled? -/// @tparam RT: rasterizer traits -template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT> -struct ComputeScissorEdges -{ - INLINE ComputeScissorEdges(const SWR_RECT& triBBox, - const SWR_RECT& scissorBBox, - const int32_t x, - const int32_t y, - EDGE (&rastEdges)[RT::NumEdgesT::value], - __m256d (&vEdgeFix16)[7]){}; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial -/// specialization. Instantiated when conservative rast and scissor are enabled -template <typename RT> -struct ComputeScissorEdges<std::true_type, std::true_type, RT> -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Intersect tri bbox with scissor, compute scissor edge vectors, - /// evaluate edge equations and offset them away from pixel center. 
- INLINE ComputeScissorEdges(const SWR_RECT& triBBox, - const SWR_RECT& scissorBBox, - const int32_t x, - const int32_t y, - EDGE (&rastEdges)[RT::NumEdgesT::value], - __m256d (&vEdgeFix16)[7]) - { - // if conservative rasterizing, triangle bbox intersected with scissor bbox is used - SWR_RECT scissor; - scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin); - scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax); - scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin); - scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax); - - POS topLeft{scissor.xmin, scissor.ymin}; - POS bottomLeft{scissor.xmin, scissor.ymax}; - POS topRight{scissor.xmax, scissor.ymin}; - POS bottomRight{scissor.xmax, scissor.ymax}; - - // construct 4 scissor edges in ccw direction - ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); - ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); - ComputeEdgeData(bottomRight, topRight, rastEdges[5]); - ComputeEdgeData(topRight, topLeft, rastEdges[6]); - - vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + - (rastEdges[3].b * (y - scissor.ymin))); - vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + - (rastEdges[4].b * (y - scissor.ymax))); - vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + - (rastEdges[5].b * (y - scissor.ymax))); - vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + - (rastEdges[6].b * (y - scissor.ymin))); - - // if conservative rasterizing, need to bump the scissor edges out by the conservative - // uncertainty distance, else do nothing - adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]); - adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]); - adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]); - adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]); - - // Upper left rule for scissor - vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0)); - vEdgeFix16[6] = 
_mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0)); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial -/// specialization. Instantiated when scissor is enabled and conservative rast -/// is disabled. -template <typename RT> -struct ComputeScissorEdges<std::true_type, std::false_type, RT> -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Compute scissor edge vectors and evaluate edge equations - INLINE ComputeScissorEdges(const SWR_RECT&, - const SWR_RECT& scissorBBox, - const int32_t x, - const int32_t y, - EDGE (&rastEdges)[RT::NumEdgesT::value], - __m256d (&vEdgeFix16)[7]) - { - const SWR_RECT& scissor = scissorBBox; - POS topLeft{scissor.xmin, scissor.ymin}; - POS bottomLeft{scissor.xmin, scissor.ymax}; - POS topRight{scissor.xmax, scissor.ymin}; - POS bottomRight{scissor.xmax, scissor.ymax}; - - // construct 4 scissor edges in ccw direction - ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); - ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); - ComputeEdgeData(bottomRight, topRight, rastEdges[5]); - ComputeEdgeData(topRight, topLeft, rastEdges[6]); - - vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + - (rastEdges[3].b * (y - scissor.ymin))); - vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + - (rastEdges[4].b * (y - scissor.ymax))); - vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + - (rastEdges[5].b * (y - scissor.ymax))); - vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + - (rastEdges[6].b * (y - scissor.ymin))); - - // Upper left rule for scissor - vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0)); - vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0)); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Primary function template for 
TrivialRejectTest. Should -/// never be called, but TemplateUnroller instantiates a few unused values, -/// so it calls a runtime assert instead of a static_assert. -template <typename ValidEdgeMaskT> -INLINE bool TrivialRejectTest(const int, const int, const int) -{ - SWR_INVALID("Primary templated function should never be called"); - return false; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0 -/// and edge 1 for trivial coverage reject -template <> -INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int) -{ - return (!(mask0 && mask1)) ? true : false; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0 -/// and edge 2 for trivial coverage reject -template <> -INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2) -{ - return (!(mask0 && mask2)) ? true : false; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1 -/// and edge 2 for trivial coverage reject -template <> -INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2) -{ - return (!(mask1 && mask2)) ? true : false; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all -/// primitive edges for trivial coverage reject -template <> -INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2) -{ - return (!(mask0 && mask1 && mask2)) ? true : false; - ; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief NoEdgesValidT specialization of TrivialRejectTest. 
Degenerate -/// point, so return false and rasterize against conservative BBox -template <> -INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int) -{ - return false; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Primary function template for TrivialAcceptTest. Always returns -/// false, since it will only be called for degenerate tris, and as such -/// will never cover the entire raster tile -template <typename ScissorEnableT> -INLINE bool TrivialAcceptTest(const int, const int, const int) -{ - return false; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all -/// edge masks for a fully covered raster tile -template <> -INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2) -{ - return ((mask0 & mask1 & mask2) == 0xf); -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Primary function template for GenerateSVInnerCoverage. Results -/// in an empty function call if SVInnerCoverage isn't requested -template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT> -struct GenerateSVInnerCoverage -{ - INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t&){}; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Specialization of GenerateSVInnerCoverage where all edges -/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated -/// edge values from OuterConservative to InnerConservative and rasterizes. 
-template <typename RT> -struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT> -{ - INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, - uint32_t workerId, - EDGE* pRastEdges, - double* pStartQuadEdges, - uint64_t& innerCoverageMask) - { - double startQuadEdgesAdj[RT::NumEdgesT::value]; - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>( - pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]); - } - - // not trivial accept or reject, must rasterize full tile - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizePartial, pDC->drawId); - innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>( - pDC, startQuadEdgesAdj, pRastEdges); - RDTSC_END(pDC->pContext->pBucketMgr, BERasterizePartial, 0); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results -/// in an empty function call if SVInnerCoverage isn't requested -template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT> -struct UpdateEdgeMasksInnerConservative -{ - INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], - const __m256d*, - const __m128i, - const __m128i, - int32_t&, - int32_t&, - int32_t&){}; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges -/// are non-degenerate and SVInnerCoverage is requested. 
Offsets the edges -/// evaluated at raster tile corners to inner conservative position and -/// updates edge masks -template <typename RT> -struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT> -{ - INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], - const __m256d* vEdgeFix16, - const __m128i vAi, - const __m128i vBi, - int32_t& mask0, - int32_t& mask1, - int32_t& mask2) - { - __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]}; - - // instead of keeping 2 copies of evaluated edges around, just compensate for the outer - // conservative evaluated edge when adjusting the edge in for inner conservative tests - adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>( - vAi, vBi, vTempEdge[0]); - adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>( - vAi, vBi, vTempEdge[1]); - adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>( - vAi, vBi, vTempEdge[2]); - - UpdateEdgeMasks<typename RT::NumCoverageSamplesT>( - vEdgeTileBbox, vTempEdge, mask0, mask1, mask2); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage -/// is requested but at least one edge is degenerate. 
Since a degenerate triangle cannot -/// cover an entire raster tile, set mask0 to 0 to force it down the -/// rastierizePartialTile path -template <typename RT, typename ValidEdgeMaskT> -struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT> -{ - INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], - const __m256d*, - const __m128i, - const __m128i, - int32_t& mask0, - int32_t&, - int32_t&) - { - // set one mask to zero to force the triangle down the rastierizePartialTile path - mask0 = 0; - } -}; - -template <typename RT> -void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc) -{ - const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pDesc); -#if KNOB_ENABLE_TOSS_POINTS - if (KNOB_TOSS_BIN_TRIS) - { - return; - } -#endif - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeTriangle, pDC->drawId); - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BETriangleSetup, pDC->drawId); - - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; - - OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; - triDesc.pUserClipBuffer = workDesc.pUserClipBuffer; - - __m128 vX, vY, vZ, vRecipW; - - // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care - // eg: vX = [x0 x1 x2 dc] - vX = _mm_load_ps(workDesc.pTriBuffer); - vY = _mm_load_ps(workDesc.pTriBuffer + 4); - vZ = _mm_load_ps(workDesc.pTriBuffer + 8); - vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); - - // convert to fixed point - static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, - "Rasterizer expects 16.8 fixed point precision"); - __m128i vXi = fpToFixedPoint(vX); - __m128i vYi = fpToFixedPoint(vY); - - // quantize floating point position to fixed point precision - // to prevent attribute creep around the triangle vertices - vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / 
FIXED_POINT_SCALE)); - vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); - - // triangle setup - A and B edge equation coefs - __m128 vA, vB; - triangleSetupAB(vX, vY, vA, vB); - - __m128i vAi, vBi; - triangleSetupABInt(vXi, vYi, vAi, vBi); - - // determinant - float det = calcDeterminantInt(vAi, vBi); - - // Verts in Pixel Coordinate Space at this point - // Det > 0 = CW winding order - // Convert CW triangles to CCW - if (det > 0.0) - { - vA = _mm_mul_ps(vA, _mm_set1_ps(-1)); - vB = _mm_mul_ps(vB, _mm_set1_ps(-1)); - vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1)); - vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1)); - det = -det; - } - - __m128 vC; - // Finish triangle setup - C edge coef - triangleSetupC(vX, vY, vA, vB, vC); - - if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID) - { - // If we have degenerate edge(s) to rasterize, set I and J coefs - // to 0 for constant interpolation of attributes - triDesc.I[0] = 0.0f; - triDesc.I[1] = 0.0f; - triDesc.I[2] = 0.0f; - triDesc.J[0] = 0.0f; - triDesc.J[1] = 0.0f; - triDesc.J[2] = 0.0f; - - // Degenerate triangles have no area - triDesc.recipDet = 0.0f; - } - else - { - // only extract coefs for 2 of the barycentrics; the 3rd can be - // determined from the barycentric equation: - // i + j + k = 1 <=> k = 1 - j - i - _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1); - _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1); - _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1); - _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2); - _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2); - _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2); - - // compute recipDet, used to calculate barycentric i and j in the backend - triDesc.recipDet = 1.0f / det; - } - - OSALIGNSIMD(float) oneOverW[4]; - _mm_store_ps(oneOverW, vRecipW); - triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2]; - triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2]; - triDesc.OneOverW[2] = oneOverW[2]; - - // calculate perspective correct coefs per vertex attrib - float* pPerspAttribs = perspAttribsTLS; - float* 
pAttribs = workDesc.pAttribs; - triDesc.pPerspAttribs = pPerspAttribs; - triDesc.pAttribs = pAttribs; - float* pRecipW = workDesc.pTriBuffer + 12; - triDesc.pRecipW = pRecipW; - __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW); - __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW += 1); - __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW += 1); - for (uint32_t i = 0; i < workDesc.numAttribs; i++) - { - __m128 attribA = _mm_load_ps(pAttribs); - __m128 attribB = _mm_load_ps(pAttribs += 4); - __m128 attribC = _mm_load_ps(pAttribs += 4); - pAttribs += 4; - - attribA = _mm_mul_ps(attribA, vOneOverWV0); - attribB = _mm_mul_ps(attribB, vOneOverWV1); - attribC = _mm_mul_ps(attribC, vOneOverWV2); - - _mm_store_ps(pPerspAttribs, attribA); - _mm_store_ps(pPerspAttribs += 4, attribB); - _mm_store_ps(pPerspAttribs += 4, attribC); - pPerspAttribs += 4; - } - - // compute bary Z - // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0) - OSALIGNSIMD(float) a[4]; - _mm_store_ps(a, vZ); - triDesc.Z[0] = a[0] - a[2]; - triDesc.Z[1] = a[1] - a[2]; - triDesc.Z[2] = a[2]; - - // add depth bias - triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8); - - // Calc bounding box of triangle - OSALIGNSIMD(SWR_RECT) bbox; - calcBoundingBoxInt(vXi, vYi, bbox); - - const SWR_RECT& scissorInFixedPoint = - state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; - - if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID) - { - // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is - // valid - bbox.xmin--; - bbox.xmax++; - bbox.ymin--; - bbox.ymax++; - SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0, - "Conservative rast degenerate handling requires a valid scissor rect"); - } - - // Intersect with scissor/viewport - OSALIGNSIMD(SWR_RECT) intersect; - intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin); - intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax); - intersect.ymin = 
std::max(bbox.ymin, scissorInFixedPoint.ymin); - intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax); - - triDesc.triFlags = workDesc.triFlags; - - // further constrain backend to intersecting bounding box of macro tile and scissored triangle - // bbox - uint32_t macroX, macroY; - MacroTileMgr::getTileIndices(macroTile, macroX, macroY); - int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; - int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; - int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; - int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; - - intersect.xmin = std::max(intersect.xmin, macroBoxLeft); - intersect.ymin = std::max(intersect.ymin, macroBoxTop); - intersect.xmax = std::min(intersect.xmax, macroBoxRight); - intersect.ymax = std::min(intersect.ymax, macroBoxBottom); - - SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && - intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && - intersect.ymax >= 0); - - RDTSC_END(pDC->pContext->pBucketMgr, BETriangleSetup, 0); - - // update triangle desc - uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); - uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); - uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); - uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); - uint32_t numTilesX = maxTileX - minTileX + 1; - uint32_t numTilesY = maxTileY - minTileY + 1; - - if (numTilesX == 0 || numTilesY == 0) - { - RDTSC_EVENT(pDC->pContext->pBucketMgr, BEEmptyTriangle, 1, 0); - RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeTriangle, 1); - return; - } - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEStepSetup, pDC->drawId); - - // Step to pixel center of top-left pixel of the triangle bbox - // Align intersect bbox (top/left) to raster tile's (top/left). 
- int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM)); - int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM)); - - // convenience typedef - typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT; - - // single sample rasterization evaluates edges at pixel center, - // multisample evaluates edges UL pixel corner and steps to each sample position - if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value) - { - // Add 0.5, in fixed point, to offset to pixel center - x += (FIXED_POINT_SCALE / 2); - y += (FIXED_POINT_SCALE / 2); - } - - __m128i vTopLeftX = _mm_set1_epi32(x); - __m128i vTopLeftY = _mm_set1_epi32(y); - - // evaluate edge equations at top-left pixel using 64bit math - // - // line = Ax + By + C - // solving for C: - // C = -Ax - By - // we know x0 and y0 are on the line; plug them in: - // C = -Ax0 - By0 - // plug C back into line equation: - // line = Ax - By - Ax0 - By0 - // line = A(x - x0) + B(y - y0) - // dX = (x-x0), dY = (y-y0) - // so all this simplifies to - // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within - - __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi); - __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi); - - // evaluate A(dx) and B(dY) for all points - __m256d vAipd = _mm256_cvtepi32_pd(vAi); - __m256d vBipd = _mm256_cvtepi32_pd(vBi); - __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX); - __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY); - - __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd); - __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd); - __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16); - - // apply any edge adjustments(top-left, crast, etc) - adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge); - - // broadcast respective edge results to all lanes - double* pEdge = (double*)&vEdge; - __m256d vEdgeFix16[7]; - vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]); - vEdgeFix16[1] = 
_mm256_set1_pd(pEdge[1]); - vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]); - - OSALIGNSIMD(int32_t) aAi[4], aBi[4]; - _mm_store_si128((__m128i*)aAi, vAi); - _mm_store_si128((__m128i*)aBi, vBi); - EDGE rastEdges[RT::NumEdgesT::value]; - - // Compute and store triangle edge data - ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]); - ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]); - ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]); - - // Compute and store triangle edge data if scissor needs to rasterized - ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>( - bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16); - - // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile - // used to for testing if entire raster tile is inside a triangle - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets); - } - - // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox - // step sample positions to the raster tile bbox of multisample points - // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples) - // | | - // | | - // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples) - __m256d vEdgeTileBbox[3]; - if (NumCoverageSamplesT::value > 1) - { - const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions; - const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX(); - const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY(); - - __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh); - __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh); - - // step edge equation tests from Tile - // used to for testing if entire raster tile is inside a triangle - for (uint32_t e = 0; e < 3; ++e) - { - __m256d vResultAxFix16 = - _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8); - __m256d vResultByFix16 = - 
_mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8); - vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); - - // adjust for msaa tile bbox edges outward for conservative rast, if enabled - adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>( - vAi, vBi, vEdgeTileBbox[e]); - } - } - - RDTSC_END(pDC->pContext->pBucketMgr, BEStepSetup, 0); - - uint32_t tY = minTileY; - uint32_t tX = minTileX; - uint32_t maxY = maxTileY; - uint32_t maxX = maxTileX; - - RenderOutputBuffers renderBuffers, currentRenderBufferRow; - GetRenderHotTiles<RT::MT::numSamples>(pDC, - workerId, - macroTile, - minTileX, - minTileY, - renderBuffers, - triDesc.triFlags.renderTargetArrayIndex); - currentRenderBufferRow = renderBuffers; - - // rasterize and generate coverage masks per sample - for (uint32_t tileY = tY; tileY <= maxY; ++tileY) - { - __m256d vStartOfRowEdge[RT::NumEdgesT::value]; - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - vStartOfRowEdge[e] = vEdgeFix16[e]; - } - - for (uint32_t tileX = tX; tileX <= maxX; ++tileX) - { - triDesc.anyCoveredSamples = 0; - - // is the corner of the edge outside of the raster tile? 
(vEdge < 0) - int mask0, mask1, mask2; - UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2); - - for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++) - { - // trivial reject, at least one edge has all 4 corners of raster tile outside - bool trivialReject = - TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2); - - if (!trivialReject) - { - // trivial accept mask - triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL; - - // Update the raster tile edge masks based on inner conservative edge offsets, - // if enabled - UpdateEdgeMasksInnerConservative<RT, - typename RT::ValidEdgeMaskT, - typename RT::InputCoverageT>( - vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2); - - // @todo Make this a bit smarter to allow use of trivial accept when: - // 1) scissor/vp intersection rect is raster tile aligned - // 2) raster tile is entirely within scissor/vp intersection rect - if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2)) - { - // trivial accept, all 4 corners of all 3 edges are negative - // i.e. 
raster tile completely inside triangle - triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum]; - if (std::is_same<typename RT::InputCoverageT, - InnerConservativeCoverageT>::value) - { - triDesc.innerCoverageMask = 0xffffffffffffffffULL; - } - RDTSC_EVENT(pDC->pContext->pBucketMgr, BETrivialAccept, 1, 0); - } - else - { - __m256d vEdgeAtSample[RT::NumEdgesT::value]; - if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value) - { - // should get optimized out for single sample case (global value - // numbering or copy propagation) - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - vEdgeAtSample[e] = vEdgeFix16[e]; - } - } - else - { - const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions; - __m128i vSampleOffsetXh = samplePos.vXi(sampleNum); - __m128i vSampleOffsetYh = samplePos.vYi(sampleNum); - __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh); - __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh); - - // step edge equation tests from UL tile corner to pixel sample position - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - __m256d vResultAxFix16 = - _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX); - __m256d vResultByFix16 = - _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY); - vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); - vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]); - } - } - - double startQuadEdges[RT::NumEdgesT::value]; - const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]); - } - - // not trivial accept or reject, must rasterize full tile - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizePartial, pDC->drawId); - triDesc.coverageMask[sampleNum] = - rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>( - pDC, startQuadEdges, rastEdges); - 
RDTSC_END(pDC->pContext->pBucketMgr, BERasterizePartial, 0); - - triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; - - // Output SV InnerCoverage, if needed - GenerateSVInnerCoverage<RT, - typename RT::ValidEdgeMaskT, - typename RT::InputCoverageT>( - pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask); - } - } - else - { - // if we're calculating coverage per sample, need to store it off. otherwise no - // covered samples, don't need to do anything - if (NumCoverageSamplesT::value > 1) - { - triDesc.coverageMask[sampleNum] = 0; - } - RDTSC_EVENT(pDC->pContext->pBucketMgr, BETrivialReject, 1, 0); - } - } - -#if KNOB_ENABLE_TOSS_POINTS - if (KNOB_TOSS_RS) - { - gToss = triDesc.coverageMask[0]; - } - else -#endif - if (triDesc.anyCoveredSamples) - { - // if conservative rast and MSAA are enabled, conservative coverage for a pixel - // means all samples in that pixel are covered copy conservative coverage result to - // all samples - if (RT::IsConservativeT::value) - { - auto copyCoverage = [&](int sample) { - triDesc.coverageMask[sample] = triDesc.coverageMask[0]; - }; - UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage); - } - - // Track rasterized subspans - AR_EVENT(RasterTileCount(pDC->drawId, 1)); - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelBackend, pDC->drawId); - backendFuncs.pfnBackend(pDC, - workerId, - tileX << KNOB_TILE_X_DIM_SHIFT, - tileY << KNOB_TILE_Y_DIM_SHIFT, - triDesc, - renderBuffers); - RDTSC_END(pDC->pContext->pBucketMgr, BEPixelBackend, 0); - } - - // step to the next tile in X - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - vEdgeFix16[e] = - _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX)); - } - StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers); - } - - // step to the next tile in Y - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - vEdgeFix16[e] = - _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY)); - } - 
StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow); - } - - RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeTriangle, 1); -} - -// Get pointers to hot tile memory for color RT, depth, stencil -template <uint32_t numSamples> -void GetRenderHotTiles(DRAW_CONTEXT* pDC, - uint32_t workerId, - uint32_t macroID, - uint32_t tileX, - uint32_t tileY, - RenderOutputBuffers& renderBuffers, - uint32_t renderTargetArrayIndex) -{ - const API_STATE& state = GetApiState(pDC); - SWR_CONTEXT* pContext = pDC->pContext; - HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - - uint32_t mx, my; - MacroTileMgr::getTileIndices(macroID, mx, my); - tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx; - tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my; - - // compute tile offset for active hottile buffers - const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8; - uint32_t offset = ComputeTileOffset2D< - TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp>>( - pitch, tileX, tileY); - offset *= numSamples; - - unsigned long rtSlot = 0; - uint32_t colorHottileEnableMask = state.colorHottileEnable; - while (_BitScanForward(&rtSlot, colorHottileEnableMask)) - { - HOTTILE* pColor = pContext->pHotTileMgr->GetHotTile( - pContext, - pDC, - hWorkerPrivateData, - macroID, - (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), - true, - numSamples, - renderTargetArrayIndex); - renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset; - renderBuffers.pColorHotTile[rtSlot] = pColor; - - colorHottileEnableMask &= ~(1 << rtSlot); - } - if (state.depthHottileEnable) - { - const uint32_t pitch = - KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8; - uint32_t offset = ComputeTileOffset2D< - TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp>>( - pitch, tileX, tileY); - offset *= numSamples; - HOTTILE* pDepth = 
pContext->pHotTileMgr->GetHotTile(pContext, - pDC, - hWorkerPrivateData, - macroID, - SWR_ATTACHMENT_DEPTH, - true, - numSamples, - renderTargetArrayIndex); - pDepth->state = HOTTILE_DIRTY; - SWR_ASSERT(pDepth->pBuffer != nullptr); - renderBuffers.pDepth = pDepth->pBuffer + offset; - renderBuffers.pDepthHotTile = pDepth; - } - if (state.stencilHottileEnable) - { - const uint32_t pitch = - KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8; - uint32_t offset = ComputeTileOffset2D< - TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp>>( - pitch, tileX, tileY); - offset *= numSamples; - HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, - pDC, - hWorkerPrivateData, - macroID, - SWR_ATTACHMENT_STENCIL, - true, - numSamples, - renderTargetArrayIndex); - pStencil->state = HOTTILE_DIRTY; - SWR_ASSERT(pStencil->pBuffer != nullptr); - renderBuffers.pStencil = pStencil->pBuffer + offset; - renderBuffers.pStencilHotTile = pStencil; - } -} - -// Steps the output pointers in `buffers` one raster tile in X. -// Each color pointer whose bit is set in colorHotTileMask advances by RT::colorRasterTileStep; -// the depth and stencil pointers always advance by their own RT steps. -// The mask is taken by value, so clearing bits here does not affect the caller's mask. -template <typename RT> -INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers) -{ - unsigned long rt = 0; - while (_BitScanForward(&rt, colorHotTileMask)) - { - colorHotTileMask &= ~(1 << rt); - buffers.pColor[rt] += RT::colorRasterTileStep; - } - - buffers.pDepth += RT::depthRasterTileStep; - buffers.pStencil += RT::stencilRasterTileStep; -} - -// Steps to the next raster tile row. -// The row-start pointers in startBufferRow advance by the RT row steps (color only for bits -// set in colorHotTileMask; depth and stencil always), and the matching pointers in `buffers` -// are reset to those new row starts. -template <typename RT> -INLINE void StepRasterTileY(uint32_t colorHotTileMask, - RenderOutputBuffers& buffers, - RenderOutputBuffers& startBufferRow) -{ - unsigned long rt = 0; - while (_BitScanForward(&rt, colorHotTileMask)) - { - colorHotTileMask &= ~(1 << rt); - startBufferRow.pColor[rt] += RT::colorRasterTileRowStep; - buffers.pColor[rt] = startBufferRow.pColor[rt]; - } - startBufferRow.pDepth += RT::depthRasterTileRowStep; - buffers.pDepth = startBufferRow.pDepth; - - startBufferRow.pStencil += RT::stencilRasterTileRowStep; - buffers.pStencil = startBufferRow.pStencil; -} diff --git
a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp deleted file mode 100644 index 6329b2ec98e..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ****************************************************************************/ - -#include "rdtsc_core.h" -#include "common/rdtsc_buckets.h" - -// must match CORE_BUCKETS enum order -BUCKET_DESC gCoreBuckets[] = { - {"APIClearRenderTarget", "", true, 0xff0b8bea}, - {"APIDraw", "", true, 0xff000066}, - {"APIDrawWakeAllThreads", "", false, 0xffffffff}, - {"APIDrawIndexed", "", true, 0xff000066}, - {"APIDispatch", "", true, 0xff660000}, - {"APIStoreTiles", "", true, 0xff00ffff}, - {"APIGetDrawContext", "", false, 0xffffffff}, - {"APISync", "", true, 0xff6666ff}, - {"APIWaitForIdle", "", true, 0xff0000ff}, - {"FEProcessDraw", "", true, 0xff009900}, - {"FEProcessDrawIndexed", "", true, 0xff009900}, - {"FEFetchShader", "", false, 0xffffffff}, - {"FEVertexShader", "", false, 0xffffffff}, - {"FEHullShader", "", false, 0xffffffff}, - {"FETessellation", "", false, 0xffffffff}, - {"FEDomainShader", "", false, 0xffffffff}, - {"FEGeometryShader", "", false, 0xffffffff}, - {"FEStreamout", "", false, 0xffffffff}, - {"FEPAAssemble", "", false, 0xffffffff}, - {"FEBinPoints", "", false, 0xff29b854}, - {"FEBinLines", "", false, 0xff29b854}, - {"FEBinTriangles", "", false, 0xff29b854}, - {"FETriangleSetup", "", false, 0xffffffff}, - {"FEViewportCull", "", false, 0xffffffff}, - {"FEGuardbandClip", "", false, 0xffffffff}, - {"FEClipPoints", "", false, 0xffffffff}, - {"FEClipLines", "", false, 0xffffffff}, - {"FEClipTriangles", "", false, 0xffffffff}, - {"FEClipRectangles", "", false, 0xffffffff}, - {"FECullZeroAreaAndBackface", "", false, 0xffffffff}, - {"FECullBetweenCenters", "", false, 0xffffffff}, - {"FEEarlyRastEnter", "", false, 0xffffffff}, - {"FEEarlyRastExit", "", false, 0xffffffff}, - {"FEProcessStoreTiles", "", true, 0xff39c864}, - {"FEProcessInvalidateTiles", "", true, 0xffffffff}, - {"WorkerWorkOnFifoBE", "", false, 0xff40261c}, - {"WorkerFoundWork", "", false, 0xff573326}, - {"BELoadTiles", "", true, 0xffb0e2ff}, - {"BEDispatch", "", true, 0xff00a2ff}, - {"BEClear", "", 
true, 0xff00ccbb}, - {"BERasterizeLine", "", true, 0xffb26a4e}, - {"BERasterizeTriangle", "", true, 0xffb26a4e}, - {"BETriangleSetup", "", false, 0xffffffff}, - {"BEStepSetup", "", false, 0xffffffff}, - {"BECullZeroArea", "", false, 0xffffffff}, - {"BEEmptyTriangle", "", false, 0xffffffff}, - {"BETrivialAccept", "", false, 0xffffffff}, - {"BETrivialReject", "", false, 0xffffffff}, - {"BERasterizePartial", "", false, 0xffffffff}, - {"BEPixelBackend", "", false, 0xffffffff}, - {"BESetup", "", false, 0xffffffff}, - {"BEBarycentric", "", false, 0xffffffff}, - {"BEEarlyDepthTest", "", false, 0xffffffff}, - {"BEPixelShader", "", false, 0xffffffff}, - {"BESingleSampleBackend", "", false, 0xffffffff}, - {"BEPixelRateBackend", "", false, 0xffffffff}, - {"BESampleRateBackend", "", false, 0xffffffff}, - {"BENullBackend", "", false, 0xffffffff}, - {"BELateDepthTest", "", false, 0xffffffff}, - {"BEOutputMerger", "", false, 0xffffffff}, - {"BEStoreTiles", "", true, 0xff00cccc}, - {"BEEndTile", "", false, 0xffffffff}, -}; -static_assert(NumBuckets == (sizeof(gCoreBuckets) / sizeof(gCoreBuckets[0])), - "RDTSC Bucket enum and description table size mismatched."); - diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h deleted file mode 100644 index 0228275bd47..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h +++ /dev/null @@ -1,185 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ****************************************************************************/ - -#pragma once -#include "knobs.h" - -#include "common/os.h" -#include "common/rdtsc_buckets.h" - -#include <vector> - -/////////////////////////////////////////////////////////////////////////////// -// NOTE: This enum MUST be kept in sync with gCoreBuckets in rdtsc_core.cpp -/////////////////////////////////////////////////////////////////////////////// -enum CORE_BUCKETS -{ - APIClearRenderTarget, - APIDraw, - APIDrawWakeAllThreads, - APIDrawIndexed, - APIDispatch, - APIStoreTiles, - APIGetDrawContext, - APISync, - APIWaitForIdle, - FEProcessDraw, - FEProcessDrawIndexed, - FEFetchShader, - FEVertexShader, - FEHullShader, - FETessellation, - FEDomainShader, - FEGeometryShader, - FEStreamout, - FEPAAssemble, - FEBinPoints, - FEBinLines, - FEBinTriangles, - FETriangleSetup, - FEViewportCull, - FEGuardbandClip, - FEClipPoints, - FEClipLines, - FEClipTriangles, - FEClipRectangles, - FECullZeroAreaAndBackface, - FECullBetweenCenters, - FEEarlyRastEnter, - FEEarlyRastExit, - FEProcessStoreTiles, - FEProcessInvalidateTiles, - WorkerWorkOnFifoBE, - WorkerFoundWork, - BELoadTiles, - BEDispatch, - BEClear, - BERasterizeLine, - BERasterizeTriangle, - BETriangleSetup, - BEStepSetup, - BECullZeroArea, - BEEmptyTriangle, - BETrivialAccept, - BETrivialReject, - BERasterizePartial, - BEPixelBackend, - BESetup, - BEBarycentric, - BEEarlyDepthTest, - BEPixelShader, - BESingleSampleBackend, - BEPixelRateBackend, - BESampleRateBackend, - BENullBackend, - BELateDepthTest, - BEOutputMerger, - BEStoreTiles, - BEEndTile, - - NumBuckets -}; - -void rdtscReset(BucketManager* pBucketMgr); -void rdtscInit(BucketManager* pBucketMgr, int threadId); -void rdtscStart(BucketManager* pBucketMgr, uint32_t bucketId); -void rdtscStop(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count, uint64_t drawId); -void rdtscEvent(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count1, uint32_t count2); -void 
rdtscEndFrame(BucketManager* pBucketMgr); - -#ifdef KNOB_ENABLE_RDTSC -#define RDTSC_RESET(pBucketMgr) rdtscReset(pBucketMgr) -#define RDTSC_INIT(pBucketMgr, threadId) rdtscInit(pBucketMgr,threadId) -#define RDTSC_START(pBucketMgr, bucket) rdtscStart(pBucketMgr, bucket) -#define RDTSC_STOP(pBucketMgr, bucket, count, draw) rdtscStop(pBucketMgr, bucket, count, draw) -#define RDTSC_EVENT(pBucketMgr, bucket, count1, count2) rdtscEvent(pBucketMgr, bucket, count1, count2) -#define RDTSC_ENDFRAME(pBucketMgr) rdtscEndFrame(pBucketMgr) -#else -#define RDTSC_RESET(pBucketMgr) -#define RDTSC_INIT(pBucketMgr, threadId) -#define RDTSC_START(pBucketMgr, bucket) -#define RDTSC_STOP(pBucketMgr, bucket, count, draw) -#define RDTSC_EVENT(pBucketMgr, bucket, count1, count2) -#define RDTSC_ENDFRAME(pBucketMgr) -#endif - -extern BUCKET_DESC gCoreBuckets[]; - -// Resets the manager's frame counter to zero and calls ClearThreads() on it. -INLINE void rdtscReset(BucketManager *pBucketMgr) -{ - pBucketMgr->mCurrentFrame = 0; - pBucketMgr->ClearThreads(); -} - -// Registers all NumBuckets entries of gCoreBuckets with the manager the first time it is -// called with threadId == 0, storing the returned ids in mBucketMap, then registers the -// calling thread under the name "API" (threadId 0) or "WORKER" (any other id). -INLINE void rdtscInit(BucketManager* pBucketMgr, int threadId) -{ - // register all the buckets once - if (!pBucketMgr->mBucketsInitialized && (threadId == 0)) - { - pBucketMgr->mBucketMap.resize(NumBuckets); - for (uint32_t i = 0; i < NumBuckets; ++i) - { - pBucketMgr->mBucketMap[i] = pBucketMgr->RegisterBucket(gCoreBuckets[i]); - } - pBucketMgr->mBucketsInitialized = true; - } - - std::string name = threadId == 0 ?
"API" : "WORKER"; - pBucketMgr->RegisterThread(name); -} - -// Translates a CORE_BUCKETS id through mBucketMap and starts that bucket. -INLINE void rdtscStart(BucketManager* pBucketMgr, uint32_t bucketId) -{ - uint32_t id = pBucketMgr->mBucketMap[bucketId]; - pBucketMgr->StartBucket(id); -} - -// Translates a CORE_BUCKETS id through mBucketMap and stops that bucket. -// NOTE: count and drawId are accepted but not used by this implementation. -INLINE void rdtscStop(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count, uint64_t drawId) -{ - uint32_t id = pBucketMgr->mBucketMap[bucketId]; - pBucketMgr->StopBucket(id); -} - -// Records an event with value count1 on the mapped bucket. -// NOTE: count2 is accepted but not used by this implementation. -INLINE void rdtscEvent(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count1, uint32_t count2) -{ - uint32_t id = pBucketMgr->mBucketMap[bucketId]; - pBucketMgr->AddEvent(id, count1); -} - -// Advances the frame counter. Capture starts when the counter reaches KNOB_BUCKETS_START_FRAME; -// when it reaches KNOB_BUCKETS_END_FRAME capture stops and the report is printed to "rdtsc.txt". -// Both actions only happen when KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME. -INLINE void rdtscEndFrame(BucketManager* pBucketMgr) -{ - pBucketMgr->mCurrentFrame++; - - if (pBucketMgr->mCurrentFrame == KNOB_BUCKETS_START_FRAME && - KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME) - { - pBucketMgr->StartCapture(); - } - - if (pBucketMgr->mCurrentFrame == KNOB_BUCKETS_END_FRAME && - KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME) - { - pBucketMgr->StopCapture(); - pBucketMgr->PrintReport("rdtsc.txt"); - } -} diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h deleted file mode 100644 index 2e758f43753..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h +++ /dev/null @@ -1,95 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved.
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file ringbuffer.h - * - * @brief RingBuffer - * The RingBuffer class manages all aspects of the ring buffer including - * the head/tail indices, etc. - * - ******************************************************************************/ -#pragma once - -template <typename T> -class RingBuffer -{ -public: - RingBuffer() : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0) {} - - ~RingBuffer() { Destroy(); } - - void Init(uint32_t numEntries) - { - SWR_ASSERT(numEntries > 0); - SWR_ASSERT(((1ULL << 32) % numEntries) == 0, - "%d is not evenly divisible into 2 ^ 32.
Wrap errors will occur!", - numEntries); - mNumEntries = numEntries; - mpRingBuffer = (T*)AlignedMalloc(sizeof(T) * numEntries, 64); - SWR_ASSERT(mpRingBuffer != nullptr); - memset((void*)mpRingBuffer, 0, sizeof(T) * numEntries); - } - - void Destroy() - { - AlignedFree(mpRingBuffer); - mpRingBuffer = nullptr; - } - - T& operator[](const uint32_t index) - { - SWR_ASSERT(index < mNumEntries); - return mpRingBuffer[index]; - } - - INLINE void Enqueue() - { - mRingHead++; // There's only one producer. - // Assert to find wrap-around cases, NEVER ENABLE DURING CHECKIN!! - // SWR_REL_ASSERT(mRingHead); - } - - INLINE void Dequeue() - { - InterlockedIncrement(&mRingTail); // There are multiple consumers. - } - - INLINE bool IsEmpty() { return (GetHead() == GetTail()); } - - INLINE bool IsFull() - { - uint32_t numEnqueued = GetHead() - GetTail(); - SWR_ASSERT(numEnqueued <= mNumEntries); - - return (numEnqueued == mNumEntries); - } - - INLINE uint32_t GetTail() volatile { return mRingTail; } - INLINE uint32_t GetHead() volatile { return mRingHead; } - -protected: - T* mpRingBuffer; - uint32_t mNumEntries; - - OSALIGNLINE(volatile uint32_t) mRingHead; // Producer counter (advanced by Enqueue) - OSALIGNLINE(volatile uint32_t) mRingTail; // Consumer counter (advanced by Dequeue) -}; diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h deleted file mode 100644 index 66a23bd9b08..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ /dev/null @@ -1,1240 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file state.h - * - * @brief Definitions for API state. - * - ******************************************************************************/ -// Skipping clang-format due to parsing by simplistic python scripts -// clang-format off -#pragma once - -#include "common/formats.h" -#include "common/intrin.h" -#include "common/rdtsc_buckets.h" -#include <functional> -#include <algorithm> - -using gfxptr_t = unsigned long long; - -////////////////////////////////////////////////////////////////////////// -/// PRIMITIVE_TOPOLOGY. 
-////////////////////////////////////////////////////////////////////////// -enum PRIMITIVE_TOPOLOGY -{ - TOP_UNKNOWN = 0x0, - TOP_POINT_LIST = 0x1, - TOP_LINE_LIST = 0x2, - TOP_LINE_STRIP = 0x3, - TOP_TRIANGLE_LIST = 0x4, - TOP_TRIANGLE_STRIP = 0x5, - TOP_TRIANGLE_FAN = 0x6, - TOP_QUAD_LIST = 0x7, - TOP_QUAD_STRIP = 0x8, - TOP_LINE_LIST_ADJ = 0x9, - TOP_LISTSTRIP_ADJ = 0xA, - TOP_TRI_LIST_ADJ = 0xB, - TOP_TRI_STRIP_ADJ = 0xC, - TOP_TRI_STRIP_REVERSE = 0xD, - TOP_POLYGON = 0xE, - TOP_RECT_LIST = 0xF, - TOP_LINE_LOOP = 0x10, - TOP_POINT_LIST_BF = 0x11, - TOP_LINE_STRIP_CONT = 0x12, - TOP_LINE_STRIP_BF = 0x13, - TOP_LINE_STRIP_CONT_BF = 0x14, - TOP_TRIANGLE_FAN_NOSTIPPLE = 0x16, - TOP_TRIANGLE_DISC = 0x17, /// @todo What is this?? - - TOP_PATCHLIST_BASE = 0x1F, // Invalid topology, used to calculate num verts for a patchlist. - TOP_PATCHLIST_1 = 0x20, // List of 1-vertex patches - TOP_PATCHLIST_2 = 0x21, - TOP_PATCHLIST_3 = 0x22, - TOP_PATCHLIST_4 = 0x23, - TOP_PATCHLIST_5 = 0x24, - TOP_PATCHLIST_6 = 0x25, - TOP_PATCHLIST_7 = 0x26, - TOP_PATCHLIST_8 = 0x27, - TOP_PATCHLIST_9 = 0x28, - TOP_PATCHLIST_10 = 0x29, - TOP_PATCHLIST_11 = 0x2A, - TOP_PATCHLIST_12 = 0x2B, - TOP_PATCHLIST_13 = 0x2C, - TOP_PATCHLIST_14 = 0x2D, - TOP_PATCHLIST_15 = 0x2E, - TOP_PATCHLIST_16 = 0x2F, - TOP_PATCHLIST_17 = 0x30, - TOP_PATCHLIST_18 = 0x31, - TOP_PATCHLIST_19 = 0x32, - TOP_PATCHLIST_20 = 0x33, - TOP_PATCHLIST_21 = 0x34, - TOP_PATCHLIST_22 = 0x35, - TOP_PATCHLIST_23 = 0x36, - TOP_PATCHLIST_24 = 0x37, - TOP_PATCHLIST_25 = 0x38, - TOP_PATCHLIST_26 = 0x39, - TOP_PATCHLIST_27 = 0x3A, - TOP_PATCHLIST_28 = 0x3B, - TOP_PATCHLIST_29 = 0x3C, - TOP_PATCHLIST_30 = 0x3D, - TOP_PATCHLIST_31 = 0x3E, - TOP_PATCHLIST_32 = 0x3F, // List of 32-vertex patches -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_SHADER_TYPE -////////////////////////////////////////////////////////////////////////// -enum SWR_SHADER_TYPE -{ - SHADER_VERTEX, - SHADER_GEOMETRY, - 
SHADER_DOMAIN, - SHADER_HULL, - SHADER_PIXEL, - SHADER_COMPUTE, - - NUM_SHADER_TYPES, -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_RENDERTARGET_ATTACHMENT -/// @todo It's not clear what an "attachment" means. It's not a common term. -////////////////////////////////////////////////////////////////////////// -enum SWR_RENDERTARGET_ATTACHMENT -{ - SWR_ATTACHMENT_COLOR0, - SWR_ATTACHMENT_COLOR1, - SWR_ATTACHMENT_COLOR2, - SWR_ATTACHMENT_COLOR3, - SWR_ATTACHMENT_COLOR4, - SWR_ATTACHMENT_COLOR5, - SWR_ATTACHMENT_COLOR6, - SWR_ATTACHMENT_COLOR7, - SWR_ATTACHMENT_DEPTH, - SWR_ATTACHMENT_STENCIL, - - SWR_NUM_ATTACHMENTS -}; - -#define SWR_NUM_RENDERTARGETS 8 - -#define SWR_ATTACHMENT_COLOR0_BIT 0x001 -#define SWR_ATTACHMENT_COLOR1_BIT 0x002 -#define SWR_ATTACHMENT_COLOR2_BIT 0x004 -#define SWR_ATTACHMENT_COLOR3_BIT 0x008 -#define SWR_ATTACHMENT_COLOR4_BIT 0x010 -#define SWR_ATTACHMENT_COLOR5_BIT 0x020 -#define SWR_ATTACHMENT_COLOR6_BIT 0x040 -#define SWR_ATTACHMENT_COLOR7_BIT 0x080 -#define SWR_ATTACHMENT_DEPTH_BIT 0x100 -#define SWR_ATTACHMENT_STENCIL_BIT 0x200 -#define SWR_ATTACHMENT_MASK_ALL 0x3ff -#define SWR_ATTACHMENT_MASK_COLOR 0x0ff - - -////////////////////////////////////////////////////////////////////////// -/// @brief SWR Inner Tessellation factor ID -/// See above GetTessFactorOutputPosition code for documentation -enum SWR_INNER_TESSFACTOR_ID -{ - SWR_QUAD_U_TRI_INSIDE, - SWR_QUAD_V_INSIDE, - - SWR_NUM_INNER_TESS_FACTORS, -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief SWR Outer Tessellation factor ID -/// See above GetTessFactorOutputPosition code for documentation -enum SWR_OUTER_TESSFACTOR_ID -{ - SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL, - SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY, - SWR_QUAD_V_EQ0_TRI_W, - SWR_QUAD_V_EQ1, - - SWR_NUM_OUTER_TESS_FACTORS, -}; - -///////////////////////////////////////////////////////////////////////// -/// simdvertex -/// @brief Defines a vertex
element that holds all the data for SIMD vertices. -/// Contains space for position, SGV, and 32 generic attributes -///////////////////////////////////////////////////////////////////////// -enum SWR_VTX_SLOTS -{ - VERTEX_SGV_SLOT = 0, - VERTEX_SGV_RTAI_COMP = 0, - VERTEX_SGV_VAI_COMP = 1, - VERTEX_SGV_POINT_SIZE_COMP = 2, - VERTEX_POSITION_SLOT = 1, - VERTEX_POSITION_END_SLOT = 1, - VERTEX_CLIPCULL_DIST_LO_SLOT = (1 + VERTEX_POSITION_END_SLOT), // VS writes lower 4 clip/cull dist - VERTEX_CLIPCULL_DIST_HI_SLOT = (2 + VERTEX_POSITION_END_SLOT), // VS writes upper 4 clip/cull dist - VERTEX_ATTRIB_START_SLOT = (3 + VERTEX_POSITION_END_SLOT), - VERTEX_ATTRIB_END_SLOT = (34 + VERTEX_POSITION_END_SLOT), - SWR_VTX_NUM_SLOTS = (1 + VERTEX_ATTRIB_END_SLOT) -}; - -// SoAoSoA -struct simdvertex -{ - simdvector attrib[SWR_VTX_NUM_SLOTS]; -}; - -struct simd16vertex -{ - simd16vector attrib[SWR_VTX_NUM_SLOTS]; -}; - -template <typename SIMD_T> -struct SIMDVERTEX_T -{ - typename SIMD_T::Vec4 attrib[SWR_VTX_NUM_SLOTS]; -}; - -struct SWR_WORKER_DATA -{ - HANDLE hArContext; // handle to the archrast context -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_SHADER_STATS -/// @brief Structure passed to shader for stats collection. -///////////////////////////////////////////////////////////////////////// -struct SWR_SHADER_STATS -{ - uint32_t numInstExecuted; // This is roughly the API instructions executed and not x86. 
- uint32_t numSampleExecuted; - uint32_t numSampleLExecuted; - uint32_t numSampleBExecuted; - uint32_t numSampleCExecuted; - uint32_t numSampleCLZExecuted; - uint32_t numSampleCDExecuted; - uint32_t numGather4Executed; - uint32_t numGather4CExecuted; - uint32_t numGather4CPOExecuted; - uint32_t numGather4CPOCExecuted; - uint32_t numLodExecuted; -}; - - -////////////////////////////////////////////////////////////////////////// -/// SWR_VS_CONTEXT -/// @brief Input to vertex shader -///////////////////////////////////////////////////////////////////////// -struct SWR_VS_CONTEXT -{ - simdvertex* pVin; // IN: SIMD input vertex data store - simdvertex* pVout; // OUT: SIMD output vertex data store - - uint32_t InstanceID; // IN: Instance ID, constant across all verts of the SIMD - simdscalari VertexID; // IN: Vertex ID - simdscalari mask; // IN: Active mask for shader - - // SIMD16 Frontend fields. - uint32_t AlternateOffset; // IN: amount to offset for interleaving even/odd simd8 in - // simd16vertex output - simd16scalari mask16; // IN: Active mask for shader (16-wide) - simd16scalari VertexID16; // IN: Vertex ID (16-wide) - - SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. 
-}; - -///////////////////////////////////////////////////////////////////////// -/// ScalarCPoint -/// @brief defines a control point element as passed from the output -/// of the hull shader to the input of the domain shader -///////////////////////////////////////////////////////////////////////// -struct ScalarAttrib -{ - float x; - float y; - float z; - float w; -}; - -struct ScalarCPoint -{ - ScalarAttrib attrib[SWR_VTX_NUM_SLOTS]; -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_TESSELLATION_FACTORS -/// @brief Tessellation factors structure (non-vector) -///////////////////////////////////////////////////////////////////////// -struct SWR_TESSELLATION_FACTORS -{ - float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS]; - float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS]; - float pad[2]; -}; - -SWR_STATIC_ASSERT(sizeof(SWR_TESSELLATION_FACTORS) == 32); - -#define MAX_NUM_VERTS_PER_PRIM 32 // support up to 32 control point patches -struct ScalarPatch -{ - SWR_TESSELLATION_FACTORS tessFactors; - ScalarCPoint cp[MAX_NUM_VERTS_PER_PRIM]; - ScalarCPoint patchData; -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_HS_CONTEXT -/// @brief Input to hull shader -///////////////////////////////////////////////////////////////////////// -struct SWR_HS_CONTEXT -{ - simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data - simdscalari PrimitiveID; // IN: (SIMD) primitive ID generated from the draw call - simdscalari mask; // IN: Active mask for shader - uint32_t outputSize; // IN: Size of HS output (per lane) - ScalarPatch* pCPout; // OUT: Output control point patch SIMD-sized-array of SCALAR patches - SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. 
-}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_DS_CONTEXT -/// @brief Input to domain shader -///////////////////////////////////////////////////////////////////////// -struct SWR_DS_CONTEXT -{ - uint32_t PrimitiveID; // IN: (SCALAR) PrimitiveID for the patch associated with the DS invocation - uint32_t vectorOffset; // IN: (SCALAR) vector index offset into SIMD data. - uint32_t vectorStride; // IN: (SCALAR) stride (in vectors) of output data per attribute-component - uint32_t outVertexAttribOffset; // IN: (SCALAR) Offset to the attributes as processed by the next shader stage. - ScalarPatch* pCpIn; // IN: (SCALAR) Control patch - simdscalar* pDomainU; // IN: (SIMD) Domain Point U coords - simdscalar* pDomainV; // IN: (SIMD) Domain Point V coords - simdscalari mask; // IN: Active mask for shader - simdscalar* pOutputData; // OUT: (SIMD) Vertex Attributes (2D array of vectors, one row per attribute-component) - SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_GS_CONTEXT -/// @brief Input to geometry shader. -///////////////////////////////////////////////////////////////////////// -struct SWR_GS_CONTEXT -{ - simdvector* pVerts; // IN: input primitive data for SIMD prims - uint32_t inputVertStride; // IN: input vertex stride, in attributes - simdscalari PrimitiveID; // IN: input primitive ID generated from the draw call - uint32_t InstanceID; // IN: input instance ID - simdscalari mask; // IN: Active mask for shader - uint8_t* pStreams[KNOB_SIMD_WIDTH]; // OUT: output stream (contains vertices for all output streams) - SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. 
-}; - -struct PixelPositions -{ - simdscalar UL; - simdscalar center; - simdscalar sample; - simdscalar centroid; -}; - -#define SWR_MAX_NUM_MULTISAMPLES 16 - -////////////////////////////////////////////////////////////////////////// -/// SWR_PS_CONTEXT -/// @brief Input to pixel shader. -///////////////////////////////////////////////////////////////////////// -struct SWR_PS_CONTEXT -{ - PixelPositions vX; // IN: x location(s) of pixels - PixelPositions vY; // IN: y location(s) of pixels - simdscalar vZ; // INOUT: z location of pixels - simdscalari activeMask; // OUT: mask for kill - simdscalar inputMask; // IN: input coverage mask for all samples - simdscalari oMask; // OUT: mask for output coverage - - PixelPositions vI; // barycentric coords evaluated at pixel center, sample position, centroid - PixelPositions vJ; - PixelPositions vOneOverW; // IN: 1/w - - const float* pAttribs; // IN: pointer to attribute barycentric coefficients - const float* pPerspAttribs; // IN: pointer to attribute/w barycentric coefficients - const float* pRecipW; // IN: pointer to 1/w coord for each vertex - const float* I; // IN: Barycentric A, B, and C coefs used to compute I - const float* J; // IN: Barycentric A, B, and C coefs used to compute J - float recipDet; // IN: 1/Det, used when barycentric interpolating attributes - const float* pSamplePosX; // IN: array of sample positions - const float* pSamplePosY; // IN: array of sample positions - simdvector shaded[SWR_NUM_RENDERTARGETS]; // OUT: result color per rendertarget - - uint32_t frontFace; // IN: front- 1, back- 0 - uint32_t sampleIndex; // IN: sampleIndex - uint32_t renderTargetArrayIndex; // IN: render target array index from GS - uint32_t viewportIndex; // IN: viewport index from GS - uint32_t rasterizerSampleCount; // IN: sample count used by the rasterizer - - uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS]; // IN: Pointers to render target hottiles - - SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast.
- - BucketManager *pBucketManager; // @llvm_struct - IN: performance buckets. -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_CS_CONTEXT -/// @brief Input to compute shader. -///////////////////////////////////////////////////////////////////////// -struct SWR_CS_CONTEXT -{ - // The ThreadGroupId is the current thread group index relative - // to all thread groups in the Dispatch call. The ThreadId, ThreadIdInGroup, - // and ThreadIdInGroupFlattened can be derived from ThreadGroupId in the shader. - - // Compute shader accepts the following system values. - // o ThreadId - Current thread id relative to all other threads in dispatch. - // o ThreadGroupId - Current thread group id relative to all other groups in dispatch. - // o ThreadIdInGroup - Current thread relative to all threads in the current thread group. - // o ThreadIdInGroupFlattened - Flattened linear id derived from ThreadIdInGroup. - // - // All of these system values can be computed in the shader. They will be - // derived from the current tile counter. The tile counter is an atomic counter that - // resides in the draw context and is initialized to the product of the dispatch dims. - // - // tileCounter = dispatchDims.x * dispatchDims.y * dispatchDims.z - // - // Each CPU worker thread will atomically decrement this counter and passes the current - // count into the shader. When the count reaches 0 then all thread groups in the - // dispatch call have been completed. - - uint32_t tileCounter; // The tile counter value for this thread group. - - // Dispatch dimensions used by shader to compute system values from the tile counter. - uint32_t dispatchDims[3]; - - uint8_t* pTGSM; // Thread Group Shared Memory pointer. 
- uint8_t* pSpillFillBuffer; // Spill/fill buffer for barrier support - uint8_t* pScratchSpace; // Pointer to scratch space buffer used by the shader, shader is - // responsible for subdividing scratch space per instance/simd - uint32_t scratchSpacePerWarp; // Scratch space per work item x SIMD_WIDTH - - SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. -}; - -// enums -enum SWR_TILE_MODE -{ - SWR_TILE_NONE = 0x0, // Linear mode (no tiling) - SWR_TILE_MODE_WMAJOR, // W major tiling - SWR_TILE_MODE_XMAJOR, // X major tiling - SWR_TILE_MODE_YMAJOR, // Y major tiling - SWR_TILE_SWRZ, // SWR-Z tiling - - - SWR_TILE_MODE_COUNT -}; - -enum SWR_SURFACE_TYPE -{ - SURFACE_1D = 0, - SURFACE_2D = 1, - SURFACE_3D = 2, - SURFACE_CUBE = 3, - SURFACE_BUFFER = 4, - SURFACE_STRUCTURED_BUFFER = 5, - SURFACE_NULL = 7 -}; - -enum SWR_ZFUNCTION -{ - ZFUNC_ALWAYS, - ZFUNC_NEVER, - ZFUNC_LT, - ZFUNC_EQ, - ZFUNC_LE, - ZFUNC_GT, - ZFUNC_NE, - ZFUNC_GE, - NUM_ZFUNC -}; - -enum SWR_STENCILOP -{ - STENCILOP_KEEP, - STENCILOP_ZERO, - STENCILOP_REPLACE, - STENCILOP_INCRSAT, - STENCILOP_DECRSAT, - STENCILOP_INCR, - STENCILOP_DECR, - STENCILOP_INVERT -}; - -enum SWR_BLEND_FACTOR -{ - BLENDFACTOR_ONE, - BLENDFACTOR_SRC_COLOR, - BLENDFACTOR_SRC_ALPHA, - BLENDFACTOR_DST_ALPHA, - BLENDFACTOR_DST_COLOR, - BLENDFACTOR_SRC_ALPHA_SATURATE, - BLENDFACTOR_CONST_COLOR, - BLENDFACTOR_CONST_ALPHA, - BLENDFACTOR_SRC1_COLOR, - BLENDFACTOR_SRC1_ALPHA, - BLENDFACTOR_ZERO, - BLENDFACTOR_INV_SRC_COLOR, - BLENDFACTOR_INV_SRC_ALPHA, - BLENDFACTOR_INV_DST_ALPHA, - BLENDFACTOR_INV_DST_COLOR, - BLENDFACTOR_INV_CONST_COLOR, - BLENDFACTOR_INV_CONST_ALPHA, - BLENDFACTOR_INV_SRC1_COLOR, - BLENDFACTOR_INV_SRC1_ALPHA -}; - -enum SWR_BLEND_OP -{ - BLENDOP_ADD, - BLENDOP_SUBTRACT, - BLENDOP_REVSUBTRACT, - BLENDOP_MIN, - BLENDOP_MAX, -}; - -enum SWR_LOGIC_OP -{ - LOGICOP_CLEAR, - LOGICOP_NOR, - LOGICOP_AND_INVERTED, - LOGICOP_COPY_INVERTED, - LOGICOP_AND_REVERSE, - LOGICOP_INVERT, - LOGICOP_XOR, - 
LOGICOP_NAND, - LOGICOP_AND, - LOGICOP_EQUIV, - LOGICOP_NOOP, - LOGICOP_OR_INVERTED, - LOGICOP_COPY, - LOGICOP_OR_REVERSE, - LOGICOP_OR, - LOGICOP_SET, -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_AUX_MODE -/// @brief Specifies how the auxiliary buffer is used by the driver. -////////////////////////////////////////////////////////////////////////// -enum SWR_AUX_MODE -{ - AUX_MODE_NONE, - AUX_MODE_COLOR, - AUX_MODE_UAV, - AUX_MODE_DEPTH, -}; - -// vertex fetch state -// WARNING- any changes to this struct need to be reflected -// in the fetch shader jit -struct SWR_VERTEX_BUFFER_STATE -{ - gfxptr_t xpData; - uint32_t index; - uint32_t pitch; - uint32_t size; - uint32_t minVertex; // min vertex (for bounds checking) - uint32_t maxVertex; // size / pitch. precalculated value used by fetch shader for OOB checks - uint32_t partialInboundsSize; // size % pitch. precalculated value used by fetch shader for - // partially OOB vertices -}; - -struct SWR_INDEX_BUFFER_STATE -{ - gfxptr_t xpIndices; - // Format type for indices (e.g. UINT16, UINT32, etc.) - SWR_FORMAT format; // @llvm_enum - uint32_t size; -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_FETCH_CONTEXT -/// @brief Input to fetch shader. -/// @note WARNING - Changes to this struct need to be reflected in the -/// fetch shader jit. 
-///////////////////////////////////////////////////////////////////////// -struct SWR_FETCH_CONTEXT -{ - const SWR_VERTEX_BUFFER_STATE* pStreams; // IN: array of bound vertex buffers - gfxptr_t xpIndices; // IN: pointer to int32 index buffer for indexed draws - gfxptr_t xpLastIndex; // IN: pointer to end of index buffer, used for bounds checking - uint32_t CurInstance; // IN: current instance - uint32_t BaseVertex; // IN: base vertex - uint32_t StartVertex; // IN: start vertex - uint32_t StartInstance; // IN: start instance - simdscalari VertexID; // OUT: vector of vertex IDs - simdscalari CutMask; // OUT: vector mask of indices which have the cut index value -#if USE_SIMD16_SHADERS - // simd16scalari VertexID; // OUT: vector of vertex IDs - // simd16scalari CutMask; // OUT: vector mask of indices which have the - // cut index value - simdscalari VertexID2; // OUT: vector of vertex IDs - simdscalari CutMask2; // OUT: vector mask of indices which have the cut index value -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_STATS -/// -/// @brief All statistics generated by SWR go here. These are public -/// to driver. -///////////////////////////////////////////////////////////////////////// -OSALIGNLINE(struct) SWR_STATS -{ - // Occlusion Query - uint64_t DepthPassCount; // Number of passing depth tests. Not exact. - - // Pipeline Stats - uint64_t PsInvocations; // Number of Pixel Shader invocations - uint64_t CsInvocations; // Number of Compute Shader invocations - -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_STATS -/// -/// @brief All statistics generated by FE. -///////////////////////////////////////////////////////////////////////// -OSALIGNLINE(struct) SWR_STATS_FE -{ - uint64_t IaVertices; // Number of Fetch Shader vertices - uint64_t IaPrimitives; // Number of PA primitives. 
- uint64_t VsInvocations; // Number of Vertex Shader invocations - uint64_t HsInvocations; // Number of Hull Shader invocations - uint64_t DsInvocations; // Number of Domain Shader invocations - uint64_t GsInvocations; // Number of Geometry Shader invocations - uint64_t GsPrimitives; // Number of prims GS outputs. - uint64_t CInvocations; // Number of clipper invocations - uint64_t CPrimitives; // Number of clipper primitives. - - // Streamout Stats - uint64_t SoPrimStorageNeeded[4]; - uint64_t SoNumPrimsWritten[4]; -}; - - ////////////////////////////////////////////////////////////////////////// - /// STREAMOUT_BUFFERS - ///////////////////////////////////////////////////////////////////////// - -#define MAX_SO_STREAMS 4 -#define MAX_SO_BUFFERS 4 -#define MAX_ATTRIBUTES 32 - -struct SWR_STREAMOUT_BUFFER -{ - // Pointers to streamout buffers. - gfxptr_t pBuffer; - - // Offset to the SO write offset. If not null then we update offset here. - gfxptr_t pWriteOffset; - - bool enable; - bool soWriteEnable; - - // Size of buffer in dwords. - uint32_t bufferSize; - - // Vertex pitch of buffer in dwords. - uint32_t pitch; - - // Offset into buffer in dwords. SOS will increment this offset. - uint32_t streamOffset; -}; - -////////////////////////////////////////////////////////////////////////// -/// STREAMOUT_STATE -///////////////////////////////////////////////////////////////////////// -struct SWR_STREAMOUT_STATE -{ - // This disables stream output. - bool soEnable; - - // which streams are enabled for streamout - bool streamEnable[MAX_SO_STREAMS]; - - // If set then do not send any streams to the rasterizer. - bool rasterizerDisable; - - // Specifies which stream to send to the rasterizer. - uint32_t streamToRasterizer; - - // The stream masks specify which attributes are sent to which streams. - // These masks help the FE to setup the pPrimData buffer that is passed - // the Stream Output Shader (SOS) function. 
- uint64_t streamMasks[MAX_SO_STREAMS]; - - // Number of attributes, including position, per vertex that are streamed out. - // This should match number of bits in stream mask. - uint32_t streamNumEntries[MAX_SO_STREAMS]; - - // Offset to the start of the attributes of the input vertices, in simdvector units - uint32_t vertexAttribOffset[MAX_SO_STREAMS]; -}; - -////////////////////////////////////////////////////////////////////////// -/// STREAMOUT_CONTEXT - Passed to SOS -///////////////////////////////////////////////////////////////////////// -struct SWR_STREAMOUT_CONTEXT -{ - uint32_t* pPrimData; - SWR_STREAMOUT_BUFFER* pBuffer[MAX_SO_STREAMS]; - - // Num prims written for this stream - uint32_t numPrimsWritten; - - // Num prims that should have been written if there were no overflow. - uint32_t numPrimStorageNeeded; -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_GS_STATE - Geometry shader state -///////////////////////////////////////////////////////////////////////// -struct SWR_GS_STATE -{ - bool gsEnable; - - // If true, geometry shader emits a single stream, with separate cut buffer. - // If false, geometry shader emits vertices for multiple streams to the stream buffer, with a - // separate StreamID buffer to map vertices to streams - bool isSingleStream; - - // Number of input attributes per vertex. Used by the frontend to - // optimize assembling primitives for GS - uint32_t numInputAttribs; - - // Stride of incoming verts in attributes - uint32_t inputVertStride; - - // Output topology - can be point, tristrip, linestrip, or rectlist - PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum - - // Maximum number of verts that can be emitted by a single instance of the GS - uint32_t maxNumVerts; - - // Instance count - uint32_t instanceCount; - - // When single stream is enabled, singleStreamID dictates which stream is being output. 
- // field ignored if isSingleStream is false - uint32_t singleStreamID; - - // Total amount of memory to allocate for one instance of the shader output in bytes - uint32_t allocationSize; - - // Offset to start reading data per input vertex in simdvector units. This can be used to - // skip over any vertex data output from the previous stage that is unused in the GS, removing - // unnecessary vertex processing. - uint32_t vertexAttribOffset; - - // Size of the control data section which contains cut or streamID data, in simdscalar units. - // Should be sized to handle the maximum number of verts output by the GS. Can be 0 if there are - // no cuts or streamID bits. - uint32_t controlDataSize; - - // Offset to the control data section, in bytes - uint32_t controlDataOffset; - - // Total size of an output vertex, in simdvector units - uint32_t outputVertexSize; - - // Offset to the start of the vertex section, in bytes - uint32_t outputVertexOffset; - - // Set this to non-zero to indicate that the shader outputs a static number of verts. If zero, - // shader is expected to store the final vertex count in the first dword of the gs output - // stream. 
- uint32_t staticVertexCount; -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_TS_OUTPUT_TOPOLOGY - Defines data output by the tessellator / DS -///////////////////////////////////////////////////////////////////////// -enum SWR_TS_OUTPUT_TOPOLOGY -{ - SWR_TS_OUTPUT_POINT, - SWR_TS_OUTPUT_LINE, - SWR_TS_OUTPUT_TRI_CW, - SWR_TS_OUTPUT_TRI_CCW, - - SWR_TS_OUTPUT_TOPOLOGY_COUNT -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_TS_PARTITIONING - Defines tessellation algorithm -///////////////////////////////////////////////////////////////////////// -enum SWR_TS_PARTITIONING -{ - SWR_TS_INTEGER, - SWR_TS_ODD_FRACTIONAL, - SWR_TS_EVEN_FRACTIONAL, - - SWR_TS_PARTITIONING_COUNT -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_TS_DOMAIN - Defines Tessellation Domain -///////////////////////////////////////////////////////////////////////// -enum SWR_TS_DOMAIN -{ - SWR_TS_QUAD, - SWR_TS_TRI, - SWR_TS_ISOLINE, - - SWR_TS_DOMAIN_COUNT -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_TS_STATE - Tessellation state -///////////////////////////////////////////////////////////////////////// -struct SWR_TS_STATE -{ - bool tsEnable; - - SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology; // @llvm_enum - SWR_TS_PARTITIONING partitioning; // @llvm_enum - SWR_TS_DOMAIN domain; // @llvm_enum - - PRIMITIVE_TOPOLOGY postDSTopology; // @llvm_enum - - uint32_t numHsInputAttribs; - uint32_t numHsOutputAttribs; - uint32_t hsAllocationSize; // Size of HS output in bytes, per lane - - uint32_t numDsOutputAttribs; - uint32_t dsAllocationSize; - uint32_t dsOutVtxAttribOffset; - - // Offset to the start of the attributes of the input vertices, in simdvector units - uint32_t srcVertexAttribOffset; - - // Offset to the start of the attributes expected by the hull shader - uint32_t vertexAttribOffset; -}; - -// output merger state -struct 
SWR_RENDER_TARGET_BLEND_STATE -{ - uint8_t writeDisableRed : 1; - uint8_t writeDisableGreen : 1; - uint8_t writeDisableBlue : 1; - uint8_t writeDisableAlpha : 1; -}; -static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, - "Invalid SWR_RENDER_TARGET_BLEND_STATE size"); - -enum SWR_MULTISAMPLE_COUNT -{ - SWR_MULTISAMPLE_1X = 0, - SWR_MULTISAMPLE_2X, - SWR_MULTISAMPLE_4X, - SWR_MULTISAMPLE_8X, - SWR_MULTISAMPLE_16X, - SWR_MULTISAMPLE_TYPE_COUNT -}; - -static INLINE uint32_t GetNumSamples(/* SWR_SAMPLE_COUNT */ int sampleCountEnum) // @llvm_func_start -{ - return uint32_t(1) << sampleCountEnum; -} // @llvm_func_end - -struct SWR_BLEND_STATE -{ - // constant blend factor color in RGBA float - float constantColor[4]; - - // alpha test reference value in unorm8 or float32 - uint32_t alphaTestReference; - uint32_t sampleMask; - // all RT's have the same sample count - ///@todo move this to Output Merger state when we refactor - SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum - - SWR_RENDER_TARGET_BLEND_STATE renderTarget[SWR_NUM_RENDERTARGETS]; -}; -static_assert(sizeof(SWR_BLEND_STATE) == 36, "Invalid SWR_BLEND_STATE size"); - -struct SWR_BLEND_CONTEXT -{ - const SWR_BLEND_STATE* pBlendState; - simdvector* src; - simdvector* src1; - simdvector* src0alpha; - uint32_t sampleNum; - simdvector* pDst; - simdvector* result; - simdscalari* oMask; - simdscalari* pMask; - uint32_t isAlphaTested; - uint32_t isAlphaBlended; -}; - -////////////////////////////////////////////////////////////////////////// -/// FUNCTION POINTERS FOR SHADERS - -#if USE_SIMD16_SHADERS -typedef void(__cdecl *PFN_FETCH_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_FETCH_CONTEXT& fetchInfo, simd16vertex& out); -#else -typedef void(__cdecl *PFN_FETCH_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out); -#endif -typedef void(__cdecl *PFN_VERTEX_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_VS_CONTEXT* pVsContext); -typedef 
void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_HS_CONTEXT* pHsContext); -typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_DS_CONTEXT* pDsContext); -typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_GS_CONTEXT* pGsContext); -typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_CS_CONTEXT* pCsContext); -typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_STREAMOUT_CONTEXT& soContext); -typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext); -typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext); -typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(SWR_BLEND_CONTEXT*); -typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar const &); - - -////////////////////////////////////////////////////////////////////////// -/// FRONTEND_STATE -///////////////////////////////////////////////////////////////////////// -struct SWR_FRONTEND_STATE -{ - // skip clip test, perspective divide, and viewport transform - // intended for verts in screen space - bool vpTransformDisable; - bool bEnableCutIndex; - union - { - struct - { - uint32_t triFan : 2; - uint32_t lineStripList : 1; - uint32_t triStripList : 2; - }; - uint32_t bits; - } provokingVertex; - uint32_t topologyProvokingVertex; // provoking vertex for the draw topology - - // Size of a vertex in simdvector units. Should be sized to the - // maximum of the input/output of the vertex shader. 
- uint32_t vsVertexSize; -}; - -////////////////////////////////////////////////////////////////////////// -/// VIEWPORT_MATRIX -///////////////////////////////////////////////////////////////////////// -struct SWR_VIEWPORT_MATRIX -{ - float m00; - float m11; - float m22; - float m30; - float m31; - float m32; -}; - -////////////////////////////////////////////////////////////////////////// -/// VIEWPORT_MATRIXES -///////////////////////////////////////////////////////////////////////// -struct SWR_VIEWPORT_MATRICES -{ - float m00[KNOB_NUM_VIEWPORTS_SCISSORS]; - float m11[KNOB_NUM_VIEWPORTS_SCISSORS]; - float m22[KNOB_NUM_VIEWPORTS_SCISSORS]; - float m30[KNOB_NUM_VIEWPORTS_SCISSORS]; - float m31[KNOB_NUM_VIEWPORTS_SCISSORS]; - float m32[KNOB_NUM_VIEWPORTS_SCISSORS]; -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_VIEWPORT -///////////////////////////////////////////////////////////////////////// -struct SWR_VIEWPORT -{ - float x; - float y; - float width; - float height; - float minZ; - float maxZ; -}; - -////////////////////////////////////////////////////////////////////////// -/// SWR_CULLMODE -////////////////////////////////////////////////////////////////////////// -enum SWR_CULLMODE -{ - SWR_CULLMODE_BOTH, - SWR_CULLMODE_NONE, - SWR_CULLMODE_FRONT, - SWR_CULLMODE_BACK -}; - -enum SWR_FILLMODE -{ - SWR_FILLMODE_POINT, - SWR_FILLMODE_WIREFRAME, - SWR_FILLMODE_SOLID -}; - -enum SWR_FRONTWINDING -{ - SWR_FRONTWINDING_CW, - SWR_FRONTWINDING_CCW -}; - - -enum SWR_PIXEL_LOCATION -{ - SWR_PIXEL_LOCATION_CENTER, - SWR_PIXEL_LOCATION_UL, -}; - -// fixed point screen space sample locations within a pixel -struct SWR_MULTISAMPLE_POS -{ -public: - INLINE void SetXi(uint32_t sampleNum, uint32_t val) { _xi[sampleNum] = val; }; // @llvm_func - INLINE void SetYi(uint32_t sampleNum, uint32_t val) { _yi[sampleNum] = val; }; // @llvm_func - INLINE uint32_t Xi(uint32_t sampleNum) const { return _xi[sampleNum]; }; // @llvm_func - INLINE 
uint32_t Yi(uint32_t sampleNum) const { return _yi[sampleNum]; }; // @llvm_func - INLINE void SetX(uint32_t sampleNum, float val) { _x[sampleNum] = val; }; // @llvm_func - INLINE void SetY(uint32_t sampleNum, float val) { _y[sampleNum] = val; }; // @llvm_func - INLINE float X(uint32_t sampleNum) const { return _x[sampleNum]; }; // @llvm_func - INLINE float Y(uint32_t sampleNum) const { return _y[sampleNum]; }; // @llvm_func - typedef const float (&sampleArrayT)[SWR_MAX_NUM_MULTISAMPLES]; //@llvm_typedef - INLINE sampleArrayT X() const { return _x; }; // @llvm_func - INLINE sampleArrayT Y() const { return _y; }; // @llvm_func - INLINE const __m128i& vXi(uint32_t sampleNum) const { return _vXi[sampleNum]; }; // @llvm_func - INLINE const __m128i& vYi(uint32_t sampleNum) const { return _vYi[sampleNum]; }; // @llvm_func - INLINE const simdscalar& vX(uint32_t sampleNum) const { return _vX[sampleNum]; }; // @llvm_func - INLINE const simdscalar& vY(uint32_t sampleNum) const { return _vY[sampleNum]; }; // @llvm_func - INLINE const __m128i& TileSampleOffsetsX() const { return tileSampleOffsetsX; }; // @llvm_func - INLINE const __m128i& TileSampleOffsetsY() const { return tileSampleOffsetsY; }; // @llvm_func - - INLINE void PrecalcSampleData(int numSamples); //@llvm_func - -private: - template <typename MaskT> - INLINE __m128i expandThenBlend4(uint32_t* min, uint32_t* max); // @llvm_func - INLINE void CalcTileSampleOffsets(int numSamples); // @llvm_func - - // scalar sample values - uint32_t _xi[SWR_MAX_NUM_MULTISAMPLES]; - uint32_t _yi[SWR_MAX_NUM_MULTISAMPLES]; - float _x[SWR_MAX_NUM_MULTISAMPLES]; - float _y[SWR_MAX_NUM_MULTISAMPLES]; - - // precalc'd / vectorized samples - __m128i _vXi[SWR_MAX_NUM_MULTISAMPLES]; - __m128i _vYi[SWR_MAX_NUM_MULTISAMPLES]; - simdscalar _vX[SWR_MAX_NUM_MULTISAMPLES]; - simdscalar _vY[SWR_MAX_NUM_MULTISAMPLES]; - __m128i tileSampleOffsetsX; - __m128i tileSampleOffsetsY; -}; - 
-////////////////////////////////////////////////////////////////////////// -/// SWR_RASTSTATE -////////////////////////////////////////////////////////////////////////// -struct SWR_RASTSTATE -{ - uint32_t cullMode : 2; - uint32_t fillMode : 2; - uint32_t frontWinding : 1; - uint32_t scissorEnable : 1; - uint32_t depthClipEnable : 1; - uint32_t clipEnable : 1; - uint32_t clipHalfZ : 1; - uint32_t pointParam : 1; - uint32_t pointSpriteEnable : 1; - uint32_t pointSpriteTopOrigin : 1; - uint32_t forcedSampleCount : 1; - uint32_t pixelOffset : 1; - uint32_t depthBiasPreAdjusted : 1; ///< depth bias constant is in float units, not per-format Z units - uint32_t conservativeRast : 1; - - float pointSize; - float lineWidth; - - float depthBias; - float slopeScaledDepthBias; - float depthBiasClamp; - SWR_FORMAT depthFormat; // @llvm_enum - - // sample count the rasterizer is running at - SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum - uint32_t pixelLocation; // UL or Center - SWR_MULTISAMPLE_POS samplePositions; // @llvm_struct - bool bIsCenterPattern; // @llvm_enum -}; - - -enum SWR_CONSTANT_SOURCE -{ - SWR_CONSTANT_SOURCE_CONST_0000, - SWR_CONSTANT_SOURCE_CONST_0001_FLOAT, - SWR_CONSTANT_SOURCE_CONST_1111_FLOAT, - SWR_CONSTANT_SOURCE_PRIM_ID -}; - -struct SWR_ATTRIB_SWIZZLE -{ - uint16_t sourceAttrib : 5; // source attribute - uint16_t constantSource : 2; // constant source to apply - uint16_t componentOverrideMask : 4; // override component with constant source -}; - -// backend state -struct SWR_BACKEND_STATE -{ - uint32_t constantInterpolationMask; // bitmask indicating which attributes have constant - // interpolation - uint32_t pointSpriteTexCoordMask; // bitmask indicating the attribute(s) which should be - // interpreted as tex coordinates - - bool swizzleEnable; // when enabled, core will parse the swizzle map when - // setting up attributes for the backend, otherwise - // all attributes up to numAttributes will be sent - uint8_t numAttributes; // total number 
of attributes to send to backend (up to 32) - uint8_t numComponents[32]; // number of components to setup per attribute, this reduces some - // calculations for unneeded components - - bool readRenderTargetArrayIndex; // Forward render target array index from last FE stage to the - // backend - bool readViewportArrayIndex; // Read viewport array index from last FE stage during binning - - // User clip/cull distance enables - uint8_t cullDistanceMask; - uint8_t clipDistanceMask; - - // padding to ensure swizzleMap starts 64B offset from start of the struct - // and that the next fields are dword aligned. - uint8_t pad[10]; - - // Offset to the start of the attributes of the input vertices, in simdvector units - uint32_t vertexAttribOffset; - - // Offset to clip/cull attrib section of the vertex, in simdvector units - uint32_t vertexClipCullOffset; - - SWR_ATTRIB_SWIZZLE swizzleMap[32]; -}; -static_assert(sizeof(SWR_BACKEND_STATE) == 128, - "Adjust padding to keep size (or remove this assert)"); - - -union SWR_DEPTH_STENCIL_STATE -{ - struct - { - // dword 0 - uint32_t depthWriteEnable : 1; - uint32_t depthTestEnable : 1; - uint32_t stencilWriteEnable : 1; - uint32_t stencilTestEnable : 1; - uint32_t doubleSidedStencilTestEnable : 1; - - uint32_t depthTestFunc : 3; - uint32_t stencilTestFunc : 3; - - uint32_t backfaceStencilPassDepthPassOp : 3; - uint32_t backfaceStencilPassDepthFailOp : 3; - uint32_t backfaceStencilFailOp : 3; - uint32_t backfaceStencilTestFunc : 3; - uint32_t stencilPassDepthPassOp : 3; - uint32_t stencilPassDepthFailOp : 3; - uint32_t stencilFailOp : 3; - - // dword 1 - uint8_t backfaceStencilWriteMask; - uint8_t backfaceStencilTestMask; - uint8_t stencilWriteMask; - uint8_t stencilTestMask; - - // dword 2 - uint8_t backfaceStencilRefValue; - uint8_t stencilRefValue; - }; - uint32_t value[3]; -}; - -enum SWR_SHADING_RATE -{ - SWR_SHADING_RATE_PIXEL, - SWR_SHADING_RATE_SAMPLE, - SWR_SHADING_RATE_COUNT, -}; - -enum SWR_INPUT_COVERAGE -{ - 
SWR_INPUT_COVERAGE_NONE, - SWR_INPUT_COVERAGE_NORMAL, - SWR_INPUT_COVERAGE_INNER_CONSERVATIVE, - SWR_INPUT_COVERAGE_COUNT, -}; - -enum SWR_PS_POSITION_OFFSET -{ - SWR_PS_POSITION_SAMPLE_NONE, - SWR_PS_POSITION_SAMPLE_OFFSET, - SWR_PS_POSITION_CENTROID_OFFSET, - SWR_PS_POSITION_OFFSET_COUNT, -}; - -enum SWR_BARYCENTRICS_MASK -{ - SWR_BARYCENTRIC_PER_PIXEL_MASK = 0x1, - SWR_BARYCENTRIC_CENTROID_MASK = 0x2, - SWR_BARYCENTRIC_PER_SAMPLE_MASK = 0x4, -}; - -// pixel shader state -struct SWR_PS_STATE -{ - // dword 0-1 - PFN_PIXEL_KERNEL pfnPixelShader; // @llvm_pfn - - // dword 2 - uint32_t killsPixel : 1; // pixel shader can kill pixels - uint32_t inputCoverage : 2; // ps uses input coverage - uint32_t writesODepth : 1; // pixel shader writes to depth - uint32_t usesSourceDepth : 1; // pixel shader reads depth - uint32_t shadingRate : 2; // shading per pixel / sample / coarse pixel - uint32_t posOffset : 2; // type of offset (none, sample, centroid) to add to pixel position - uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate - // attributes with - uint32_t usesUAV : 1; // pixel shader accesses UAV - uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test - - uint8_t renderTargetMask; // Mask of render targets written -}; - -// depth bounds state -struct SWR_DEPTH_BOUNDS_STATE -{ - bool depthBoundsTestEnable; - float depthBoundsTestMinValue; - float depthBoundsTestMaxValue; -}; -// clang-format on diff --git a/src/gallium/drivers/swr/rasterizer/core/state_funcs.h b/src/gallium/drivers/swr/rasterizer/core/state_funcs.h deleted file mode 100644 index 99eac835ea8..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/state_funcs.h +++ /dev/null @@ -1,67 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file state.h - * - * @brief Definitions for API state - complex function implementation. - * - ******************************************************************************/ -#pragma once - -#include "core/state.h" -#include "common/simdintrin.h" - -template <typename MaskT> -INLINE __m128i SWR_MULTISAMPLE_POS::expandThenBlend4(uint32_t* min, uint32_t* max) -{ - __m128i vMin = _mm_set1_epi32(*min); - __m128i vMax = _mm_set1_epi32(*max); - return _simd_blend4_epi32<MaskT::value>(vMin, vMax); -} - -INLINE void SWR_MULTISAMPLE_POS::PrecalcSampleData(int numSamples) -{ - for (int i = 0; i < numSamples; i++) - { - _vXi[i] = _mm_set1_epi32(_xi[i]); - _vYi[i] = _mm_set1_epi32(_yi[i]); - _vX[i] = _simd_set1_ps(_x[i]); - _vY[i] = _simd_set1_ps(_y[i]); - } - // precalculate the raster tile BB for the rasterizer. 
- CalcTileSampleOffsets(numSamples); -} - -INLINE void SWR_MULTISAMPLE_POS::CalcTileSampleOffsets(int numSamples) -{ - auto minXi = std::min_element(std::begin(_xi), &_xi[numSamples]); - auto maxXi = std::max_element(std::begin(_xi), &_xi[numSamples]); - using xMask = std::integral_constant<int, 0xA>; - // BR(max), BL(min), UR(max), UL(min) - tileSampleOffsetsX = expandThenBlend4<xMask>(minXi, maxXi); - - auto minYi = std::min_element(std::begin(_yi), &_yi[numSamples]); - auto maxYi = std::max_element(std::begin(_yi), &_yi[numSamples]); - using yMask = std::integral_constant<int, 0xC>; - // BR(max), BL(min), UR(max), UL(min) - tileSampleOffsetsY = expandThenBlend4<yMask>(minYi, maxYi); -}; diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.cpp b/src/gallium/drivers/swr/rasterizer/core/tessellator.cpp deleted file mode 100644 index 08f2bce339c..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/tessellator.cpp +++ /dev/null @@ -1,2689 +0,0 @@ -/* - Copyright (c) Microsoft Corporation - - Permission is hereby granted, free of charge, to any person obtaining a copy of this software and - associated documentation files (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, - and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, - subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all copies or substantial - portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT - NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#include "tessellator.hpp" -#if defined(_MSC_VER) -#include <math.h> // ceil -#else -#include <cmath> -#endif -//#include <windows.h> // Just used for some commented out debug stat printing. -//#include <strsafe.h> // Ditto. -#define min(x,y) (x < y ? x : y) -#define max(x,y) (x > y ? x : y) - -//================================================================================================================================= -// Some D3D Compliant Float Math (reference rasterizer implements these in RefALU class) -//================================================================================================================================= -// -//--------------------------------------------------------------------------------------------------------------------------------- -// isNaN -//--------------------------------------------------------------------------------------------------------------------------------- -static bool tess_isNaN( float a ) -{ - static const int exponentMask = 0x7f800000; - static const int mantissaMask = 0x007fffff; - int u = *(int*)&a; - return ( ( ( u & exponentMask ) == exponentMask ) && ( u & mantissaMask ) ); // NaN -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// flush (denorm) -//--------------------------------------------------------------------------------------------------------------------------------- -static float tess_flush( float a ) -{ - static const int minNormalizedFloat = 0x00800000; - static const int signBit = 0x80000000; - static const int signBitComplement = 0x7fffffff; - int b = (*(int*)&a) & signBitComplement; // fabs() - if( b < minNormalizedFloat ) // 
UINT comparison. NaN/INF do test false here - { - b = signBit & (*(int*)&a); - return *(float*)&b; - } - return a; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// IEEE754R min -//--------------------------------------------------------------------------------------------------------------------------------- -static float tess_fmin( float a, float b ) -{ - float _a = tess_flush( a ); - float _b = tess_flush( b ); - if( tess_isNaN( _b ) ) - { - return a; - } - else if( ( _a == 0 ) && ( _b == 0 ) ) - { - return ( (*(int*)&_a) & 0x80000000 ) ? a : b; - } - return _a < _b ? a : b; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// IEEE754R max -//--------------------------------------------------------------------------------------------------------------------------------- -static float tess_fmax( float a, float b ) -{ - float _a = tess_flush( a ); - float _b = tess_flush( b ); - - if( tess_isNaN( _b ) ) - { - return a; - } - else if( ( _a == 0 ) && ( _b == 0 ) ) - { - return ( (*(int*)&_b) & 0x80000000 ) ? a : b; - } - return _a >= _b ? a : b; -} - -//================================================================================================================================= -// Fixed Point Math -//================================================================================================================================= - -//----------------------------------------------------------------------------------------------------------------------------- -// floatToFixedPoint -// -// Convert 32-bit float to 32-bit fixed point integer, using only -// integer arithmetic + bitwise operations. -// -// c_uIBits: UINT8 : Width of i (aka. integer bits) -// c_uFBits: UINT8 : Width of f (aka. 
fractional bits) -// c_bSigned: bool : Whether the integer bits are a 2's complement signed value -// input: float : All values valid. -// output: INT32 : At most 24 bits from LSB are meaningful, depending -// on the fixed point bit representation chosen (see -// below). Extra bits are sign extended from the most -// meaningful bit. -// -//----------------------------------------------------------------------------------------------------------------------------- - -typedef unsigned char UINT8; -typedef int INT32; -template< const UINT8 c_uIBits, const UINT8 c_uFBits, const bool c_bSigned > -INT32 floatToIDotF( const float& input ) -{ - // ------------------------------------------------------------------------ - // output fixed point format - // 32-bit result: - // - // [sign-extend]i.f - // | | - // MSB(31)...LSB(0) - // - // f fractional part of the number, an unsigned - // value with _fxpFracBitCount bits (defined below) - // - // . implied decimal - // - // i integer part of the number, a 2's complement - // value with _fxpIntBitCount bits (defined below) - // - // [sign-extend] MSB of i conditionally replicated - // - // ------------------------------------------------------------------------ - // Define fixed point bit counts - // - - // Commenting out C_ASSERT below to minimise #includes: - // C_ASSERT( 2 <= c_uIBits && c_uIBits <= 32 && c_uFBits <= 32 && c_uIBits + c_uFBits <= 32 ); - - // Define most negative and most positive fixed point values - const INT32 c_iMinResult = (c_bSigned ? 
INT32( -1 ) << (c_uIBits + c_uFBits - 1) : 0); - const INT32 c_iMaxResult = ~c_iMinResult; - - // ------------------------------------------------------------------------ - // constant float properties - // ------------------------------------------------------------------------ - const UINT8 _fltMantissaBitCount = 23; - const UINT8 _fltExponentBitCount = 8; - const INT32 _fltExponentBias = (INT32( 1 ) << (_fltExponentBitCount - 1)) - 1; - const INT32 _fltHiddenBit = INT32( 1 ) << _fltMantissaBitCount; - const INT32 _fltMantissaMask = _fltHiddenBit - 1; - const INT32 _fltExponentMask = ((INT32( 1 ) << _fltExponentBitCount) - 1) << _fltMantissaBitCount; - const INT32 _fltSignBit = INT32( 1 ) << (_fltExponentBitCount + _fltMantissaBitCount); - - // ------------------------------------------------------------------------ - // define min and max values as floats (clamp to these bounds) - // ------------------------------------------------------------------------ - INT32 _fxpMaxPosValueFloat; - INT32 _fxpMaxNegValueFloat; - - if (c_bSigned) - { - // The maximum positive fixed point value is 2^(i-1) - 2^(-f). - // The following constructs the floating point bit pattern for this value, - // as long as i >= 2. - _fxpMaxPosValueFloat = (_fltExponentBias + c_uIBits - 1) <<_fltMantissaBitCount; - const INT32 iShift = _fltMantissaBitCount + 2 - c_uIBits - c_uFBits; - if (iShift >= 0) - { -// assert( iShift < 32 ); -#if defined(_MSC_VER) -#pragma warning( suppress : 4293 ) -#endif - _fxpMaxPosValueFloat -= INT32( 1 ) << iShift; - } - - // The maximum negative fixed point value is -2^(i-1). - // The following constructs the floating point bit pattern for this value, - // as long as i >= 2. - // We need this number without the sign bit - _fxpMaxNegValueFloat = (_fltExponentBias + c_uIBits - 1) << _fltMantissaBitCount; - } - else - { - // The maximum positive fixed point value is 2^(i) - 2^(-f). 
- // The following constructs the floating point bit pattern for this value, - // as long as i >= 2. - _fxpMaxPosValueFloat = (_fltExponentBias + c_uIBits) <<_fltMantissaBitCount; - const INT32 iShift = _fltMantissaBitCount + 1 - c_uIBits - c_uFBits; - if (iShift >= 0) - { -// assert( iShift < 32 ); -#if defined(_MSC_VER) -#pragma warning( suppress : 4293 ) -#endif - _fxpMaxPosValueFloat -= INT32( 1 ) << iShift; - } - - // The maximum negative fixed point value is 0. - _fxpMaxNegValueFloat = 0; - } - - // ------------------------------------------------------------------------ - // float -> fixed conversion - // ------------------------------------------------------------------------ - - // ------------------------------------------------------------------------ - // examine input float - // ------------------------------------------------------------------------ - INT32 output = *(INT32*)&input; - INT32 unbiasedExponent = ((output & _fltExponentMask) >> _fltMantissaBitCount) - _fltExponentBias; - INT32 isNegative = output & _fltSignBit; - - // ------------------------------------------------------------------------ - // nan - // ------------------------------------------------------------------------ - if (unbiasedExponent == (_fltExponentBias + 1) && (output & _fltMantissaMask)) - { - // nan converts to 0 - output = 0; - } - // ------------------------------------------------------------------------ - // too large positive - // ------------------------------------------------------------------------ - else if (!isNegative && output >= _fxpMaxPosValueFloat) // integer compare - { - output = c_iMaxResult; - } - // ------------------------------------------------------------------------ - // too large negative - // ------------------------------------------------------------------------ - // integer compare - else if (isNegative && (output & ~_fltSignBit) >= _fxpMaxNegValueFloat) - { - output = c_iMinResult; - } - // 
------------------------------------------------------------------------ - // too small - // ------------------------------------------------------------------------ - else if (unbiasedExponent < -c_uFBits - 1) - { - // clamp to 0 - output = 0; - } - // ------------------------------------------------------------------------ - // within range - // ------------------------------------------------------------------------ - else - { - // copy mantissa, add hidden bit - output = (output & _fltMantissaMask) | _fltHiddenBit; - - INT32 extraBits = _fltMantissaBitCount - c_uFBits - unbiasedExponent; - if (extraBits >= 0) - { - // 2's complement if negative - if (isNegative) - { - output = ~output + 1; - } - - // From the range checks that led here, it is known that - // unbiasedExponent < c_uIBits. So, at most: - // (a) unbiasedExponent == c_uIBits - 1. - // - // From compile validation above, it is known that - // c_uIBits + c_uFBits <= _fltMantissaBitCount + 1). - // So, at minimum: - // (b) _fltMantissaBitCount == _fxtIntBitCount + c_uFBits - 1 - // - // Substituting (a) and (b) into extraBits calculation above: - // extraBits >= (_fxtIntBitCount + c_uFBits - 1) - // - c_uFBits - (c_uIBits - 1) - // extraBits >= 0 - // - // Thus we only have to worry about shifting right by 0 or more - // bits to get the decimal to the right place, and never have - // to shift left. 
- - INT32 LSB = 1 << extraBits; // last bit being kept - INT32 extraBitsMask = LSB - 1; - INT32 half = LSB >> 1; // round bias - - // round to nearest-even at LSB - if ((output & LSB) || (output & extraBitsMask) > half) - { - output += half; - } - - // shift off the extra bits (sign extending) - output >>= extraBits; - } - else - { - output <<= -extraBits; - - // 2's complement if negative - if (isNegative) - { - output = ~output + 1; - } - } - } - return output; -} -//----------------------------------------------------------------------------------------------------------------------------- - -#define FXP_INTEGER_BITS 15 -#define FXP_FRACTION_BITS 16 -#define FXP_FRACTION_MASK 0x0000ffff -#define FXP_INTEGER_MASK 0x7fff0000 -#define FXP_THREE (3<<FXP_FRACTION_BITS) -#define FXP_ONE (1<<FXP_FRACTION_BITS) -#define FXP_ONE_THIRD 0x00005555 -#define FXP_TWO_THIRDS 0x0000aaaa -#define FXP_ONE_HALF 0x00008000 - -#define FXP_MAX_INPUT_TESS_FACTOR_BEFORE_TRIPLE_AVERAGE 0x55540000 // 1/3 of max fixed point number - 1. Numbers less than - // or equal to this allows avg. reduction on a tri patch - // including rounding. - -#define FXP_MAX_INPUT_TESS_FACTOR_BEFORE_PAIR_AVERAGE 0x7FFF0000 // 1/2 of max fixed point number - 1. Numbers less than - // or equal to this allows avg. reduction on a quad patch - // including rounding. 
- -static const FXP s_fixedReciprocal[D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR+1] = -{ - 0xffffffff, // 1/0 is the first entry (unused) - 0x10000, 0x8000, 0x5555, 0x4000, - 0x3333, 0x2aab, 0x2492, 0x2000, - 0x1c72, 0x199a, 0x1746, 0x1555, - 0x13b1, 0x1249, 0x1111, 0x1000, - 0xf0f, 0xe39, 0xd79, 0xccd, - 0xc31, 0xba3, 0xb21, 0xaab, - 0xa3d, 0x9d9, 0x97b, 0x925, - 0x8d4, 0x889, 0x842, 0x800, - 0x7c2, 0x788, 0x750, 0x71c, - 0x6eb, 0x6bd, 0x690, 0x666, - 0x63e, 0x618, 0x5f4, 0x5d1, - 0x5b0, 0x591, 0x572, 0x555, - 0x539, 0x51f, 0x505, 0x4ec, - 0x4d5, 0x4be, 0x4a8, 0x492, - 0x47e, 0x46a, 0x457, 0x444, - 0x432, 0x421, 0x410, 0x400, // 1/64 is the last entry -}; - -#define FLOAT_THREE 3.0f -#define FLOAT_ONE 1.0f - -//--------------------------------------------------------------------------------------------------------------------------------- -// floatToFixed -//--------------------------------------------------------------------------------------------------------------------------------- -FXP floatToFixed(const float& input) -{ - return floatToIDotF< FXP_INTEGER_BITS, FXP_FRACTION_BITS, /*bSigned*/false >( input ); -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// fixedToFloat -//--------------------------------------------------------------------------------------------------------------------------------- -float fixedToFloat(const FXP& input) -{ - // not worrying about denorm flushing the float operations (the DX spec behavior for div), since the numbers will not be that small during tessellation. 
- return ((float)(input>>FXP_FRACTION_BITS) + (float)(input&FXP_FRACTION_MASK)/(1<<FXP_FRACTION_BITS)); -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// isEven -//--------------------------------------------------------------------------------------------------------------------------------- -bool isEven(const float& input) -{ - return (((int)input) & 1) ? false : true; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// fxpCeil -//--------------------------------------------------------------------------------------------------------------------------------- -FXP fxpCeil(const FXP& input) -{ - if( input & FXP_FRACTION_MASK ) - { - return (input & FXP_INTEGER_MASK) + FXP_ONE; - } - return input; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// fxpFloor -//--------------------------------------------------------------------------------------------------------------------------------- -FXP fxpFloor(const FXP& input) -{ - return (input & FXP_INTEGER_MASK); -} - -//================================================================================================================================= -// CHWTessellator -//================================================================================================================================= - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::CHWTessellator -//--------------------------------------------------------------------------------------------------------------------------------- -CHWTessellator::CHWTessellator() -{ - m_Point = 0; - m_Index = 0; - m_NumPoints = 0; - m_NumIndices = 0; - m_bUsingPatchedIndices = false; - 
m_bUsingPatchedIndices2 = false; -#ifdef ALLOW_XBOX_360_COMPARISON - m_bXBox360Mode = false; -#endif -} -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::~CHWTessellator -//--------------------------------------------------------------------------------------------------------------------------------- -CHWTessellator::~CHWTessellator() -{ - delete [] m_Point; - delete [] m_Index; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::Init -// User calls this. -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::Init( - D3D11_TESSELLATOR_PARTITIONING partitioning, - D3D11_TESSELLATOR_OUTPUT_PRIMITIVE outputPrimitive) -{ - if( 0 == m_Point ) - { - m_Point = new DOMAIN_POINT[MAX_POINT_COUNT]; - } - if( 0 == m_Index ) - { - m_Index = new int[MAX_INDEX_COUNT]; - } - m_partitioning = partitioning; - m_originalPartitioning = partitioning; - switch( partitioning ) - { - case D3D11_TESSELLATOR_PARTITIONING_INTEGER: - default: - break; - case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD: - m_parity = TESSELLATOR_PARITY_ODD; - break; - case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN: - m_parity = TESSELLATOR_PARITY_EVEN; - break; - } - m_originalParity = m_parity; - m_outputPrimitive = outputPrimitive; - m_NumPoints = 0; - m_NumIndices = 0; -} -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::TessellateQuadDomain -// User calls this -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::TessellateQuadDomain( float tessFactor_Ueq0, float tessFactor_Veq0, float 
tessFactor_Ueq1, float tessFactor_Veq1, - float insideTessFactor_U, float insideTessFactor_V ) -{ - PROCESSED_TESS_FACTORS_QUAD processedTessFactors; - QuadProcessTessFactors(tessFactor_Ueq0,tessFactor_Veq0,tessFactor_Ueq1,tessFactor_Veq1,insideTessFactor_U,insideTessFactor_V,processedTessFactors); - - if( processedTessFactors.bPatchCulled ) - { - m_NumPoints = 0; - m_NumIndices = 0; - return; - } - else if( processedTessFactors.bJustDoMinimumTessFactor ) - { - DefinePoint(/*U*/0,/*V*/0,/*pointStorageOffset*/0); - DefinePoint(/*U*/FXP_ONE,/*V*/0,/*pointStorageOffset*/1); - DefinePoint(/*U*/FXP_ONE,/*V*/FXP_ONE,/*pointStorageOffset*/2); - DefinePoint(/*U*/0,/*V*/FXP_ONE,/*pointStorageOffset*/3); - m_NumPoints = 4; - - switch(m_outputPrimitive) - { - case D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW: - case D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CCW: - // function orients them CCW if needed - DefineClockwiseTriangle(0,1,3,/*indexStorageOffset*/0); - DefineClockwiseTriangle(1,2,3,/*indexStorageOffset*/3); - m_NumIndices = 6; - break; - case D3D11_TESSELLATOR_OUTPUT_POINT: - DumpAllPoints(); - break; - case D3D11_TESSELLATOR_OUTPUT_LINE: - DumpAllPointsAsInOrderLineList(); - break; - } - return; - } - - QuadGeneratePoints(processedTessFactors); - - if( m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_POINT ) - { - DumpAllPoints(); - return; - } - if( m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_LINE ) - { - DumpAllPointsAsInOrderLineList(); - return; - } - - QuadGenerateConnectivity(processedTessFactors); // can be done in parallel to QuadGeneratePoints() -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::QuadProcessTessFactors -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::QuadProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Ueq1, float 
tessFactor_Veq1, - float insideTessFactor_U, float insideTessFactor_V, PROCESSED_TESS_FACTORS_QUAD& processedTessFactors ) -{ - // Is the patch culled? - if( !(tessFactor_Ueq0 > 0) || // NaN will pass - !(tessFactor_Veq0 > 0) || - !(tessFactor_Ueq1 > 0) || - !(tessFactor_Veq1 > 0) ) - { - processedTessFactors.bPatchCulled = true; - return; - } - else - { - processedTessFactors.bPatchCulled = false; - } - - // Clamp edge TessFactors - float lowerBound = 0.0, upperBound = 0.0; - switch(m_originalPartitioning) - { - case D3D11_TESSELLATOR_PARTITIONING_INTEGER: - case D3D11_TESSELLATOR_PARTITIONING_POW2: // don�t care about pow2 distinction for validation, just treat as integer - lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR; - upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR; - break; - - case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN: - lowerBound = D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR; - upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR; - break; - - case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD: - lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR; - upperBound = D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR; - break; - } - - tessFactor_Ueq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Ueq0 ) ); - tessFactor_Veq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Veq0 ) ); - tessFactor_Ueq1 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Ueq1 ) ); - tessFactor_Veq1 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Veq1 ) ); - - if( HWIntegerPartitioning()) // pow2 or integer, round to next int (hw doesn't care about pow2 distinction) - { - tessFactor_Ueq0 = ceil(tessFactor_Ueq0); - tessFactor_Veq0 = ceil(tessFactor_Veq0); - tessFactor_Ueq1 = ceil(tessFactor_Ueq1); - tessFactor_Veq1 = ceil(tessFactor_Veq1); - } - - // Clamp inside TessFactors - if(D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD == m_originalPartitioning) - { -#define EPSILON 0.0000152587890625f // 
2^(-16), min positive fixed point fraction -#define MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON (D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON/2) - // If any TessFactor will end up > 1 after floatToFixed conversion later, - // then force the inside TessFactors to be > 1 so there is a picture frame. - if( (tessFactor_Ueq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || - (tessFactor_Veq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || - (tessFactor_Ueq1 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || - (tessFactor_Veq1 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || - (insideTessFactor_U > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || - (insideTessFactor_V > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ) - { - // Force picture frame - lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON; - } - } - - insideTessFactor_U = tess_fmin( upperBound, tess_fmax( lowerBound, insideTessFactor_U ) ); - insideTessFactor_V = tess_fmin( upperBound, tess_fmax( lowerBound, insideTessFactor_V ) ); - // Note the above clamps map NaN to lowerBound - - - if( HWIntegerPartitioning()) // pow2 or integer, round to next int (hw doesn't care about pow2 distinction) - { - insideTessFactor_U = ceil(insideTessFactor_U); - insideTessFactor_V = ceil(insideTessFactor_V); - } - - // Reset our vertex and index buffers. We have enough storage for the max tessFactor. - m_NumPoints = 0; - m_NumIndices = 0; - - // Process tessFactors - float outsideTessFactor[QUAD_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, tessFactor_Ueq1, tessFactor_Veq1}; - float insideTessFactor[QUAD_AXES] = {insideTessFactor_U,insideTessFactor_V}; - int edge, axis; - if( HWIntegerPartitioning() ) - { - for( edge = 0; edge < QUAD_EDGES; edge++ ) - { - int edgeEven = isEven(outsideTessFactor[edge]); - processedTessFactors.outsideTessFactorParity[edge] = edgeEven ? 
TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; - } - for( axis = 0; axis < QUAD_AXES; axis++ ) - { - processedTessFactors.insideTessFactorParity[axis] = - (isEven(insideTessFactor[axis]) || (FLOAT_ONE == insideTessFactor[axis]) ) - ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; - } - } - else - { - for( edge = 0; edge < QUAD_EDGES; edge++ ) - { - processedTessFactors.outsideTessFactorParity[edge] = m_originalParity; - } - processedTessFactors.insideTessFactorParity[U] = processedTessFactors.insideTessFactorParity[V] = m_originalParity; - } - - // Save fixed point TessFactors - for( edge = 0; edge < QUAD_EDGES; edge++ ) - { - processedTessFactors.outsideTessFactor[edge] = floatToFixed(outsideTessFactor[edge]); - } - for( axis = 0; axis < QUAD_AXES; axis++ ) - { - processedTessFactors.insideTessFactor[axis] = floatToFixed(insideTessFactor[axis]); - } - - if( HWIntegerPartitioning() || Odd() ) - { - // Special case if all TessFactors are 1 - if( (FXP_ONE == processedTessFactors.insideTessFactor[U]) && - (FXP_ONE == processedTessFactors.insideTessFactor[V]) && - (FXP_ONE == processedTessFactors.outsideTessFactor[Ueq0]) && - (FXP_ONE == processedTessFactors.outsideTessFactor[Veq0]) && - (FXP_ONE == processedTessFactors.outsideTessFactor[Ueq1]) && - (FXP_ONE == processedTessFactors.outsideTessFactor[Veq1]) ) - { - processedTessFactors.bJustDoMinimumTessFactor = true; - return; - } - } - processedTessFactors.bJustDoMinimumTessFactor = false; - - // Compute TessFactor-specific metadata - for(int edge = 0; edge < QUAD_EDGES; edge++ ) - { - SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]); - ComputeTessFactorContext(processedTessFactors.outsideTessFactor[edge], processedTessFactors.outsideTessFactorCtx[edge]); - } - - for(int axis = 0; axis < QUAD_AXES; axis++) - { - SetTessellationParity(processedTessFactors.insideTessFactorParity[axis]); - ComputeTessFactorContext(processedTessFactors.insideTessFactor[axis], 
processedTessFactors.insideTessFactorCtx[axis]); - } - - // Compute some initial data. - - // outside edge offsets and storage - for(int edge = 0; edge < QUAD_EDGES; edge++ ) - { - SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]); - processedTessFactors.numPointsForOutsideEdge[edge] = NumPointsForTessFactor(processedTessFactors.outsideTessFactor[edge]); - m_NumPoints += processedTessFactors.numPointsForOutsideEdge[edge]; - } - m_NumPoints -= 4; - - // inside edge offsets - for(int axis = 0; axis < QUAD_AXES; axis++) - { - SetTessellationParity(processedTessFactors.insideTessFactorParity[axis]); - processedTessFactors.numPointsForInsideTessFactor[axis] = NumPointsForTessFactor(processedTessFactors.insideTessFactor[axis]); - int pointCountMin = ( TESSELLATOR_PARITY_ODD == processedTessFactors.insideTessFactorParity[axis] ) ? 4 : 3; - // max() allows degenerate transition regions when inside TessFactor == 1 - processedTessFactors.numPointsForInsideTessFactor[axis] = max(pointCountMin,processedTessFactors.numPointsForInsideTessFactor[axis]); - } - - processedTessFactors.insideEdgePointBaseOffset = m_NumPoints; - - // inside storage, including interior edges above - int numInteriorPoints = (processedTessFactors.numPointsForInsideTessFactor[U] - 2)*(processedTessFactors.numPointsForInsideTessFactor[V]-2); - m_NumPoints += numInteriorPoints; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::QuadGeneratePoints -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::QuadGeneratePoints( const PROCESSED_TESS_FACTORS_QUAD& processedTessFactors ) -{ - // Generate exterior ring edge points, clockwise from top-left - int pointOffset = 0; - int edge; - for(edge = 0; edge < QUAD_EDGES; edge++ ) - { - int parity = edge&0x1; - int startPoint = 0; - int 
endPoint = processedTessFactors.numPointsForOutsideEdge[edge] - 1; - for(int p = startPoint; p < endPoint; p++,pointOffset++) // don't include end, since next edge starts with it. - { - FXP fxpParam; - int q = ((edge==1)||(edge==2)) ? p : endPoint - p; // reverse order - SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]); - PlacePointIn1D(processedTessFactors.outsideTessFactorCtx[edge],q,fxpParam); - if( parity ) - { - DefinePoint(/*U*/fxpParam, - /*V*/(edge == 3) ? FXP_ONE : 0, - /*pointStorageOffset*/pointOffset); - } - else - { - DefinePoint(/*U*/(edge == 2) ? FXP_ONE : 0, - /*V*/fxpParam, - /*pointStorageOffset*/pointOffset); - } - } - } - - // Generate interior ring points, clockwise from (U==0,V==1) (bottom-left) spiralling toward center - static const int startRing = 1; - int minNumPointsForTessFactor = min(processedTessFactors.numPointsForInsideTessFactor[U],processedTessFactors.numPointsForInsideTessFactor[V]); - int numRings = (minNumPointsForTessFactor >> 1); // note for even tess we aren't counting center point here. - for(int ring = startRing; ring < numRings; ring++) - { - int startPoint = ring; - int endPoint[QUAD_AXES] = {processedTessFactors.numPointsForInsideTessFactor[U] - 1 - startPoint, - processedTessFactors.numPointsForInsideTessFactor[V] - 1 - startPoint}; - - for(edge = 0; edge < QUAD_EDGES; edge++ ) - { - int parity[QUAD_AXES] = {edge&0x1,((edge+1)&0x1)}; - int perpendicularAxisPoint = (edge < 2) ? startPoint : endPoint[parity[0]]; - FXP fxpPerpParam; - SetTessellationParity(processedTessFactors.insideTessFactorParity[parity[0]]); - PlacePointIn1D(processedTessFactors.insideTessFactorCtx[parity[0]],perpendicularAxisPoint,fxpPerpParam); - SetTessellationParity(processedTessFactors.insideTessFactorParity[parity[1]]); - for(int p = startPoint; p < endPoint[parity[1]]; p++, pointOffset++) // don't include end: next edge starts with it. - { - FXP fxpParam; - int q = ((edge == 1)||(edge==2)) ? 
p : endPoint[parity[1]] - (p - startPoint); - PlacePointIn1D(processedTessFactors.insideTessFactorCtx[parity[1]],q,fxpParam); - if( parity[1] ) - { - DefinePoint(/*U*/fxpPerpParam, - /*V*/fxpParam, - /*pointStorageOffset*/pointOffset); - } - else - { - DefinePoint(/*U*/fxpParam, - /*V*/fxpPerpParam, - /*pointStorageOffset*/pointOffset); - } - } - } - } - // For even tessellation, the inner "ring" is degenerate - a row of points - if( (processedTessFactors.numPointsForInsideTessFactor[U] > processedTessFactors.numPointsForInsideTessFactor[V]) && - (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[V]) ) - { - int startPoint = numRings; - int endPoint = processedTessFactors.numPointsForInsideTessFactor[U] - 1 - startPoint; - SetTessellationParity(processedTessFactors.insideTessFactorParity[U]); - for( int p = startPoint; p <= endPoint; p++, pointOffset++ ) - { - FXP fxpParam; - PlacePointIn1D(processedTessFactors.insideTessFactorCtx[U],p,fxpParam); - DefinePoint(/*U*/fxpParam, - /*V*/FXP_ONE_HALF, // middle - /*pointStorageOffset*/pointOffset); - } - } - else if( (processedTessFactors.numPointsForInsideTessFactor[V] >= processedTessFactors.numPointsForInsideTessFactor[U]) && - (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[U]) ) - { - int startPoint = numRings; - int endPoint; - FXP fxpParam; - endPoint = processedTessFactors.numPointsForInsideTessFactor[V] - 1 - startPoint; - SetTessellationParity(processedTessFactors.insideTessFactorParity[V]); - for( int p = endPoint; p >= startPoint; p--, pointOffset++ ) - { - PlacePointIn1D(processedTessFactors.insideTessFactorCtx[V],p,fxpParam); - DefinePoint(/*U*/FXP_ONE_HALF, // middle - /*V*/fxpParam, - /*pointStorageOffset*/pointOffset); - } - } -} -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::QuadGenerateConnectivity 
-//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::QuadGenerateConnectivity( const PROCESSED_TESS_FACTORS_QUAD& processedTessFactors ) -{ - // Generate primitives for all the concentric rings, one side at a time for each ring - static const int startRing = 1; - int numPointRowsToCenter[QUAD_AXES] = {((processedTessFactors.numPointsForInsideTessFactor[U]+1) >> 1), - ((processedTessFactors.numPointsForInsideTessFactor[V]+1) >> 1)}; // +1 is so even tess includes the center point - int numRings = min(numPointRowsToCenter[U],numPointRowsToCenter[V]); - int degeneratePointRing[QUAD_AXES] = { // Even partitioning causes degenerate row of points, - // which results in exceptions to the point ordering conventions - // when travelling around the rings counterclockwise. - (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[V]) ? numPointRowsToCenter[V] - 1 : -1, - (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[U]) ? 
numPointRowsToCenter[U] - 1 : -1 }; - - const TESS_FACTOR_CONTEXT* outsideTessFactorCtx[QUAD_EDGES] = {&processedTessFactors.outsideTessFactorCtx[Ueq0], - &processedTessFactors.outsideTessFactorCtx[Veq0], - &processedTessFactors.outsideTessFactorCtx[Ueq1], - &processedTessFactors.outsideTessFactorCtx[Veq1]}; - TESSELLATOR_PARITY outsideTessFactorParity[QUAD_EDGES] = {processedTessFactors.outsideTessFactorParity[Ueq0], - processedTessFactors.outsideTessFactorParity[Veq0], - processedTessFactors.outsideTessFactorParity[Ueq1], - processedTessFactors.outsideTessFactorParity[Veq1]}; - int numPointsForOutsideEdge[QUAD_EDGES] = {processedTessFactors.numPointsForOutsideEdge[Ueq0], - processedTessFactors.numPointsForOutsideEdge[Veq0], - processedTessFactors.numPointsForOutsideEdge[Ueq1], - processedTessFactors.numPointsForOutsideEdge[Veq1]}; - - int insideEdgePointBaseOffset = processedTessFactors.insideEdgePointBaseOffset; - int outsideEdgePointBaseOffset = 0; - int edge; - for(int ring = startRing; ring < numRings; ring++) - { - int numPointsForInsideEdge[QUAD_AXES] = {processedTessFactors.numPointsForInsideTessFactor[U] - 2*ring, - processedTessFactors.numPointsForInsideTessFactor[V] - 2*ring}; - - int edge0InsidePointBaseOffset = insideEdgePointBaseOffset; - int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; - - for(edge = 0; edge < QUAD_EDGES; edge++ ) - { - int parity = (edge+1)&0x1; - - int numTriangles = numPointsForInsideEdge[parity] + numPointsForOutsideEdge[edge] - 2; - int insideBaseOffset; - int outsideBaseOffset; - if( edge == 3 ) // We need to patch the indexing so Stitch() can think it sees - // 2 sequentially increasing rows of points, even though we have wrapped around - // to the end of the inner and outer ring's points, so the last point is really - // the first point for the ring. - // We make it so that when Stitch() calls AddIndex(), that function - // will do any necessary index adjustment. 
- { - if( ring == degeneratePointRing[parity] ) - { - m_IndexPatchContext2.baseIndexToInvert = insideEdgePointBaseOffset + 1; - m_IndexPatchContext2.cornerCaseBadValue = outsideEdgePointBaseOffset + numPointsForOutsideEdge[edge] - 1; - m_IndexPatchContext2.cornerCaseReplacementValue = edge0OutsidePointBaseOffset; - m_IndexPatchContext2.indexInversionEndPoint = (m_IndexPatchContext2.baseIndexToInvert << 1) - 1; - insideBaseOffset = m_IndexPatchContext2.baseIndexToInvert; - outsideBaseOffset = outsideEdgePointBaseOffset; - SetUsingPatchedIndices2(true); - } - else - { - m_IndexPatchContext.insidePointIndexDeltaToRealValue = insideEdgePointBaseOffset; - m_IndexPatchContext.insidePointIndexBadValue = numPointsForInsideEdge[parity] - 1; - m_IndexPatchContext.insidePointIndexReplacementValue = edge0InsidePointBaseOffset; - m_IndexPatchContext.outsidePointIndexPatchBase = m_IndexPatchContext.insidePointIndexBadValue+1; // past inside patched index range - m_IndexPatchContext.outsidePointIndexDeltaToRealValue = outsideEdgePointBaseOffset - - m_IndexPatchContext.outsidePointIndexPatchBase; - m_IndexPatchContext.outsidePointIndexBadValue = m_IndexPatchContext.outsidePointIndexPatchBase - + numPointsForOutsideEdge[edge] - 1; - m_IndexPatchContext.outsidePointIndexReplacementValue = edge0OutsidePointBaseOffset; - - insideBaseOffset = 0; - outsideBaseOffset = m_IndexPatchContext.outsidePointIndexPatchBase; - SetUsingPatchedIndices(true); - } - } - else if( (edge == 2) && (ring == degeneratePointRing[parity]) ) - { - m_IndexPatchContext2.baseIndexToInvert = insideEdgePointBaseOffset; - m_IndexPatchContext2.cornerCaseBadValue = -1; // unused - m_IndexPatchContext2.cornerCaseReplacementValue = -1; // unused - m_IndexPatchContext2.indexInversionEndPoint = m_IndexPatchContext2.baseIndexToInvert << 1; - insideBaseOffset = m_IndexPatchContext2.baseIndexToInvert; - outsideBaseOffset = outsideEdgePointBaseOffset; - SetUsingPatchedIndices2(true); - } - else - { - insideBaseOffset = 
insideEdgePointBaseOffset; - outsideBaseOffset = outsideEdgePointBaseOffset; - } - if( ring == startRing ) - { - StitchTransition(/*baseIndexOffset: */m_NumIndices, - insideBaseOffset,processedTessFactors.insideTessFactorCtx[parity].numHalfTessFactorPoints,processedTessFactors.insideTessFactorParity[parity], - outsideBaseOffset,outsideTessFactorCtx[edge]->numHalfTessFactorPoints,outsideTessFactorParity[edge]); - } - else - { - StitchRegular(/*bTrapezoid*/true, DIAGONALS_MIRRORED, - /*baseIndexOffset: */m_NumIndices, - numPointsForInsideEdge[parity], - insideBaseOffset,outsideBaseOffset); - } - SetUsingPatchedIndices(false); - SetUsingPatchedIndices2(false); - m_NumIndices += numTriangles*3; - outsideEdgePointBaseOffset += numPointsForOutsideEdge[edge] - 1; - if( (edge == 2) && (ring == degeneratePointRing[parity]) ) - { - insideEdgePointBaseOffset -= numPointsForInsideEdge[parity] - 1; - } - else - { - insideEdgePointBaseOffset += numPointsForInsideEdge[parity] - 1; - } - numPointsForOutsideEdge[edge] = numPointsForInsideEdge[parity]; - } - if( startRing == ring ) - { - for(edge = 0; edge < QUAD_EDGES; edge++ ) - { - outsideTessFactorCtx[edge] = &processedTessFactors.insideTessFactorCtx[edge&1]; - outsideTessFactorParity[edge] = processedTessFactors.insideTessFactorParity[edge&1]; - } - } - } - - // Triangulate center - a row of quads if odd - // This triangulation may be producing diagonals that are asymmetric about - // the center of the patch in this region. - if( (processedTessFactors.numPointsForInsideTessFactor[U] > processedTessFactors.numPointsForInsideTessFactor[V]) && - (TESSELLATOR_PARITY_ODD == processedTessFactors.insideTessFactorParity[V] ) ) - { - SetUsingPatchedIndices2(true); - int stripNumQuads = (((processedTessFactors.numPointsForInsideTessFactor[U]>>1) - (processedTessFactors.numPointsForInsideTessFactor[V]>>1))<<1)+ - ((TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[U] ) ? 
2 : 1); - m_IndexPatchContext2.baseIndexToInvert = outsideEdgePointBaseOffset + stripNumQuads + 2; - m_IndexPatchContext2.cornerCaseBadValue = m_IndexPatchContext2.baseIndexToInvert; - m_IndexPatchContext2.cornerCaseReplacementValue = outsideEdgePointBaseOffset; - m_IndexPatchContext2.indexInversionEndPoint = m_IndexPatchContext2.baseIndexToInvert + - m_IndexPatchContext2.baseIndexToInvert + stripNumQuads; - StitchRegular(/*bTrapezoid*/false,DIAGONALS_INSIDE_TO_OUTSIDE, - /*baseIndexOffset: */m_NumIndices, /*numInsideEdgePoints:*/stripNumQuads+1, - /*insideEdgePointBaseOffset*/m_IndexPatchContext2.baseIndexToInvert, - outsideEdgePointBaseOffset+1); - SetUsingPatchedIndices2(false); - m_NumIndices += stripNumQuads*6; - } - else if((processedTessFactors.numPointsForInsideTessFactor[V] >= processedTessFactors.numPointsForInsideTessFactor[U]) && - (TESSELLATOR_PARITY_ODD == processedTessFactors.insideTessFactorParity[U]) ) - { - SetUsingPatchedIndices2(true); - int stripNumQuads = (((processedTessFactors.numPointsForInsideTessFactor[V]>>1) - (processedTessFactors.numPointsForInsideTessFactor[U]>>1))<<1)+ - ((TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[V] ) ? 2 : 1); - m_IndexPatchContext2.baseIndexToInvert = outsideEdgePointBaseOffset + stripNumQuads + 1; - m_IndexPatchContext2.cornerCaseBadValue = -1; // unused - m_IndexPatchContext2.indexInversionEndPoint = m_IndexPatchContext2.baseIndexToInvert + - m_IndexPatchContext2.baseIndexToInvert + stripNumQuads; - DIAGONALS diag = (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[V]) ? 
- DIAGONALS_INSIDE_TO_OUTSIDE : DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE; - StitchRegular(/*bTrapezoid*/false,diag, - /*baseIndexOffset: */m_NumIndices, /*numInsideEdgePoints:*/stripNumQuads+1, - /*insideEdgePointBaseOffset*/m_IndexPatchContext2.baseIndexToInvert, - outsideEdgePointBaseOffset); - SetUsingPatchedIndices2(false); - m_NumIndices += stripNumQuads*6; - } -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::TessellateTriDomain -// User calls this -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::TessellateTriDomain( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Weq0, - float insideTessFactor ) -{ - PROCESSED_TESS_FACTORS_TRI processedTessFactors; - TriProcessTessFactors(tessFactor_Ueq0,tessFactor_Veq0,tessFactor_Weq0,insideTessFactor,processedTessFactors); - - if( processedTessFactors.bPatchCulled ) - { - m_NumPoints = 0; - m_NumIndices = 0; - return; - } - else if( processedTessFactors.bJustDoMinimumTessFactor ) - { - DefinePoint(/*U*/0,/*V*/FXP_ONE,/*pointStorageOffset*/0); //V=1 (beginning of Ueq0 edge VW) - DefinePoint(/*U*/0,/*V*/0,/*pointStorageOffset*/1); //W=1 (beginning of Veq0 edge WU) - DefinePoint(/*U*/FXP_ONE,/*V*/0,/*pointStorageOffset*/2); //U=1 (beginning of Weq0 edge UV) - m_NumPoints = 3; - - switch(m_outputPrimitive) - { - case D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW: - case D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CCW: - // function orients them CCW if needed - DefineClockwiseTriangle(0,1,2,/*indexStorageBaseOffset*/m_NumIndices); - m_NumIndices = 3; - break; - case D3D11_TESSELLATOR_OUTPUT_POINT: - DumpAllPoints(); - break; - case D3D11_TESSELLATOR_OUTPUT_LINE: - DumpAllPointsAsInOrderLineList(); - break; - } - return; - } - - TriGeneratePoints(processedTessFactors); - - if( m_outputPrimitive == 
D3D11_TESSELLATOR_OUTPUT_POINT ) - { - DumpAllPoints(); - return; - } - if( m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_LINE ) - { - DumpAllPointsAsInOrderLineList(); - return; - } - - TriGenerateConnectivity(processedTessFactors); // can be done in parallel to TriGeneratePoints() -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::TriProcessTessFactors -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::TriProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Weq0, - float insideTessFactor, PROCESSED_TESS_FACTORS_TRI& processedTessFactors ) -{ - // Is the patch culled? - if( !(tessFactor_Ueq0 > 0) || // NaN will pass - !(tessFactor_Veq0 > 0) || - !(tessFactor_Weq0 > 0) ) - { - processedTessFactors.bPatchCulled = true; - return; - } - else - { - processedTessFactors.bPatchCulled = false; - } - - // Clamp edge TessFactors - float lowerBound = 0.0, upperBound = 0.0; - switch(m_originalPartitioning) - { - case D3D11_TESSELLATOR_PARTITIONING_INTEGER: - case D3D11_TESSELLATOR_PARTITIONING_POW2: // don�t care about pow2 distinction for validation, just treat as integer - lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR; - upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR; - break; - - case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN: - lowerBound = D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR; - upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR; - break; - - case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD: - lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR; - upperBound = D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR; - break; - } - - tessFactor_Ueq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Ueq0 ) ); - tessFactor_Veq0 = tess_fmin( upperBound, tess_fmax( lowerBound, 
tessFactor_Veq0 ) ); - tessFactor_Weq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Weq0 ) ); - - if( HWIntegerPartitioning()) // pow2 or integer, round to next int (hw doesn't care about pow2 distinction) - { - tessFactor_Ueq0 = ceil(tessFactor_Ueq0); - tessFactor_Veq0 = ceil(tessFactor_Veq0); - tessFactor_Weq0 = ceil(tessFactor_Weq0); - } - - // Clamp inside TessFactors - if(D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD == m_originalPartitioning) - { - if( (tessFactor_Ueq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || - (tessFactor_Veq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || - (tessFactor_Weq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON)) - // Don't need the same check for insideTessFactor for tri patches, - // since there is only one insideTessFactor, as opposed to quad - // patches which have 2 insideTessFactors. - { - // Force picture frame - lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON; - } - } - - insideTessFactor = tess_fmin( upperBound, tess_fmax( lowerBound, insideTessFactor ) ); - // Note the above clamps map NaN to lowerBound - - if( HWIntegerPartitioning()) // pow2 or integer, round to next int (hw doesn't care about pow2 distinction) - { - insideTessFactor = ceil(insideTessFactor); - } - - // Reset our vertex and index buffers. We have enough storage for the max tessFactor. - m_NumPoints = 0; - m_NumIndices = 0; - - // Process tessFactors - float outsideTessFactor[TRI_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, tessFactor_Weq0}; - int edge; - if( HWIntegerPartitioning() ) - { - for( edge = 0; edge < TRI_EDGES; edge++ ) - { - int edgeEven = isEven(outsideTessFactor[edge]); - processedTessFactors.outsideTessFactorParity[edge] = edgeEven ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; - } - processedTessFactors.insideTessFactorParity = (isEven(insideTessFactor) || (FLOAT_ONE == insideTessFactor)) - ? 
TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; - } - else - { - for( edge = 0; edge < TRI_EDGES; edge++ ) - { - processedTessFactors.outsideTessFactorParity[edge] = m_originalParity; - } - processedTessFactors.insideTessFactorParity = m_originalParity; - } - - // Save fixed point TessFactors - for( edge = 0; edge < TRI_EDGES; edge++ ) - { - processedTessFactors.outsideTessFactor[edge] = floatToFixed(outsideTessFactor[edge]); - } - processedTessFactors.insideTessFactor = floatToFixed(insideTessFactor); - - if( HWIntegerPartitioning() || Odd() ) - { - // Special case if all TessFactors are 1 - if( (FXP_ONE == processedTessFactors.insideTessFactor) && - (FXP_ONE == processedTessFactors.outsideTessFactor[Ueq0]) && - (FXP_ONE == processedTessFactors.outsideTessFactor[Veq0]) && - (FXP_ONE == processedTessFactors.outsideTessFactor[Weq0]) ) - { - processedTessFactors.bJustDoMinimumTessFactor = true; - return; - } - } - processedTessFactors.bJustDoMinimumTessFactor = false; - - // Compute per-TessFactor metadata - for(edge = 0; edge < TRI_EDGES; edge++ ) - { - SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]); - ComputeTessFactorContext(processedTessFactors.outsideTessFactor[edge], processedTessFactors.outsideTessFactorCtx[edge]); - } - SetTessellationParity(processedTessFactors.insideTessFactorParity); - ComputeTessFactorContext(processedTessFactors.insideTessFactor, processedTessFactors.insideTessFactorCtx); - - // Compute some initial data. 
- - // outside edge offsets and storage - for(edge = 0; edge < TRI_EDGES; edge++ ) - { - SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]); - processedTessFactors.numPointsForOutsideEdge[edge] = NumPointsForTessFactor(processedTessFactors.outsideTessFactor[edge]); - m_NumPoints += processedTessFactors.numPointsForOutsideEdge[edge]; - } - m_NumPoints -= 3; - - // inside edge offsets - SetTessellationParity(processedTessFactors.insideTessFactorParity); - processedTessFactors.numPointsForInsideTessFactor = NumPointsForTessFactor(processedTessFactors.insideTessFactor); - { - int pointCountMin = Odd() ? 4 : 3; - // max() allows degenerate transition regions when inside TessFactor == 1 - processedTessFactors.numPointsForInsideTessFactor = max(pointCountMin,processedTessFactors.numPointsForInsideTessFactor); - } - - processedTessFactors.insideEdgePointBaseOffset = m_NumPoints; - - // inside storage, including interior edges above - { - int numInteriorRings = (processedTessFactors.numPointsForInsideTessFactor >> 1) - 1; - int numInteriorPoints; - if( Odd() ) - { - numInteriorPoints = TRI_EDGES*(numInteriorRings*(numInteriorRings+1) - numInteriorRings); - } - else - { - numInteriorPoints = TRI_EDGES*(numInteriorRings*(numInteriorRings+1)) + 1; - } - m_NumPoints += numInteriorPoints; - } - -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::TriGeneratePoints -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::TriGeneratePoints( const PROCESSED_TESS_FACTORS_TRI& processedTessFactors ) -{ - // Generate exterior ring edge points, clockwise starting from point V (VW, the U==0 edge) - int pointOffset = 0; - int edge; - for(edge = 0; edge < TRI_EDGES; edge++ ) - { - int parity = edge&0x1; - int startPoint = 0; - int endPoint = 
processedTessFactors.numPointsForOutsideEdge[edge] - 1; - for(int p = startPoint; p < endPoint; p++, pointOffset++) // don't include end, since next edge starts with it. - { - FXP fxpParam; - int q = (parity) ? p : endPoint - p; // whether to reverse point order given we are defining V or U (W implicit): - // edge0, VW, has V decreasing, so reverse 1D points below - // edge1, WU, has U increasing, so don't reverse 1D points below - // edge2, UV, has U decreasing, so reverse 1D points below - SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]); - PlacePointIn1D(processedTessFactors.outsideTessFactorCtx[edge],q,fxpParam); - if( edge == 0 ) - { - DefinePoint(/*U*/0, - /*V*/fxpParam, - /*pointStorageOffset*/pointOffset); - } - else - { - DefinePoint(/*U*/fxpParam, - /*V*/(edge == 2) ? FXP_ONE - fxpParam : 0, - /*pointStorageOffset*/pointOffset); - } - } - } - - // Generate interior ring points, clockwise spiralling in - SetTessellationParity(processedTessFactors.insideTessFactorParity); - static const int startRing = 1; - int numRings = (processedTessFactors.numPointsForInsideTessFactor >> 1); - for(int ring = startRing; ring < numRings; ring++) - { - int startPoint = ring; - int endPoint = processedTessFactors.numPointsForInsideTessFactor - 1 - startPoint; - - for(edge = 0; edge < TRI_EDGES; edge++ ) - { - int parity = edge&0x1; - int perpendicularAxisPoint = startPoint; - FXP fxpPerpParam; - PlacePointIn1D(processedTessFactors.insideTessFactorCtx,perpendicularAxisPoint,fxpPerpParam); - fxpPerpParam *= FXP_TWO_THIRDS; // Map location to the right size in barycentric space. - // I (amarp) can draw a picture to explain. - // We know this fixed point math won't over/underflow - fxpPerpParam = (fxpPerpParam+FXP_ONE_HALF/*round*/)>>FXP_FRACTION_BITS; // get back to n.16 - for(int p = startPoint; p < endPoint; p++, pointOffset++) // don't include end: next edge starts with it. - { - FXP fxpParam; - int q = (parity) ? 
p : endPoint - (p - startPoint); // whether to reverse point given we are defining V or U (W implicit): - // edge0, VW, has V decreasing, so reverse 1D points below - // edge1, WU, has U increasing, so don't reverse 1D points below - // edge2, UV, has U decreasing, so reverse 1D points below - PlacePointIn1D(processedTessFactors.insideTessFactorCtx,q,fxpParam); - // edge0 VW, has perpendicular parameter U constant - // edge1 WU, has perpendicular parameter V constant - // edge2 UV, has perpendicular parameter W constant - const unsigned int deriv = 2; // reciprocal is the rate of change of edge-parallel parameters as they are pushed into the triangle - switch(edge) - { - case 0: - DefinePoint(/*U*/fxpPerpParam, - /*V*/fxpParam - (fxpPerpParam+1/*round*/)/deriv, // we know this fixed point math won't over/underflow - /*pointStorageOffset*/pointOffset); - break; - case 1: - DefinePoint(/*U*/fxpParam - (fxpPerpParam+1/*round*/)/deriv,// we know this fixed point math won't over/underflow - /*V*/fxpPerpParam, - /*pointStorageOffset*/pointOffset); - break; - case 2: - DefinePoint(/*U*/fxpParam - (fxpPerpParam+1/*round*/)/deriv,// we know this fixed point math won't over/underflow - /*V*/FXP_ONE - (fxpParam - (fxpPerpParam+1/*round*/)/deriv) - fxpPerpParam,// we know this fixed point math won't over/underflow - /*pointStorageOffset*/pointOffset); - break; - } - } - } - } - if( !Odd() ) - { - // Last point is the point at the center. 
- DefinePoint(/*U*/FXP_ONE_THIRD, - /*V*/FXP_ONE_THIRD, - /*pointStorageOffset*/pointOffset); - } -} -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::TriGenerateConnectivity -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::TriGenerateConnectivity( const PROCESSED_TESS_FACTORS_TRI& processedTessFactors ) -{ - // Generate primitives for all the concentric rings, one side at a time for each ring - static const int startRing = 1; - int numRings = ((processedTessFactors.numPointsForInsideTessFactor+1) >> 1); // +1 is so even tess includes the center point, which we want to now - const TESS_FACTOR_CONTEXT* outsideTessFactorCtx[TRI_EDGES] = {&processedTessFactors.outsideTessFactorCtx[Ueq0], - &processedTessFactors.outsideTessFactorCtx[Veq0], - &processedTessFactors.outsideTessFactorCtx[Weq0]}; - TESSELLATOR_PARITY outsideTessFactorParity[TRI_EDGES] = {processedTessFactors.outsideTessFactorParity[Ueq0], - processedTessFactors.outsideTessFactorParity[Veq0], - processedTessFactors.outsideTessFactorParity[Weq0]}; - int numPointsForOutsideEdge[TRI_EDGES] = {processedTessFactors.numPointsForOutsideEdge[Ueq0], - processedTessFactors.numPointsForOutsideEdge[Veq0], - processedTessFactors.numPointsForOutsideEdge[Weq0]}; - - int insideEdgePointBaseOffset = processedTessFactors.insideEdgePointBaseOffset; - int outsideEdgePointBaseOffset = 0; - int edge; - for(int ring = startRing; ring < numRings; ring++) - { - int numPointsForInsideEdge = processedTessFactors.numPointsForInsideTessFactor - 2*ring; - int edge0InsidePointBaseOffset = insideEdgePointBaseOffset; - int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; - for(edge = 0; edge < TRI_EDGES; edge++ ) - { - int numTriangles = numPointsForInsideEdge + numPointsForOutsideEdge[edge] - 2; - - int insideBaseOffset; - 
int outsideBaseOffset; - if( edge == 2 ) - { - m_IndexPatchContext.insidePointIndexDeltaToRealValue = insideEdgePointBaseOffset; - m_IndexPatchContext.insidePointIndexBadValue = numPointsForInsideEdge - 1; - m_IndexPatchContext.insidePointIndexReplacementValue = edge0InsidePointBaseOffset; - m_IndexPatchContext.outsidePointIndexPatchBase = m_IndexPatchContext.insidePointIndexBadValue+1; // past inside patched index range - m_IndexPatchContext.outsidePointIndexDeltaToRealValue = outsideEdgePointBaseOffset - - m_IndexPatchContext.outsidePointIndexPatchBase; - m_IndexPatchContext.outsidePointIndexBadValue = m_IndexPatchContext.outsidePointIndexPatchBase - + numPointsForOutsideEdge[edge] - 1; - m_IndexPatchContext.outsidePointIndexReplacementValue = edge0OutsidePointBaseOffset; - SetUsingPatchedIndices(true); - insideBaseOffset = 0; - outsideBaseOffset = m_IndexPatchContext.outsidePointIndexPatchBase; - } - else - { - insideBaseOffset = insideEdgePointBaseOffset; - outsideBaseOffset = outsideEdgePointBaseOffset; - } - if( ring == startRing ) - { - StitchTransition(/*baseIndexOffset: */m_NumIndices, - insideBaseOffset,processedTessFactors.insideTessFactorCtx.numHalfTessFactorPoints,processedTessFactors.insideTessFactorParity, - outsideBaseOffset,outsideTessFactorCtx[edge]->numHalfTessFactorPoints,outsideTessFactorParity[edge]); - } - else - { - StitchRegular(/*bTrapezoid*/true, DIAGONALS_MIRRORED, - /*baseIndexOffset: */m_NumIndices, - numPointsForInsideEdge, - insideBaseOffset,outsideBaseOffset); - } - if( 2 == edge ) - { - SetUsingPatchedIndices(false); - } - m_NumIndices += numTriangles*3; - outsideEdgePointBaseOffset += numPointsForOutsideEdge[edge] - 1; - insideEdgePointBaseOffset += numPointsForInsideEdge - 1; - numPointsForOutsideEdge[edge] = numPointsForInsideEdge; - } - if( startRing == ring ) - { - for(edge = 0; edge < TRI_EDGES; edge++ ) - { - outsideTessFactorCtx[edge] = &processedTessFactors.insideTessFactorCtx; - outsideTessFactorParity[edge] = 
processedTessFactors.insideTessFactorParity; - } - } - } - if( Odd() ) - { - // Triangulate center (a single triangle) - DefineClockwiseTriangle(outsideEdgePointBaseOffset, outsideEdgePointBaseOffset+1, outsideEdgePointBaseOffset+2, - m_NumIndices); - m_NumIndices += 3; - } -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::TessellateIsoLineDomain -// User calls this. -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::TessellateIsoLineDomain( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail ) -{ - PROCESSED_TESS_FACTORS_ISOLINE processedTessFactors; - IsoLineProcessTessFactors(TessFactor_V_LineDensity,TessFactor_U_LineDetail,processedTessFactors); - if( processedTessFactors.bPatchCulled ) - { - m_NumPoints = 0; - m_NumIndices = 0; - return; - } - IsoLineGeneratePoints(processedTessFactors); - IsoLineGenerateConnectivity(processedTessFactors); // can be done in parallel to IsoLineGeneratePoints -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::IsoLineProcessTessFactors -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::IsoLineProcessTessFactors( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail, - PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors ) -{ - // Is the patch culled? 
- if( !(TessFactor_V_LineDensity > 0) || // NaN will pass - !(TessFactor_U_LineDetail > 0) ) - { - processedTessFactors.bPatchCulled = true; - return; - } - else - { - processedTessFactors.bPatchCulled = false; - } - - // Clamp edge TessFactors - float lowerBound = 0.0, upperBound = 0.0; - switch(m_originalPartitioning) - { - case D3D11_TESSELLATOR_PARTITIONING_INTEGER: - case D3D11_TESSELLATOR_PARTITIONING_POW2: // don�t care about pow2 distinction for validation, just treat as integer - lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR; - upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR; - break; - - case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN: - lowerBound = D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR; - upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR; - break; - - case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD: - lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR; - upperBound = D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR; - break; - } - - TessFactor_V_LineDensity = tess_fmin( D3D11_TESSELLATOR_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR, - tess_fmax( D3D11_TESSELLATOR_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR, TessFactor_V_LineDensity ) ); - TessFactor_U_LineDetail = tess_fmin( upperBound, tess_fmax( lowerBound, TessFactor_U_LineDetail ) ); - - // Reset our vertex and index buffers. We have enough storage for the max tessFactor. - m_NumPoints = 0; - m_NumIndices = 0; - - // Process tessFactors - if( HWIntegerPartitioning() ) - { - TessFactor_U_LineDetail = ceil(TessFactor_U_LineDetail); - processedTessFactors.lineDetailParity = isEven(TessFactor_U_LineDetail) ? 
TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; - } - else - { - processedTessFactors.lineDetailParity = m_originalParity; - } - - FXP fxpTessFactor_U_LineDetail = floatToFixed(TessFactor_U_LineDetail); - - SetTessellationParity(processedTessFactors.lineDetailParity); - - ComputeTessFactorContext(fxpTessFactor_U_LineDetail, processedTessFactors.lineDetailTessFactorCtx); - processedTessFactors.numPointsPerLine = NumPointsForTessFactor(fxpTessFactor_U_LineDetail); - - OverridePartitioning(D3D11_TESSELLATOR_PARTITIONING_INTEGER); - - TessFactor_V_LineDensity = ceil(TessFactor_V_LineDensity); - processedTessFactors.lineDensityParity = isEven(TessFactor_V_LineDensity) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; - SetTessellationParity(processedTessFactors.lineDensityParity); - FXP fxpTessFactor_V_LineDensity = floatToFixed(TessFactor_V_LineDensity); - ComputeTessFactorContext(fxpTessFactor_V_LineDensity, processedTessFactors.lineDensityTessFactorCtx); - - processedTessFactors.numLines = NumPointsForTessFactor(fxpTessFactor_V_LineDensity) - 1; // don't draw last line at V == 1. - - RestorePartitioning(); - - // Compute some initial data. 
- - // outside edge offsets - m_NumPoints = processedTessFactors.numPointsPerLine * processedTessFactors.numLines; - if( m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_POINT ) - { - m_NumIndices = m_NumPoints; - } - else // line - { - m_NumIndices = processedTessFactors.numLines*(processedTessFactors.numPointsPerLine-1)*2; - } -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::IsoLineGeneratePoints -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::IsoLineGeneratePoints( const PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors ) -{ - int line, pointOffset; - for(line = 0, pointOffset = 0; line < processedTessFactors.numLines; line++) - { - for(int point = 0; point < processedTessFactors.numPointsPerLine; point++) - { - FXP fxpU,fxpV; - SetTessellationParity(processedTessFactors.lineDensityParity); - PlacePointIn1D(processedTessFactors.lineDensityTessFactorCtx,line,fxpV); - - SetTessellationParity(processedTessFactors.lineDetailParity); - PlacePointIn1D(processedTessFactors.lineDetailTessFactorCtx,point,fxpU); - - DefinePoint(fxpU,fxpV,pointOffset++); - } - } -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::IsoLineGenerateConnectivity -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::IsoLineGenerateConnectivity( const PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors ) -{ - int line, pointOffset, indexOffset; - if( m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_POINT ) - { - for(line = 0, pointOffset = 0, indexOffset = 0; line < processedTessFactors.numLines; line++) - { - for(int point = 0; point < processedTessFactors.numPointsPerLine; point++) 
- { - DefineIndex(pointOffset++,indexOffset++); - } - } - } - else // line - { - for(line = 0, pointOffset = 0, indexOffset = 0; line < processedTessFactors.numLines; line++) - { - for(int point = 0; point < processedTessFactors.numPointsPerLine; point++) - { - if( point > 0 ) - { - DefineIndex(pointOffset-1,indexOffset++); - DefineIndex(pointOffset,indexOffset++); - } - pointOffset++; - } - } - } -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::GetPointCount -// User calls this. -//--------------------------------------------------------------------------------------------------------------------------------- -int CHWTessellator::GetPointCount() -{ - return m_NumPoints; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::GetIndexCount() -// User calls this. -//--------------------------------------------------------------------------------------------------------------------------------- -int CHWTessellator::GetIndexCount() -{ - return m_NumIndices; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::GetPoints() -// User calls this. -//--------------------------------------------------------------------------------------------------------------------------------- -DOMAIN_POINT* CHWTessellator::GetPoints() -{ - return m_Point; -} -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::GetIndices() -// User calls this. 
-//--------------------------------------------------------------------------------------------------------------------------------- -int* CHWTessellator::GetIndices() -{ - return m_Index; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::DefinePoint() -//--------------------------------------------------------------------------------------------------------------------------------- -int CHWTessellator::DefinePoint(FXP fxpU, FXP fxpV, int pointStorageOffset) -{ -// WCHAR foo[80]; -// StringCchPrintf(foo,80,L"off:%d, uv=(%f,%f)\n",pointStorageOffset,fixedToFloat(fxpU),fixedToFloat(fxpV)); -// OutputDebugString(foo); - m_Point[pointStorageOffset].u = fixedToFloat(fxpU); - m_Point[pointStorageOffset].v = fixedToFloat(fxpV); - return pointStorageOffset; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::DefineIndex() -//-------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::DefineIndex(int index, int indexStorageOffset) -{ - index = PatchIndexValue(index); -// WCHAR foo[80]; -// StringCchPrintf(foo,80,L"off:%d, idx=%d, uv=(%f,%f)\n",indexStorageOffset,index,m_Point[index].u,m_Point[index].v); -// OutputDebugString(foo); - m_Index[indexStorageOffset] = index; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::DefineClockwiseTriangle() -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::DefineClockwiseTriangle(int index0, int index1, int index2, int indexStorageBaseOffset) -{ - // inputs a clockwise triangle, stores a CW or CCW triangle depending on the state 
- DefineIndex(index0,indexStorageBaseOffset); - bool bWantClockwise = (m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW) ? true : false; - if( bWantClockwise ) - { - DefineIndex(index1,indexStorageBaseOffset+1); - DefineIndex(index2,indexStorageBaseOffset+2); - } - else - { - DefineIndex(index2,indexStorageBaseOffset+1); - DefineIndex(index1,indexStorageBaseOffset+2); - } -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::DumpAllPoints() -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::DumpAllPoints() -{ - for( int p = 0; p < m_NumPoints; p++ ) - { - DefineIndex(p,m_NumIndices++); - } -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::DumpAllPointsAsInOrderLineList() -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::DumpAllPointsAsInOrderLineList() -{ - for( int p = 1; p < m_NumPoints; p++ ) - { - DefineIndex(p-1,m_NumIndices++); - DefineIndex(p,m_NumIndices++); - } -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// RemoveMSB -//--------------------------------------------------------------------------------------------------------------------------------- -int RemoveMSB(int val) -{ - int check; - if( val <= 0x0000ffff ) { check = ( val <= 0x000000ff ) ? 0x00000080 : 0x00008000; } - else { check = ( val <= 0x00ffffff ) ? 
0x00800000 : 0x80000000; } - for( int i = 0; i < 8; i++, check >>= 1 ) { if( val & check ) return (val & ~check); } - return 0; -} -//--------------------------------------------------------------------------------------------------------------------------------- -// GetMSB -//--------------------------------------------------------------------------------------------------------------------------------- -int GetMSB(int val) -{ - int check; - if( val <= 0x0000ffff ) { check = ( val <= 0x000000ff ) ? 0x00000080 : 0x00008000; } - else { check = ( val <= 0x00ffffff ) ? 0x00800000 : 0x80000000; } - for( int i = 0; i < 8; i++, check >>= 1 ) { if( val & check ) return check; } - return 0; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::CleanseParameter() -//--------------------------------------------------------------------------------------------------------------------------------- -/* NOTHING TO DO FOR FIXED POINT ARITHMETIC! -void CHWTessellator::CleanseParameter(float& parameter) -{ - // Clean up [0..1] parameter to guarantee that (1 - (1 - parameter)) == parameter. 
- parameter = 1.0f - parameter; - parameter = 1.0f - parameter; - -} -*/ -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::NumPointsForTessFactor() -//--------------------------------------------------------------------------------------------------------------------------------- -int CHWTessellator::NumPointsForTessFactor( FXP fxpTessFactor ) -{ - int numPoints; - if( Odd() ) - { - numPoints = (fxpCeil(FXP_ONE_HALF + (fxpTessFactor+1/*round*/)/2)*2)>>FXP_FRACTION_BITS; - } - else - { - numPoints = ((fxpCeil((fxpTessFactor+1/*round*/)/2)*2)>>FXP_FRACTION_BITS)+1; - } - return numPoints; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::ComputeTessFactorContext() -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::ComputeTessFactorContext( FXP fxpTessFactor, TESS_FACTOR_CONTEXT& TessFactorCtx ) -{ - FXP fxpHalfTessFactor = (fxpTessFactor+1/*round*/)/2; - if( Odd() || (fxpHalfTessFactor == FXP_ONE_HALF)) // fxpHalfTessFactor == 1/2 if TessFactor is 1, but we're pretending we are even. 
- { - fxpHalfTessFactor += FXP_ONE_HALF; - } - FXP fxpFloorHalfTessFactor = fxpFloor(fxpHalfTessFactor); - FXP fxpCeilHalfTessFactor = fxpCeil(fxpHalfTessFactor); - TessFactorCtx.fxpHalfTessFactorFraction = fxpHalfTessFactor - fxpFloorHalfTessFactor; - //CleanseParameter(TessFactorCtx.fxpHalfTessFactorFraction); - TessFactorCtx.numHalfTessFactorPoints = (fxpCeilHalfTessFactor>>FXP_FRACTION_BITS); // for EVEN, we don't include the point always fixed at the midpoint of the TessFactor - if( fxpCeilHalfTessFactor == fxpFloorHalfTessFactor ) - { - TessFactorCtx.splitPointOnFloorHalfTessFactor = /*pick value to cause this to be ignored*/ TessFactorCtx.numHalfTessFactorPoints+1; - } - else if( Odd() ) - { - if( fxpFloorHalfTessFactor == FXP_ONE ) - { - TessFactorCtx.splitPointOnFloorHalfTessFactor = 0; - } - else - { -#ifdef ALLOW_XBOX_360_COMPARISON - if( m_bXBox360Mode ) - TessFactorCtx.splitPointOnFloorHalfTessFactor = TessFactorCtx.numHalfTessFactorPoints-2; - else -#endif - TessFactorCtx.splitPointOnFloorHalfTessFactor = (RemoveMSB((fxpFloorHalfTessFactor>>FXP_FRACTION_BITS)-1)<<1) + 1; - } - } - else - { -#ifdef ALLOW_XBOX_360_COMPARISON - if( m_bXBox360Mode ) - TessFactorCtx.splitPointOnFloorHalfTessFactor = TessFactorCtx.numHalfTessFactorPoints-1; - else -#endif - TessFactorCtx.splitPointOnFloorHalfTessFactor = (RemoveMSB(fxpFloorHalfTessFactor>>FXP_FRACTION_BITS)<<1) + 1; - } - int numFloorSegments = (fxpFloorHalfTessFactor * 2)>>FXP_FRACTION_BITS; - int numCeilSegments = (fxpCeilHalfTessFactor * 2)>>FXP_FRACTION_BITS; - if( Odd() ) - { - numFloorSegments -= 1; - numCeilSegments -= 1; - } - TessFactorCtx.fxpInvNumSegmentsOnFloorTessFactor = s_fixedReciprocal[numFloorSegments]; - TessFactorCtx.fxpInvNumSegmentsOnCeilTessFactor = s_fixedReciprocal[numCeilSegments]; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::PlacePointIn1D() 
-//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::PlacePointIn1D( const TESS_FACTOR_CONTEXT& TessFactorCtx, int point, FXP& fxpLocation ) -{ - bool bFlip; - if( point >= TessFactorCtx.numHalfTessFactorPoints ) - { - point = (TessFactorCtx.numHalfTessFactorPoints << 1) - point; - if( Odd() ) - { - point -= 1; - } - bFlip = true; - } - else - { - bFlip = false; - } - if( point == TessFactorCtx.numHalfTessFactorPoints ) - { - fxpLocation = FXP_ONE_HALF; // special casing middle since 16 bit fixed math below can't reproduce 0.5 exactly - return; - } - unsigned int indexOnCeilHalfTessFactor = point; - unsigned int indexOnFloorHalfTessFactor = indexOnCeilHalfTessFactor; - if( point > TessFactorCtx.splitPointOnFloorHalfTessFactor ) - { - indexOnFloorHalfTessFactor -= 1; - } - // For the fixed point multiplies below, we know the results are <= 16 bits because - // the locations on the halfTessFactor are <= half the number of segments for the total TessFactor. - // So a number divided by a number that is at least twice as big will give - // a result no bigger than 0.5 (which in fixed point is 16 bits in our case) - FXP fxpLocationOnFloorHalfTessFactor = indexOnFloorHalfTessFactor * TessFactorCtx.fxpInvNumSegmentsOnFloorTessFactor; - FXP fxpLocationOnCeilHalfTessFactor = indexOnCeilHalfTessFactor * TessFactorCtx.fxpInvNumSegmentsOnCeilTessFactor; - - // Since we know the numbers calculated above are <= fixed point 0.5, and the equation - // below is just lerping between two values <= fixed point 0.5 (0x00008000), then we know - // that the final result before shifting by 16 bits is no larger than 0x80000000. 
Once we - // shift that down by 16, we get the result of lerping 2 numbers <= 0.5, which is obviously - // at most 0.5 (0x00008000) - fxpLocation = fxpLocationOnFloorHalfTessFactor * (FXP_ONE - TessFactorCtx.fxpHalfTessFactorFraction) + - fxpLocationOnCeilHalfTessFactor * (TessFactorCtx.fxpHalfTessFactorFraction); - fxpLocation = (fxpLocation + FXP_ONE_HALF/*round*/) >> FXP_FRACTION_BITS; // get back to n.16 - /* Commenting out floating point version. Note the parameter cleansing it does is not needed in fixed point. - if( bFlip ) - location = 1.0f - location; // complement produces cleansed result. - else - CleanseParameter(location); - */ - if( bFlip ) - { - fxpLocation = FXP_ONE - fxpLocation; - } -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHWTessellator::StitchRegular -//--------------------------------------------------------------------------------------------------------------------------------- -void CHWTessellator::StitchRegular(bool bTrapezoid,DIAGONALS diagonals, - int baseIndexOffset, int numInsideEdgePoints, - int insideEdgePointBaseOffset, int outsideEdgePointBaseOffset) -{ - int insidePoint = insideEdgePointBaseOffset; - int outsidePoint = outsideEdgePointBaseOffset; - if( bTrapezoid ) - { - DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset); - baseIndexOffset += 3; outsidePoint++; - } - int p; - switch( diagonals ) - { - case DIAGONALS_INSIDE_TO_OUTSIDE: - // Diagonals pointing from inside edge forward towards outside edge - for( p = 0; p < numInsideEdgePoints-1; p++ ) - { - DefineClockwiseTriangle(insidePoint,outsidePoint,outsidePoint+1,baseIndexOffset); - baseIndexOffset += 3; - - DefineClockwiseTriangle(insidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; outsidePoint++; - } - break; - case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE: // Assumes ODD tessellation - // 
Diagonals pointing from outside edge forward towards inside edge - - // First half - for( p = 0; p < numInsideEdgePoints/2-1; p++ ) - { - DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset); - baseIndexOffset += 3; - DefineClockwiseTriangle(insidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; outsidePoint++; - } - - // Middle - DefineClockwiseTriangle(outsidePoint,insidePoint+1,insidePoint,baseIndexOffset); - baseIndexOffset += 3; - DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; outsidePoint++; p+=2; - - // Second half - for( ; p < numInsideEdgePoints; p++ ) - { - DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset); - baseIndexOffset += 3; - DefineClockwiseTriangle(insidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; outsidePoint++; - } - break; - case DIAGONALS_MIRRORED: - // First half, diagonals pointing from outside of outside edge to inside of inside edge - for( p = 0; p < numInsideEdgePoints/2; p++ ) - { - DefineClockwiseTriangle(outsidePoint,insidePoint+1,insidePoint,baseIndexOffset); - baseIndexOffset += 3; - DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; outsidePoint++; - } - // Second half, diagonals pointing from inside of inside edge to outside of outside edge - for( ; p < numInsideEdgePoints-1; p++ ) - { - DefineClockwiseTriangle(insidePoint,outsidePoint,outsidePoint+1,baseIndexOffset); - baseIndexOffset += 3; - DefineClockwiseTriangle(insidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; outsidePoint++; - } - break; - } - if( bTrapezoid ) - { - DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset); - baseIndexOffset += 3; - } -} - 
//---------------------------------------------------------------------------------------------------------------------------------
// CHWTessellator::StitchTransition()
// Emits the triangles joining an inside row and an outside row of points whose TessFactors
// may differ. The first half of the edge is walked in table order, a parity-dependent
// middle piece is emitted, and the second half is walked in the reverse order.
// Triangles are written starting at baseIndexOffset, three indices each.
//---------------------------------------------------------------------------------------------------------------------------------
void CHWTessellator::StitchTransition(int baseIndexOffset,
                                      int insideEdgePointBaseOffset, int insideNumHalfTessFactorPoints,
                                      TESSELLATOR_PARITY insideEdgeTessFactorParity,
                                      int outsideEdgePointBaseOffset, int outsideNumHalfTessFactorPoints,
                                      TESSELLATOR_PARITY outsideTessFactorParity
)
{

#ifdef ALLOW_XBOX_360_COMPARISON
    // Tables to assist in the stitching of 2 rows of points having arbitrary TessFactors.
    // The stitching order is governed by Ruler Function vertex split ordering (see external documentation).
    //
    // The contents of the finalPointPositionTable are where vertex i [0..32] ends up on the half-edge
    // at the max tessellation amount given ruler-function split order.
    // Recall the other half of an edge is mirrored, so we only need to deal with one half.
    // This table is used to decide when to advance a point on the interior or exterior.
    // It supports odd TessFactor up to 65 and even TessFactor up to 64.
    static const int _finalPointPositionTable[33] =
            { 0, 32, 16, 8, 17, 4, 18, 9, 19, 2, 20, 10, 21, 5, 22, 11, 23,
              1, 24, 12, 25, 6, 26, 13, 27, 3, 28, 14, 29, 7, 30, 15, 31 };
    // The loopStart and loopEnd tables below just provide optimal loop bounds for the
    // stitching algorithm further below, for any given halfTessFactor.
    // There is probably a better way to encode this...

    // loopStart[halfTessFactor] encodes the FIRST entry other than [0] in finalPointPositionTable[] above which is
    // less than halfTessFactor.  Exceptions are entry 0 and 1, which are set up to skip the loop.
    static const int _loopStart[33] =
            {1,1,17,9,9,5,5,5,5,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2};
    // loopEnd[halfTessFactor] encodes the LAST entry in finalPointPositionTable[] above which is
    // less than halfTessFactor.  Exceptions are entry 0 and 1, which are set up to skip the loop.
    static const int _loopEnd[33] =
            {0,0,17,17,25,25,25,25,29,29,29,29,29,29,29,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,32};
    const int* finalPointPositionTable;
    const int* loopStart;
    const int* loopEnd;
    if( m_bXBox360Mode )
    {
        // The XBox360 vertex introduction order is always from the center of the edge.
        // So the final positions of points on the half-edge are this trivial table.
        static const int XBOXfinalPointPositionTable[33] =
                { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
                  18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 };
        // loopStart and loopEnd (meaning described above) also become trivial for XBox360 splitting.
        static const int XBOXloopStart[33] =
                {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
        static const int XBOXloopEnd[33] =
                {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};

        finalPointPositionTable = XBOXfinalPointPositionTable;
        loopStart = XBOXloopStart;
        loopEnd = XBOXloopEnd;
    }
    else
    {
        finalPointPositionTable = _finalPointPositionTable;
        loopStart = _loopStart;
        loopEnd =_loopEnd;
    }
#else
    // Tables to assist in the stitching of 2 rows of points having arbitrary TessFactors.
    // The stitching order is governed by Ruler Function vertex split ordering (see external documentation).
    //
    // The contents of the finalPointPositionTable are where vertex i [0..32] ends up on the half-edge
    // at the max tessellation amount given ruler-function split order.
    // Recall the other half of an edge is mirrored, so we only need to deal with one half.
    // This table is used to decide when to advance a point on the interior or exterior.
    // It supports odd TessFactor up to 65 and even TessFactor up to 64.
    static const int finalPointPositionTable[33] =
            { 0, 32, 16, 8, 17, 4, 18, 9, 19, 2, 20, 10, 21, 5, 22, 11, 23,
              1, 24, 12, 25, 6, 26, 13, 27, 3, 28, 14, 29, 7, 30, 15, 31 };

    // The loopStart and loopEnd tables below just provide optimal loop bounds for the
    // stitching algorithm further below, for any given halfTessFactor.
    // There is probably a better way to encode this...

    // loopStart[halfTessFactor] encodes the FIRST entry in finalPointPositionTable[] above which is
    // less than halfTessFactor.  Exceptions are entry 0 and 1, which are set up to skip the loop.
    static const int loopStart[33] =
            {1,1,17,9,9,5,5,5,5,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2};
    // loopEnd[halfTessFactor] encodes the LAST entry in finalPointPositionTable[] above which is
    // less than halfTessFactor.  Exceptions are entry 0 and 1, which are set up to skip the loop.
    static const int loopEnd[33] =
            {0,0,17,17,25,25,25,25,29,29,29,29,29,29,29,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,32};
#endif
    // For odd parity the half-point counts are reduced by one before they are used as
    // table indices and comparison thresholds below.
    if( TESSELLATOR_PARITY_ODD == insideEdgeTessFactorParity )
    {
        insideNumHalfTessFactorPoints -= 1;
    }
    if( TESSELLATOR_PARITY_ODD == outsideTessFactorParity )
    {
        outsideNumHalfTessFactorPoints -= 1;
    }
    // Walk first half
    int outsidePoint = outsideEdgePointBaseOffset;
    int insidePoint = insideEdgePointBaseOffset;

    // iStart,iEnd are a small optimization so the loop below doesn't have to go from 0 up to 31
    int iStart = min(loopStart[insideNumHalfTessFactorPoints],loopStart[outsideNumHalfTessFactorPoints]);
    int iEnd = max(loopEnd[insideNumHalfTessFactorPoints],loopEnd[outsideNumHalfTessFactorPoints]);

    if( finalPointPositionTable[0] < outsideNumHalfTessFactorPoints ) // since we don't start the loop at 0 below, we need a special case.
    {
        // Advance outside
        DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset);
        baseIndexOffset += 3; outsidePoint++;
    }

    for(int i = iStart; i <= iEnd; i++)
    {
        if( /*(i>0) && <-- not needed since iStart is never 0*/(finalPointPositionTable[i] < insideNumHalfTessFactorPoints))
        {
            // Advance inside
            DefineClockwiseTriangle(insidePoint,outsidePoint,insidePoint+1,baseIndexOffset);
            baseIndexOffset += 3; insidePoint++;
        }
        if((finalPointPositionTable[i] < outsideNumHalfTessFactorPoints))
        {
            // Advance outside
            DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset);
            baseIndexOffset += 3; outsidePoint++;
        }
    }

    // Middle piece: nothing is emitted when both parities are even; otherwise a quad
    // (matching parities, i.e. both odd) or a single triangle (differing parities).
    if( (insideEdgeTessFactorParity != outsideTessFactorParity) || (insideEdgeTessFactorParity == TESSELLATOR_PARITY_ODD))
    {
        if( insideEdgeTessFactorParity == outsideTessFactorParity )
        {
            // Quad in the middle
            DefineClockwiseTriangle(insidePoint,outsidePoint,insidePoint+1,baseIndexOffset);
            baseIndexOffset += 3;
            DefineClockwiseTriangle(insidePoint+1,outsidePoint,outsidePoint+1,baseIndexOffset);
            baseIndexOffset += 3;
            insidePoint++;
            outsidePoint++;
        }
        else if( TESSELLATOR_PARITY_EVEN == insideEdgeTessFactorParity )
        {
            // Triangle pointing inside
            DefineClockwiseTriangle(insidePoint,outsidePoint,outsidePoint+1,baseIndexOffset);
            baseIndexOffset += 3;
            outsidePoint++;
        }
        else
        {
            // Triangle pointing outside
            DefineClockwiseTriangle(insidePoint,outsidePoint,insidePoint+1,baseIndexOffset);
            baseIndexOffset += 3;
            insidePoint++;
        }
    }

    // Walk second half.
    // Same table, reverse order, and outside is advanced before inside (mirror of the first half).
    for(int i = iEnd; i >= iStart; i--)
    {
        if((finalPointPositionTable[i] < outsideNumHalfTessFactorPoints))
        {
            // Advance outside
            DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset);
            baseIndexOffset += 3; outsidePoint++;
        }
        if( /*(i>0) && <-- not needed since iStart is never 0*/ (finalPointPositionTable[i] < insideNumHalfTessFactorPoints))
        {
            // Advance inside
            DefineClockwiseTriangle(insidePoint,outsidePoint,insidePoint+1,baseIndexOffset);
            baseIndexOffset += 3; insidePoint++;
        }
    }
    // Below case is not needed if we didn't optimize loop above and made it run from 31 down to 0.
    if((finalPointPositionTable[0] < outsideNumHalfTessFactorPoints))
    {
        DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset);
        baseIndexOffset += 3; outsidePoint++;
    }
}

//---------------------------------------------------------------------------------------------------------------------------------
// CHWTessellator::PatchIndexValue()
// Remaps an index before it is stored. With m_bUsingPatchedIndices set, m_IndexPatchContext is
// applied (one "bad" value is replaced, all others are offset by a delta, separately for the
// outside and inside ranges). Otherwise, with m_bUsingPatchedIndices2 set, m_IndexPatchContext2
// is applied (indices at or above baseIndexToInvert are mirrored around indexInversionEndPoint,
// and the corner-case value is replaced). With neither flag set the index is returned unchanged.
//---------------------------------------------------------------------------------------------------------------------------------
int CHWTessellator::PatchIndexValue(int index)
{
    if( m_bUsingPatchedIndices )
    {
        if( index >= m_IndexPatchContext.outsidePointIndexPatchBase ) // assumed remapped outside indices are > remapped inside vertices
        {
            if( index == m_IndexPatchContext.outsidePointIndexBadValue )
                index = m_IndexPatchContext.outsidePointIndexReplacementValue;
            else
                index += m_IndexPatchContext.outsidePointIndexDeltaToRealValue;
        }
        else
        {
            if( index == m_IndexPatchContext.insidePointIndexBadValue )
                index = m_IndexPatchContext.insidePointIndexReplacementValue;
            else
                index += m_IndexPatchContext.insidePointIndexDeltaToRealValue;
        }
    }
    else if( m_bUsingPatchedIndices2 )
    {
        if( index >= m_IndexPatchContext2.baseIndexToInvert )
        {
            if( index == m_IndexPatchContext2.cornerCaseBadValue )
            {
                index = m_IndexPatchContext2.cornerCaseReplacementValue;
            }
            else
            {
                index = m_IndexPatchContext2.indexInversionEndPoint - index;
            }
        }
        else if( index == m_IndexPatchContext2.cornerCaseBadValue )
        {
            index = m_IndexPatchContext2.cornerCaseReplacementValue;
        }
    }
    return index;
}


//=================================================================================================================================
// CHLSLTessellator
//=================================================================================================================================

//---------------------------------------------------------------------------------------------------------------------------------
// CHLSLTessellator::CHLSLTessellator
// Zeroes the six cached m_LastComputedTessFactors entries.
//---------------------------------------------------------------------------------------------------------------------------------
CHLSLTessellator::CHLSLTessellator()
{
    m_LastComputedTessFactors[0] = m_LastComputedTessFactors[1] = m_LastComputedTessFactors[2] =
    m_LastComputedTessFactors[3] = m_LastComputedTessFactors[4] = m_LastComputedTessFactors[5] = 0;
}

//---------------------------------------------------------------------------------------------------------------------------------
// CHLSLTessellator::Init
// User calls this.
//---------------------------------------------------------------------------------------------------------------------------------
// Initializes the base tessellator with (partitioning, outputPrimitive), clears the cached
// TessFactors and records the partitioning, parity, output primitive and inside-TessFactor
// reduction settings used by the *HLSLProcessTessFactors() functions.
void CHLSLTessellator::Init(
    D3D11_TESSELLATOR_PARTITIONING       partitioning,
    D3D11_TESSELLATOR_REDUCTION          insideTessFactorReduction,
    D3D11_TESSELLATOR_QUAD_REDUCTION_AXIS quadInsideTessFactorReductionAxis,
    D3D11_TESSELLATOR_OUTPUT_PRIMITIVE   outputPrimitive)
{
    CHWTessellator::Init(partitioning,outputPrimitive);
    m_LastComputedTessFactors[0] = m_LastComputedTessFactors[1] = m_LastComputedTessFactors[2] =
    m_LastComputedTessFactors[3] = m_LastComputedTessFactors[4] = m_LastComputedTessFactors[5] = 0;
    m_partitioning = partitioning;
    m_originalPartitioning = partitioning;
    switch( partitioning )
    {
    // NOTE(review): m_parity is not assigned here for integer/other partitioning; presumably
    // CHWTessellator::Init() above has already set it -- confirm.
    case D3D11_TESSELLATOR_PARTITIONING_INTEGER:
    default:
        break;
    case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD:
        m_parity = TESSELLATOR_PARITY_ODD;
        break;
    case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN:
        m_parity = TESSELLATOR_PARITY_EVEN;
        break;
    }
    m_originalParity = m_parity;
    m_outputPrimitive = outputPrimitive;
    m_insideTessFactorReduction = insideTessFactorReduction;
    m_quadInsideTessFactorReductionAxis = quadInsideTessFactorReductionAxis;
}
//---------------------------------------------------------------------------------------------------------------------------------
// CHLSLTessellator::TessellateQuadDomain
// User calls this
// Derives the six final TessFactors with QuadHLSLProcessTessFactors() and passes them to
// CHWTessellator::TessellateQuadDomain().
//---------------------------------------------------------------------------------------------------------------------------------
void CHLSLTessellator::TessellateQuadDomain( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Ueq1, float tessFactor_Veq1,
                                         float insideTessFactorScaleU, float insideTessFactorScaleV )
{
    QuadHLSLProcessTessFactors(tessFactor_Ueq0,tessFactor_Veq0,tessFactor_Ueq1,tessFactor_Veq1,insideTessFactorScaleU,insideTessFactorScaleV);

    CHWTessellator::TessellateQuadDomain(m_LastComputedTessFactors[0],m_LastComputedTessFactors[1],m_LastComputedTessFactors[2],m_LastComputedTessFactors[3],
                                         m_LastComputedTessFactors[4],m_LastComputedTessFactors[5]);
}

//---------------------------------------------------------------------------------------------------------------------------------
// CHLSLTessellator::QuadHLSLProcessTessFactors
// Turns the four user outside TessFactors and the two inside scale factors into the six
// TessFactors stored in m_LastComputedTessFactors ([0..3] outside edges, [4..5] inside U/V).
// The values before rounding are kept in m_LastUnRoundedComputedTessFactors.
//---------------------------------------------------------------------------------------------------------------------------------
void CHLSLTessellator::QuadHLSLProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Ueq1, float tessFactor_Veq1,
                                               float insideTessFactorScaleU, float insideTessFactorScaleV )
{
    // Early out: any outside TessFactor that is not > 0 zeroes all computed TessFactors.
    if( !(tessFactor_Ueq0 > 0) ||// NaN fails the "> 0" comparison, so NaN also takes this branch
        !(tessFactor_Veq0 > 0) ||
        !(tessFactor_Ueq1 > 0) ||
        !(tessFactor_Veq1 > 0) )
    {
        m_LastUnRoundedComputedTessFactors[0] = tessFactor_Ueq0;
        m_LastUnRoundedComputedTessFactors[1] = tessFactor_Veq0;
        m_LastUnRoundedComputedTessFactors[2] = tessFactor_Ueq1;
        m_LastUnRoundedComputedTessFactors[3] = tessFactor_Veq1;
        m_LastUnRoundedComputedTessFactors[4] = 0;
        m_LastUnRoundedComputedTessFactors[5] = 0;
        m_LastComputedTessFactors[0] =
        m_LastComputedTessFactors[1] =
        m_LastComputedTessFactors[2] =
        m_LastComputedTessFactors[3] =
        m_LastComputedTessFactors[4] =
        m_LastComputedTessFactors[5] = 0;
        return;
    }

    CleanupFloatTessFactor(tessFactor_Ueq0);// clamp to [1.0f..INF], NaN->1.0f
    CleanupFloatTessFactor(tessFactor_Veq0);
    CleanupFloatTessFactor(tessFactor_Ueq1);
    CleanupFloatTessFactor(tessFactor_Veq1);

    // Save off tessFactors so they can be returned to app
    m_LastUnRoundedComputedTessFactors[0] = tessFactor_Ueq0;
    m_LastUnRoundedComputedTessFactors[1] = tessFactor_Veq0;
    m_LastUnRoundedComputedTessFactors[2] = tessFactor_Ueq1;
    m_LastUnRoundedComputedTessFactors[3] = tessFactor_Veq1;

    // Process outside tessFactors
    float outsideTessFactor[QUAD_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, tessFactor_Ueq1, tessFactor_Veq1};
    int edge, axis;
    TESSELLATOR_PARITY insideTessFactorParity[QUAD_AXES];
    if( Pow2Partitioning() || IntegerPartitioning() )
    {
        for( edge = 0; edge < QUAD_EDGES; edge++ )
        {
            RoundUpTessFactor(outsideTessFactor[edge]);
            ClampTessFactor(outsideTessFactor[edge]); // clamp unbounded user input based on tessellation mode
        }
    }
    else
    {
        SetTessellationParity(m_originalParity); // ClampTessFactor needs it
        for( edge = 0; edge < QUAD_EDGES; edge++ )
        {
            ClampTessFactor(outsideTessFactor[edge]); // clamp unbounded user input based on tessellation mode
        }
    }

    // Compute inside TessFactors
    float insideTessFactor[QUAD_AXES] = {0.0};
    if( m_quadInsideTessFactorReductionAxis == D3D11_TESSELLATOR_QUAD_REDUCTION_1_AXIS )
    {
        // 1-axis reduction: a single inside TessFactor is derived from all four edges and
        // copied to both axes at the end of this branch.
        switch( m_insideTessFactorReduction )
        {
        case D3D11_TESSELLATOR_REDUCTION_MIN:
            insideTessFactor[U] = tess_fmin(tess_fmin(tessFactor_Veq0,tessFactor_Veq1),tess_fmin(tessFactor_Ueq0,tessFactor_Ueq1));
            break;
        case D3D11_TESSELLATOR_REDUCTION_MAX:
            insideTessFactor[U] = tess_fmax(tess_fmax(tessFactor_Veq0,tessFactor_Veq1),tess_fmax(tessFactor_Ueq0,tessFactor_Ueq1));
            break;
        case D3D11_TESSELLATOR_REDUCTION_AVERAGE:
            insideTessFactor[U] = (tessFactor_Veq0 + tessFactor_Veq1 + tessFactor_Ueq0 + tessFactor_Ueq1) / 4;
            break;
        }
        // Scale inside tessFactor based on user scale factor.

        ClampFloatTessFactorScale(insideTessFactorScaleU); // clamp scale value to [0..1], NaN->0
        insideTessFactor[U] = insideTessFactor[U]*insideTessFactorScaleU;

        // Compute inside parity
        if( Pow2Partitioning() || IntegerPartitioning() )
        {
            ClampTessFactor(insideTessFactor[U]); // clamp reduction + scale result that is based on unbounded user input
            m_LastUnRoundedComputedTessFactors[4] = m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app
            RoundUpTessFactor(insideTessFactor[U]);
            insideTessFactorParity[U] =
            insideTessFactorParity[V] =
                (isEven(insideTessFactor[U]) || (FLOAT_ONE == insideTessFactor[U]) )
                ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
        }
        else
        {
            ClampTessFactor(insideTessFactor[U]); // clamp reduction + scale result that is based on unbounded user input
            m_LastUnRoundedComputedTessFactors[4] = m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app
            // no parity changes for fractional tessellation - just use what the user requested
            insideTessFactorParity[U] = insideTessFactorParity[V] = m_originalParity;
        }

        // To prevent snapping on edges, the "picture frame" comes
        // in using avg or max (and ignore inside TessFactor scaling) until it is at least 3.
        if( (TESSELLATOR_PARITY_ODD == insideTessFactorParity[U]) &&
            (insideTessFactor[U] < FLOAT_THREE) )
        {
            if(D3D11_TESSELLATOR_REDUCTION_MAX == m_insideTessFactorReduction)
            {
                insideTessFactor[U] = tess_fmin(FLOAT_THREE,tess_fmax(tess_fmax(tessFactor_Veq0,tessFactor_Veq1),tess_fmax(tessFactor_Ueq0,tessFactor_Ueq1)));
            }
            else
            {
                insideTessFactor[U] = tess_fmin(FLOAT_THREE,(tessFactor_Veq0 + tessFactor_Veq1 + tessFactor_Ueq0 + tessFactor_Ueq1) / 4);
            }
            ClampTessFactor(insideTessFactor[U]); // clamp reduction result that is based on unbounded user input
            m_LastUnRoundedComputedTessFactors[4] = m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app
            if( IntegerPartitioning())
            {
                RoundUpTessFactor(insideTessFactor[U]);
                insideTessFactorParity[U] =
                insideTessFactorParity[V] = isEven(insideTessFactor[U]) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
            }
        }
        insideTessFactor[V] = insideTessFactor[U];
    }
    else
    {
        // Per-axis reduction: inside U is derived from the V==0/V==1 edges and inside V from
        // the U==0/U==1 edges.
        switch( m_insideTessFactorReduction )
        {
        case D3D11_TESSELLATOR_REDUCTION_MIN:
            insideTessFactor[U] = tess_fmin(tessFactor_Veq0,tessFactor_Veq1);
            insideTessFactor[V] = tess_fmin(tessFactor_Ueq0,tessFactor_Ueq1);
            break;
        case D3D11_TESSELLATOR_REDUCTION_MAX:
            insideTessFactor[U] = tess_fmax(tessFactor_Veq0,tessFactor_Veq1);
            insideTessFactor[V] = tess_fmax(tessFactor_Ueq0,tessFactor_Ueq1);
            break;
        case D3D11_TESSELLATOR_REDUCTION_AVERAGE:
            insideTessFactor[U] = (tessFactor_Veq0 + tessFactor_Veq1) / 2;
            insideTessFactor[V] = (tessFactor_Ueq0 + tessFactor_Ueq1) / 2;
            break;
        }
        // Scale inside tessFactors based on user scale factor.

        ClampFloatTessFactorScale(insideTessFactorScaleU); // clamp scale value to [0..1], NaN->0
        ClampFloatTessFactorScale(insideTessFactorScaleV);
        insideTessFactor[U] = insideTessFactor[U]*insideTessFactorScaleU;
        insideTessFactor[V] = insideTessFactor[V]*insideTessFactorScaleV;

        // Compute inside parity
        if( Pow2Partitioning() || IntegerPartitioning() )
        {
            for( axis = 0; axis < QUAD_AXES; axis++ )
            {
                ClampTessFactor(insideTessFactor[axis]); // clamp reduction + scale result that is based on unbounded user input
                m_LastUnRoundedComputedTessFactors[4+axis] = insideTessFactor[axis]; // Save off TessFactors so they can be returned to app
                RoundUpTessFactor(insideTessFactor[axis]);
                insideTessFactorParity[axis] =
                    (isEven(insideTessFactor[axis]) || (FLOAT_ONE == insideTessFactor[axis]) )
                    ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
            }
        }
        else
        {
            ClampTessFactor(insideTessFactor[U]); // clamp reduction + scale result that is based on unbounded user input
            ClampTessFactor(insideTessFactor[V]); // clamp reduction + scale result that is based on unbounded user input
            m_LastUnRoundedComputedTessFactors[4] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app
            m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[V]; // Save off TessFactors so they can be returned to app
             // no parity changes for fractional tessellation - just use what the user requested
            insideTessFactorParity[U] = insideTessFactorParity[V] = m_originalParity;
        }

        // To prevent snapping on edges, the "picture frame" comes
        // in using avg or max (and ignore inside TessFactor scaling) until it is at least 3.
        if( (TESSELLATOR_PARITY_ODD == insideTessFactorParity[U]) &&
            (insideTessFactor[U] < FLOAT_THREE) )
        {
            if(D3D11_TESSELLATOR_REDUCTION_MAX == m_insideTessFactorReduction)
            {
                insideTessFactor[U] = tess_fmin(FLOAT_THREE,tess_fmax(tessFactor_Veq0,tessFactor_Veq1));
            }
            else
            {
                insideTessFactor[U] = tess_fmin(FLOAT_THREE,(tessFactor_Veq0 + tessFactor_Veq1) / 2);
            }
            ClampTessFactor(insideTessFactor[U]); // clamp reduction result that is based on unbounded user input
            m_LastUnRoundedComputedTessFactors[4] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app
            if( IntegerPartitioning())
            {
                RoundUpTessFactor(insideTessFactor[U]);
                insideTessFactorParity[U] = isEven(insideTessFactor[U]) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
            }
        }

        if( (TESSELLATOR_PARITY_ODD == insideTessFactorParity[V]) &&
            (insideTessFactor[V] < FLOAT_THREE) )
        {
            if(D3D11_TESSELLATOR_REDUCTION_MAX == m_insideTessFactorReduction)
            {
                insideTessFactor[V] = tess_fmin(FLOAT_THREE,tess_fmax(tessFactor_Ueq0,tessFactor_Ueq1));
            }
            else
            {
                insideTessFactor[V] = tess_fmin(FLOAT_THREE,(tessFactor_Ueq0 + tessFactor_Ueq1) / 2);
            }
            ClampTessFactor(insideTessFactor[V]);// clamp reduction result that is based on unbounded user input
            m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[V]; // Save off TessFactors so they can be returned to app
            if( IntegerPartitioning())
            {
                RoundUpTessFactor(insideTessFactor[V]);
                insideTessFactorParity[V] = isEven(insideTessFactor[V]) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
            }
        }

        for( axis = 0; axis < QUAD_AXES; axis++ )
        {
            if( TESSELLATOR_PARITY_ODD == insideTessFactorParity[axis] )
            {
                // Ensure the first ring ("picture frame") interpolates in on all sides
                // as much as the side with the minimum TessFactor.  Prevents snapping to edge.
                // (axis+1)&0x1 selects the other axis.
                if( (insideTessFactor[axis] < FLOAT_THREE) && (insideTessFactor[axis] < insideTessFactor[(axis+1)&0x1]))
                {
                    insideTessFactor[axis] = tess_fmin(insideTessFactor[(axis+1)&0x1],FLOAT_THREE);
                    m_LastUnRoundedComputedTessFactors[4+axis] = insideTessFactor[axis]; // Save off TessFactors so they can be returned to app
                }
            }
        }
    }

    // Save off TessFactors so they can be returned to app
    m_LastComputedTessFactors[0] = outsideTessFactor[Ueq0];
    m_LastComputedTessFactors[1] = outsideTessFactor[Veq0];
    m_LastComputedTessFactors[2] = outsideTessFactor[Ueq1];
    m_LastComputedTessFactors[3] = outsideTessFactor[Veq1];
    m_LastComputedTessFactors[4] = insideTessFactor[U];
    m_LastComputedTessFactors[5] = insideTessFactor[V];
}

//---------------------------------------------------------------------------------------------------------------------------------
// CHLSLTessellator::TessellateTriDomain
// User calls this
// Derives the final TessFactors with TriHLSLProcessTessFactors() and passes the first four
// cached values to CHWTessellator::TessellateTriDomain().
//---------------------------------------------------------------------------------------------------------------------------------
void CHLSLTessellator::TessellateTriDomain( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Weq0,
                                        float insideTessFactorScale )
{
    TriHLSLProcessTessFactors(tessFactor_Ueq0,tessFactor_Veq0,tessFactor_Weq0,insideTessFactorScale);

    CHWTessellator::TessellateTriDomain(m_LastComputedTessFactors[0],m_LastComputedTessFactors[1],m_LastComputedTessFactors[2],m_LastComputedTessFactors[3]);
}

//---------------------------------------------------------------------------------------------------------------------------------
// CHLSLTessellator::TriHLSLProcessTessFactors
//---------------------------------------------------------------------------------------------------------------------------------
void CHLSLTessellator::TriHLSLProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Weq0,
                                  float insideTessFactorScale )
{
    if( !(tessFactor_Ueq0 > 0) || // NaN will pass
!(tessFactor_Veq0 > 0) || - !(tessFactor_Weq0 > 0) ) - { - m_LastUnRoundedComputedTessFactors[0] = tessFactor_Ueq0; - m_LastUnRoundedComputedTessFactors[1] = tessFactor_Veq0; - m_LastUnRoundedComputedTessFactors[2] = tessFactor_Weq0; - m_LastUnRoundedComputedTessFactors[3] = - m_LastComputedTessFactors[0] = - m_LastComputedTessFactors[1] = - m_LastComputedTessFactors[2] = - m_LastComputedTessFactors[3] = 0; - return; - } - - CleanupFloatTessFactor(tessFactor_Ueq0); // clamp to [1.0f..INF], NaN->1.0f - CleanupFloatTessFactor(tessFactor_Veq0); - CleanupFloatTessFactor(tessFactor_Weq0); - - // Save off TessFactors so they can be returned to app - m_LastUnRoundedComputedTessFactors[0] = tessFactor_Ueq0; - m_LastUnRoundedComputedTessFactors[1] = tessFactor_Veq0; - m_LastUnRoundedComputedTessFactors[2] = tessFactor_Weq0; - - // Process outside TessFactors - float outsideTessFactor[TRI_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, tessFactor_Weq0}; - int edge; - if( Pow2Partitioning() || IntegerPartitioning() ) - { - for( edge = 0; edge < TRI_EDGES; edge++ ) - { - RoundUpTessFactor(outsideTessFactor[edge]); // for pow2 this rounds to pow2 - ClampTessFactor(outsideTessFactor[edge]); // clamp unbounded user input based on tessellation mode - } - } - else - { - for( edge = 0; edge < TRI_EDGES; edge++ ) - { - ClampTessFactor(outsideTessFactor[edge]); // clamp unbounded user input based on tessellation mode - } - } - - // Compute inside TessFactor - float insideTessFactor = 0.0; - switch( m_insideTessFactorReduction ) - { - case D3D11_TESSELLATOR_REDUCTION_MIN: - insideTessFactor = tess_fmin(tess_fmin(tessFactor_Ueq0,tessFactor_Veq0),tessFactor_Weq0); - break; - case D3D11_TESSELLATOR_REDUCTION_MAX: - insideTessFactor = tess_fmax(tess_fmax(tessFactor_Ueq0,tessFactor_Veq0),tessFactor_Weq0); - break; - case D3D11_TESSELLATOR_REDUCTION_AVERAGE: - insideTessFactor = (tessFactor_Ueq0 + tessFactor_Veq0 + tessFactor_Weq0) / 3; - break; - } - - // Scale inside TessFactor based on user 
scale factor. - ClampFloatTessFactorScale(insideTessFactorScale); // clamp scale value to [0..1], NaN->0 - insideTessFactor = insideTessFactor*tess_fmin(FLOAT_ONE,insideTessFactorScale); - - ClampTessFactor(insideTessFactor); // clamp reduction + scale result that is based on unbounded user input - m_LastUnRoundedComputedTessFactors[3] = insideTessFactor;// Save off TessFactors so they can be returned to app - TESSELLATOR_PARITY parity; - if( Pow2Partitioning() || IntegerPartitioning() ) - { - RoundUpTessFactor(insideTessFactor); - parity = (isEven(insideTessFactor) || (FLOAT_ONE == insideTessFactor)) - ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; - } - else - { - parity = m_originalParity; - } - - if( (TESSELLATOR_PARITY_ODD == parity) && - (insideTessFactor < FLOAT_THREE)) - { - // To prevent snapping on edges, the "picture frame" comes - // in using avg or max (and ignore inside TessFactor scaling) until it is at least 3. - if(D3D11_TESSELLATOR_REDUCTION_MAX == m_insideTessFactorReduction) - { - insideTessFactor = tess_fmin(FLOAT_THREE,tess_fmax(tessFactor_Ueq0,tess_fmax(tessFactor_Veq0,tessFactor_Weq0))); - } - else - { - insideTessFactor = tess_fmin(FLOAT_THREE,(tessFactor_Ueq0 + tessFactor_Veq0 + tessFactor_Weq0) / 3); - } - ClampTessFactor(insideTessFactor); // clamp reduction result that is based on unbounded user input - m_LastUnRoundedComputedTessFactors[3] = insideTessFactor;// Save off TessFactors so they can be returned to app - if( IntegerPartitioning()) - { - RoundUpTessFactor(insideTessFactor); - } - } - - // Save off TessFactors so they can be returned to app - m_LastComputedTessFactors[0] = outsideTessFactor[Ueq0]; - m_LastComputedTessFactors[1] = outsideTessFactor[Veq0]; - m_LastComputedTessFactors[2] = outsideTessFactor[Weq0]; - m_LastComputedTessFactors[3] = insideTessFactor; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// 
CHLSLTessellator::TessellateIsoLineDomain -// User calls this. -//--------------------------------------------------------------------------------------------------------------------------------- -void CHLSLTessellator::TessellateIsoLineDomain( float TessFactor_U_LineDetail, float TessFactor_V_LineDensity ) -{ - IsoLineHLSLProcessTessFactors(TessFactor_V_LineDensity,TessFactor_U_LineDetail); - CHWTessellator::TessellateIsoLineDomain(m_LastComputedTessFactors[0],m_LastComputedTessFactors[1]); -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHLSLTessellator::IsoLineHLSLProcessTessFactors -//--------------------------------------------------------------------------------------------------------------------------------- -void CHLSLTessellator::IsoLineHLSLProcessTessFactors( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail ) -{ - if( !(TessFactor_V_LineDensity > 0) || // NaN will pass - !(TessFactor_U_LineDetail > 0) ) - { - m_LastUnRoundedComputedTessFactors[0] = TessFactor_V_LineDensity; - m_LastUnRoundedComputedTessFactors[1] = TessFactor_U_LineDetail; - m_LastComputedTessFactors[0] = - m_LastComputedTessFactors[1] = 0; - return; - } - - CleanupFloatTessFactor(TessFactor_V_LineDensity); // clamp to [1.0f..INF], NaN->1.0f - CleanupFloatTessFactor(TessFactor_U_LineDetail); // clamp to [1.0f..INF], NaN->1.0f - - ClampTessFactor(TessFactor_U_LineDetail); // clamp unbounded user input based on tessellation mode - - m_LastUnRoundedComputedTessFactors[1] = TessFactor_U_LineDetail; // Save off TessFactors so they can be returned to app - - if(Pow2Partitioning()||IntegerPartitioning()) - { - RoundUpTessFactor(TessFactor_U_LineDetail); - } - - OverridePartitioning(D3D11_TESSELLATOR_PARTITIONING_INTEGER); - - ClampTessFactor(TessFactor_V_LineDensity); // Clamp unbounded user input to integer - m_LastUnRoundedComputedTessFactors[0] = TessFactor_V_LineDensity; // Save off 
TessFactors so they can be returned to app - - RoundUpTessFactor(TessFactor_V_LineDensity); - - RestorePartitioning(); - - // Save off TessFactors so they can be returned to app - m_LastComputedTessFactors[0] = TessFactor_V_LineDensity; - m_LastComputedTessFactors[1] = TessFactor_U_LineDetail; -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHLSLTessellator::ClampTessFactor() -//--------------------------------------------------------------------------------------------------------------------------------- -void CHLSLTessellator::ClampTessFactor(float& TessFactor) -{ - if( Pow2Partitioning() ) - { - TessFactor = tess_fmin( D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR, tess_fmax( TessFactor, D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR) ); - } - else if( IntegerPartitioning() ) - { - TessFactor = tess_fmin( D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR, tess_fmax( TessFactor, D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR) ); - } - else if( Odd() ) - { - TessFactor = tess_fmin( D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR, tess_fmax( TessFactor, D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR) ); - } - else // even - { - TessFactor = tess_fmin( D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR, tess_fmax( TessFactor, D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR) ); - } -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHLSLTessellator::CleanupFloatTessFactor() -//--------------------------------------------------------------------------------------------------------------------------------- -static const int exponentMask = 0x7f800000; -static const int mantissaMask = 0x007fffff; -void CHLSLTessellator::CleanupFloatTessFactor(float& input) -{ - // If input is < 1.0f or NaN, clamp to 1.0f. 
- // In other words, clamp input to [1.0f...+INF] - int bits = *(int*)&input; - if( ( ( ( bits & exponentMask ) == exponentMask ) && ( bits & mantissaMask ) ) ||// nan? - (input < 1.0f) ) - { - input = 1; - } -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHLSLTessellator::ClampFloatTessFactorScale() -//--------------------------------------------------------------------------------------------------------------------------------- -void CHLSLTessellator::ClampFloatTessFactorScale(float& input) -{ - // If input is < 0.0f or NaN, clamp to 0.0f. > 1 clamps to 1. - // In other words, clamp input to [0.0f...1.0f] - int bits = *(int*)&input; - if( ( ( ( bits & exponentMask ) == exponentMask ) && ( bits & mantissaMask ) ) ||// nan? - (input < 0.0f) ) - { - input = 0; - } - else if( input > 1 ) - { - input = 1; - } -} - -//--------------------------------------------------------------------------------------------------------------------------------- -// CHLSLTessellator::RoundUpTessFactor() -//--------------------------------------------------------------------------------------------------------------------------------- -static const int exponentLSB = 0x00800000; -void CHLSLTessellator::RoundUpTessFactor(float& TessFactor) -{ - // Assume TessFactor is in [1.0f..+INF] - if( Pow2Partitioning() ) - { - int bits = *(int*)&TessFactor; - if( bits & mantissaMask ) - { - *(int*)&TessFactor = (bits & exponentMask) + exponentLSB; - } - } - else if( IntegerPartitioning() ) - { - TessFactor = ceil(TessFactor); - } -} diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.h b/src/gallium/drivers/swr/rasterizer/core/tessellator.h deleted file mode 100644 index 30b6b4fca1e..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/tessellator.h +++ /dev/null @@ -1,202 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2019 
without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file tessellator.h - * - * @brief Tessellator fixed function unit interface definition - * - ******************************************************************************/ -#pragma once - -#include "tessellator.hpp" - -struct SWR_TS_TESSELLATED_DATA -{ - uint32_t NumPrimitives; - uint32_t NumDomainPoints; - - uint32_t* ppIndices[3]; - float* pDomainPointsU; - float* pDomainPointsV; - // For Tri: pDomainPointsW[i] = 1.0f - pDomainPointsU[i] - pDomainPointsV[i] -}; - -namespace Tessellator -{ - /// Wrapper class for the CHWTessellator reference tessellator from MSFT - /// This class will store data not originally stored in CHWTessellator - class SWR_TS : private CHWTessellator - { - private: - typedef CHWTessellator SUPER; - SWR_TS_DOMAIN Domain; - OSALIGNSIMD(float) DomainPointsU[MAX_POINT_COUNT]; - OSALIGNSIMD(float) DomainPointsV[MAX_POINT_COUNT]; - uint32_t NumDomainPoints; - OSALIGNSIMD(uint32_t) Indices[3][MAX_INDEX_COUNT / 3]; - uint32_t NumIndices; - - public: - void Init(SWR_TS_DOMAIN tsDomain, - SWR_TS_PARTITIONING 
tsPartitioning, - SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology) - { - static D3D11_TESSELLATOR_PARTITIONING CVT_TS_D3D_PARTITIONING[] = { - D3D11_TESSELLATOR_PARTITIONING_INTEGER, // SWR_TS_INTEGER - D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD, // SWR_TS_ODD_FRACTIONAL - D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN, // SWR_TS_EVEN_FRACTIONAL - D3D11_TESSELLATOR_PARTITIONING_POW2 // SWR_TS_POW2 - }; - - static D3D11_TESSELLATOR_OUTPUT_PRIMITIVE CVT_TS_D3D_OUTPUT_TOPOLOGY[] = { - D3D11_TESSELLATOR_OUTPUT_POINT, // SWR_TS_OUTPUT_POINT - D3D11_TESSELLATOR_OUTPUT_LINE, // SWR_TS_OUTPUT_LINE - D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CCW, // SWR_TS_OUTPUT_TRI_CW - inverted logic, because DX - D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW // SWR_TS_OUTPUT_TRI_CCW - inverted logic, because DX - }; - - SUPER::Init(CVT_TS_D3D_PARTITIONING[tsPartitioning], - CVT_TS_D3D_OUTPUT_TOPOLOGY[tsOutputTopology]); - - Domain = tsDomain; - NumDomainPoints = 0; - NumIndices = 0; - } - - void Tessellate(const SWR_TESSELLATION_FACTORS& tsTessFactors, - SWR_TS_TESSELLATED_DATA& tsTessellatedData) - { - uint32_t IndexDiv = 0; - switch (Domain) - { - case SWR_TS_QUAD: - IndexDiv = 3; - SUPER::TessellateQuadDomain( - tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL], - tsTessFactors.OuterTessFactors[SWR_QUAD_V_EQ0_TRI_W], - tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY], - tsTessFactors.OuterTessFactors[SWR_QUAD_V_EQ1], - tsTessFactors.InnerTessFactors[SWR_QUAD_U_TRI_INSIDE], - tsTessFactors.InnerTessFactors[SWR_QUAD_V_INSIDE]); - break; - - case SWR_TS_TRI: - IndexDiv = 3; - SUPER::TessellateTriDomain( - tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL], - tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY], - tsTessFactors.OuterTessFactors[SWR_QUAD_V_EQ0_TRI_W], - tsTessFactors.InnerTessFactors[SWR_QUAD_U_TRI_INSIDE]); - break; - - case SWR_TS_ISOLINE: - IndexDiv = 2; - SUPER::TessellateIsoLineDomain( - 
tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY], - tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL]); - break; - - default: - SWR_INVALID("Invalid Tessellation Domain: %d", Domain); - assert(false); - } - - NumDomainPoints = (uint32_t)SUPER::GetPointCount(); - - DOMAIN_POINT* pPoints = SUPER::GetPoints(); - for (uint32_t i = 0; i < NumDomainPoints; i++) { - DomainPointsU[i] = pPoints[i].u; - DomainPointsV[i] = pPoints[i].v; - } - tsTessellatedData.NumDomainPoints = NumDomainPoints; - tsTessellatedData.pDomainPointsU = &DomainPointsU[0]; - tsTessellatedData.pDomainPointsV = &DomainPointsV[0]; - - NumIndices = (uint32_t)SUPER::GetIndexCount(); - - assert(NumIndices % IndexDiv == 0); - tsTessellatedData.NumPrimitives = NumIndices / IndexDiv; - - uint32_t* pIndices = (uint32_t*)SUPER::GetIndices(); - for (uint32_t i = 0; i < NumIndices; i++) { - Indices[i % IndexDiv][i / IndexDiv] = pIndices[i]; - } - - tsTessellatedData.ppIndices[0] = &Indices[0][0]; - tsTessellatedData.ppIndices[1] = &Indices[1][0]; - tsTessellatedData.ppIndices[2] = &Indices[2][0]; - } - }; -} // namespace Tessellator - -/// Allocate and initialize a new tessellation context -INLINE HANDLE SWR_API - TSInitCtx(SWR_TS_DOMAIN tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle) - SWR_TS_PARTITIONING tsPartitioning, ///< [IN] Tessellation partitioning algorithm - SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology - void* pContextMem, ///< [IN] Memory to use for the context - size_t& memSize) ///< [INOUT] In: Amount of memory in pContextMem. 
Out: Mem required -{ - using Tessellator::SWR_TS; - SWR_ASSERT(tsDomain < SWR_TS_DOMAIN_COUNT); - SWR_ASSERT(tsPartitioning < SWR_TS_PARTITIONING_COUNT); - SWR_ASSERT(tsOutputTopology < SWR_TS_OUTPUT_TOPOLOGY_COUNT); - - size_t origMemSize = memSize; - memSize = AlignUp(sizeof(SWR_TS), 64); - - if (nullptr == pContextMem || memSize > origMemSize) - { - return nullptr; - } - - HANDLE tsCtx = pContextMem; - - SWR_TS* pTessellator = new (tsCtx) SWR_TS(); - SWR_ASSERT(pTessellator == tsCtx); - - pTessellator->Init(tsDomain, tsPartitioning, tsOutputTopology); - - return tsCtx; -} - -/// Destroy & de-allocate tessellation context -INLINE void SWR_API TSDestroyCtx(HANDLE tsCtx) ///< [IN] Tessellation context to be destroyed -{ - using Tessellator::SWR_TS; - SWR_TS* pTessellator = (SWR_TS*)tsCtx; - - if (pTessellator) - { - pTessellator->~SWR_TS(); - } -} - -/// Perform Tessellation -INLINE void SWR_API - TSTessellate(HANDLE tsCtx, ///< [IN] Tessellation Context - const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors - SWR_TS_TESSELLATED_DATA& tsTessellatedData) ///< [OUT] Tessellated Data -{ - using Tessellator::SWR_TS; - SWR_TS* pTessellator = (SWR_TS*)tsCtx; - SWR_ASSERT(pTessellator); - - pTessellator->Tessellate(tsTessFactors, tsTessellatedData); -} - diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.hpp b/src/gallium/drivers/swr/rasterizer/core/tessellator.hpp deleted file mode 100644 index 459c1093d2e..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/tessellator.hpp +++ /dev/null @@ -1,471 +0,0 @@ -/* - Copyright (c) Microsoft Corporation - - Permission is hereby granted, free of charge, to any person obtaining a copy of this software and - associated documentation files (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, - and/or sell copies of the Software, and to permit persons to whom the Software is 
furnished to do so, - subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all copies or substantial - portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT - NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#pragma once -//================================================================================================================================= -// Microsoft D3D11 Fixed Function Tessellator Reference - May 7, 2012 -// amar.patel@microsoft.com -// -// CHWTessellator demonstrates what is expected of hardware in the D3D11 fixed function Tessellator stage. Hardware -// implementers need only look at this class. -// -// CHLSLTessellator is a wrapper for CHWTessellator, representing the effect of shader code that will -// be autogenerated by HLSL in the Hull Shader, both for plumbing data around, and to precondition TessFactor values before they -// are passed to the hardware (such as deriving inside TessFactors from edge TessFactors). The algorithms used -// in CHLSLTessellator are subject to change, but since they represent shader code auto-generated by the HLSL compiler, -// CHLSLTessellator has no effect on hardware design at all. Note the HLSL compiler will expose all the raw hardware -// control illustrated by CHWTessellator for those who don't need the helper functionality illustrated by CHLSLTessellator. -// -// Usage: (1) Create either a CHLSLTessellator or CHWTessellator object, depending on which you want to verify. 
-// (2) Call C*Tessellator::Init() -// (3) Call C*Tessellator::Tessellate[IsoLine|Tri|Quad]Domain() -// - Here you pass in TessFactors (how much to tessellate) -// (4) Call C*Tessellator::GetPointCount(), C*Tessellator::GetIndexCount() to see how much data was generated. -// (5) Call C*Tessellator::GetPoints() and C*Tessellator::GetIndices() to get pointers to the data. -// The pointers are fixed for the lifetime of the object (storage for max tessellation), -// so if you ::Tessellate again, the data in the buffers is overwritten. -// (6) There are various other Get() methods to retrieve TessFactors that have been processed from -// what you passed in at step 3. You can retrieve separate TessFactors that the tessellator -// produced after clamping but before rounding, and also after rounding (say in pow2 mode). -// These numbers can be useful information if you are geomorphing displacement maps. -// (7) Goto Step 2 or 3 if you want to animate TessFactors or tessellate a different patch -// -// Code implementation details: -// -// There is lots of headroom to make this code run faster on CPUs. It was written merely as a reference for -// what results hardware should produce, with CPU performance not a consideration. It is nice that this implementation -// only generates the exact number of vertices needed (no duplicates) in the output vertex buffer. Also, the number -// of calculations done for each U/V domain coordinate is minimized by doing some precalculation of some patch or edge -// invariant numbers (see TESS_FACTOR_CONTEXT). All the vertex coordinate calculations could be computed with as much -// parallelism as you like. Similarly the calculation of connectivity itself is highly parallelizable, and can also -// be done independent of the vertex calculations. 
-// -//================================================================================================================================= - -#define D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR 1 -#define D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR 63 -#define D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR 2 -#define D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR 64 - -#define D3D11_TESSELLATOR_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR 1 -#define D3D11_TESSELLATOR_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR 64 - -#define D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR 64 // max of even and odd tessFactors - -#define MAX_POINT_COUNT ((D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR+1)*(D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR+1)) -#define MAX_INDEX_COUNT (D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR*D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR*2*3) - -//================================================================================================================================= -// Data types for the caller -//================================================================================================================================= -enum D3D11_TESSELLATOR_PARTITIONING -{ - D3D11_TESSELLATOR_PARTITIONING_INTEGER, - D3D11_TESSELLATOR_PARTITIONING_POW2, - D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD, - D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN -}; - -enum D3D11_TESSELLATOR_REDUCTION -{ - D3D11_TESSELLATOR_REDUCTION_MIN, - D3D11_TESSELLATOR_REDUCTION_MAX, - D3D11_TESSELLATOR_REDUCTION_AVERAGE -}; - -enum D3D11_TESSELLATOR_QUAD_REDUCTION_AXIS -{ - D3D11_TESSELLATOR_QUAD_REDUCTION_1_AXIS, - D3D11_TESSELLATOR_QUAD_REDUCTION_2_AXIS -}; - -enum D3D11_TESSELLATOR_OUTPUT_PRIMITIVE -{ - D3D11_TESSELLATOR_OUTPUT_POINT, - D3D11_TESSELLATOR_OUTPUT_LINE, - D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW, - D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CCW, -}; - -typedef struct DOMAIN_POINT -{ - float u; - float v; // for tri, w = 1 - u - v; -} DOMAIN_POINT; - 
-//================================================================================================================================= -// CHWTessellator: D3D11 Tessellation Fixed Function Hardware Reference -//================================================================================================================================= -typedef unsigned int FXP; // fixed point number - -class CHWTessellator -{ - -//--------------------------------------------------------------------------------------------------------------------------------- -public: - void Init( D3D11_TESSELLATOR_PARTITIONING partitioning, - D3D11_TESSELLATOR_OUTPUT_PRIMITIVE outputPrimitive); - - void TessellateIsoLineDomain( float TessFactor_V_LineDensity, - float TessFactor_U_LineDetail ); - - void TessellateTriDomain( float TessFactor_Ueq0, - float TessFactor_Veq0, - float TessFactor_Weq0, - float TessFactor_Inside ); - - void TessellateQuadDomain( float TessFactor_Ueq0, - float TessFactor_Veq0, - float TessFactor_Ueq1, - float TessFactor_Veq1, - float TessFactor_InsideU, - float TessFactor_InsideV ); - - int GetPointCount(); - int GetIndexCount(); - - DOMAIN_POINT* GetPoints(); // Get CHWTessellator owned pointer to vertices (UV values). - // Pointer is fixed for lifetime of CHWTessellator object. - int* GetIndices(); // Get CHWTessellator owned pointer to vertex indices. - // Pointer is fixed for lifetime of CHWTessellator object. - -#define ALLOW_XBOX_360_COMPARISON // Different vertex splitting order. This is NOT D3D11 behavior, just available here for comparison. - // Setting this define true just allows the XBox split style to be enabled via - // SetXBox360Mode() below, but by default this XBox360 mode still always starts off DISABLED. - // The XBox360 always splits from the center of an edge (D3D11 uses ruler function). Splitting - // from the center causes sliver triangles in transition areas, which cause numerous problems. 
- // Note the XBox360 only supports adaptive tessellation via fractional_even partitioning, - // though this #define lets you try the XBox vertex splitting order with any of the - // partitioning modes: even, odd, integer or pow2. -#ifdef ALLOW_XBOX_360_COMPARISON - void SetXBox360Mode(bool bXboxMode) {m_bXBox360Mode = bXboxMode;} -#endif - CHWTessellator(); - ~CHWTessellator(); -//--------------------------------------------------------------------------------------------------------------------------------- - //============================================================================================================================= - // Some defines so that numbers are usually self commenting - //============================================================================================================================= - static const int U = 0; // points on a tri patch - static const int V = 1; - static const int W = 2; - static const int Ueq0 = 0; // edges on a tri patch - static const int Veq0 = 1; - static const int Weq0 = 2; - - static const int Ueq1 = 2; // edges on a quad patch: Ueq0, Veq0, Ueq1, Veq1 - static const int Veq1 = 3; - - static const int QUAD_AXES = 2; - static const int QUAD_EDGES = 4; - static const int TRI_EDGES = 3; - //============================================================================================================================= - - enum TESSELLATOR_PARITY // derived from D3D11_TESSELLATOR_PARTITIONING - { // (note: for integer tessellation, both parities are used) - TESSELLATOR_PARITY_EVEN, - TESSELLATOR_PARITY_ODD - }; -private: - TESSELLATOR_PARITY m_originalParity; // user chosen parity - TESSELLATOR_PARITY m_parity; // current parity: if allowing mix of even/odd during discrete - // tessellation, this can vary from the user defined parity - D3D11_TESSELLATOR_PARTITIONING m_originalPartitioning; // user chosen partitioning - D3D11_TESSELLATOR_PARTITIONING m_partitioning; // current partitioning. 
IsoLines overrides for line density - D3D11_TESSELLATOR_OUTPUT_PRIMITIVE m_outputPrimitive; - DOMAIN_POINT* m_Point; // array where we will store u/v's for the points we generate - int* m_Index; // array where we will store index topology - int m_NumPoints; - int m_NumIndices; -#ifdef ALLOW_XBOX_360_COMPARISON - bool m_bXBox360Mode; -#endif - // PlacePointIn1D below is the workhorse for all position placement. - // It is code that could run as preamble in a Domain Shader, so the tessellator itself - // doesn't necessarily need to have floating point. - // Some per-TessFactor fixed context is needed, and that can be computed wherever - // the TessFactor reduction is done, perhaps as Hull Shader postamble - this is shared - // for all point evaluation. - typedef struct TESS_FACTOR_CONTEXT - { - FXP fxpInvNumSegmentsOnFloorTessFactor; - FXP fxpInvNumSegmentsOnCeilTessFactor; - FXP fxpHalfTessFactorFraction; - int numHalfTessFactorPoints; - int splitPointOnFloorHalfTessFactor; - } TESS_FACTOR_CONTEXT; - void ComputeTessFactorContext( FXP fxpTessFactor, TESS_FACTOR_CONTEXT& TessFactorCtx ); - void PlacePointIn1D( const TESS_FACTOR_CONTEXT& TessFactorCtx, int point, FXP& fxpLocation ); - - int NumPointsForTessFactor(FXP fxpTessFactor); - - // Tessellation parity control - bool Odd() {return (m_parity == TESSELLATOR_PARITY_ODD) ? true : false;} - void SetTessellationParity(TESSELLATOR_PARITY parity) {m_parity = parity;} - - // HWIntegerPartitioning() - hardware doesn't care about what pow2 partitioning is - the query below is true for - // both integer and pow2. - bool HWIntegerPartitioning() {return ((m_partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER)|| - (m_partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2)) ? 
true : false;} - - // Tesselation Partitioning control - void RestorePartitioning() {m_partitioning = m_originalPartitioning;}; - void OverridePartitioning(D3D11_TESSELLATOR_PARTITIONING partitioning) {m_partitioning = partitioning;} //isoline uses this for density - - // Call these to generate new points and indices. Max TessFactor storage is already allocated. - int DefinePoint(FXP u, FXP v, int pointStorageOffset); - void DefineIndex(int index, int indexStorageOffset); - void DefineClockwiseTriangle(int index0, int index1, int index2, int indexStorageBaseOffset); - - // Couple of trivial ways to generate index data just given points and no other connectivity. - void DumpAllPoints(); // Make point indices for point rendering mode - - // redundant, but just here for orthogonality. - void DumpAllPointsAsInOrderLineList(); // A debug visualization of all the points connected - // in the order they were generated. - // Asking to draw line topology on a tri or quad patch will do this - - - // The structures below define the data that is derived given input TessFactors and which - // is used by point generation and connectivity generation steps (each of which are independent) - typedef struct PROCESSED_TESS_FACTORS_ISOLINE - { - TESSELLATOR_PARITY lineDensityParity; - TESSELLATOR_PARITY lineDetailParity; - TESS_FACTOR_CONTEXT lineDensityTessFactorCtx; - TESS_FACTOR_CONTEXT lineDetailTessFactorCtx; - bool bPatchCulled; - int numPointsPerLine; - int numLines; - } PROCESSED_TESS_FACTORS_ISOLINE; - typedef struct PROCESSED_TESS_FACTORS_TRI - { - FXP outsideTessFactor[TRI_EDGES]; - FXP insideTessFactor; - TESSELLATOR_PARITY outsideTessFactorParity[TRI_EDGES]; - TESSELLATOR_PARITY insideTessFactorParity; - TESS_FACTOR_CONTEXT outsideTessFactorCtx[TRI_EDGES]; - TESS_FACTOR_CONTEXT insideTessFactorCtx; - bool bJustDoMinimumTessFactor; - bool bPatchCulled; - // Stuff below is just specific to the traversal order - // this code happens to use to generate points/lines - int 
numPointsForOutsideEdge[TRI_EDGES]; - int numPointsForInsideTessFactor; - int insideEdgePointBaseOffset; - } PROCESSED_TESS_FACTORS_TRI; - typedef struct PROCESSED_TESS_FACTORS_QUAD - { - FXP outsideTessFactor[QUAD_EDGES]; - FXP insideTessFactor[QUAD_AXES]; - TESSELLATOR_PARITY outsideTessFactorParity[QUAD_EDGES]; - TESSELLATOR_PARITY insideTessFactorParity[QUAD_AXES]; - TESS_FACTOR_CONTEXT outsideTessFactorCtx[QUAD_EDGES]; - TESS_FACTOR_CONTEXT insideTessFactorCtx[QUAD_AXES]; - bool bJustDoMinimumTessFactor; - bool bPatchCulled; - // Stuff below is just specific to the traversal order - // this code happens to use to generate points/lines - int numPointsForOutsideEdge[QUAD_EDGES]; - int numPointsForInsideTessFactor[QUAD_AXES]; - int insideEdgePointBaseOffset; - } PROCESSED_TESS_FACTORS_QUAD; - - // These are the workhorse functions for tessellation: - // (1) Process input TessFactors - // (2) Generate points - // (3) Generate connectivity (can be done in parallel to (2)) - void IsoLineProcessTessFactors( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail, PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors ); - void IsoLineGeneratePoints( const PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors ); - void IsoLineGenerateConnectivity( const PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors ); - void TriProcessTessFactors( float tessFactor_Ueq0, float TessFactor_Veq0, float TessFactor_Weq0, float insideTessFactor, PROCESSED_TESS_FACTORS_TRI& processedTessFactors ); - void TriGeneratePoints( const PROCESSED_TESS_FACTORS_TRI& processedTessFactors ); - void TriGenerateConnectivity( const PROCESSED_TESS_FACTORS_TRI& processedTessFactors ); - void QuadProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Ueq1, float tessFactor_Veq1, - float insideTessFactor_U, float insideTessFactor_V, PROCESSED_TESS_FACTORS_QUAD& processedTessFactors ); - void QuadGeneratePoints( const PROCESSED_TESS_FACTORS_QUAD& processedTessFactors ); - 
void QuadGenerateConnectivity( const PROCESSED_TESS_FACTORS_QUAD& processedTessFactors ); - - // Stitching - // --------- - // Given pointers to the beginning of 2 parallel rows of points, and TessFactors for each, stitch them. - // The assumption is the stitch is symmetric. - void StitchTransition(int baseIndexOffset, int insideEdgePointBaseOffset, int insideNumHalfTessFactorPoints, - TESSELLATOR_PARITY insideEdgeTessFactorParity, - int outsideEdgePointBaseOffset, int outsideNumHalfTessFactorPoints, - TESSELLATOR_PARITY outsideEdgeTessFactorParity ); - // The interior can just use a simpler stitch. - enum DIAGONALS - { - DIAGONALS_INSIDE_TO_OUTSIDE, - DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE, - DIAGONALS_MIRRORED - }; - - void StitchRegular(bool bTrapezoid, DIAGONALS diagonals, int baseIndexOffset, int numInsideEdgePoints, - int insideEdgePointBaseOffset, int outsideEdgePointBaseOffset); - -//--------------------------------------------------------------------------------------------------------------------------------- - // Index Patching - // -------------- - // The code below patches index values produces during triangulation, so triangulation doesn't have to know - // where points should go. I happened to never produce duplicate vertices, but the patching would - // be simpler if some duplicate vertices were introduced in practice. During point rendering mode however, - // it is not permitted for duplicate points to show up. - - // Since the points are generated in concentric rings, most of the time, the point locations are - // sequentially increasing in memory for each side of a ring, which the stitch can take advantage of. - // However, there are exceptions where the points are not sequentially increasing, such as - // the 4th row in a given ring, where the last point on the outside of each row is actually the beginning - // point. 
- // So we let the stitching code think it sees sequential vertices, and when it emits a vertex index, - // we patch it to be the real location. - int PatchIndexValue(int index); - typedef struct INDEX_PATCH_CONTEXT - { - int insidePointIndexDeltaToRealValue; - int insidePointIndexBadValue; - int insidePointIndexReplacementValue; - int outsidePointIndexPatchBase; - int outsidePointIndexDeltaToRealValue; - int outsidePointIndexBadValue; - int outsidePointIndexReplacementValue; - } INDEX_PATCH_CONTEXT; - void SetUsingPatchedIndices(bool bUsingPatchedIndices) {m_bUsingPatchedIndices = bUsingPatchedIndices;} - - // A second index patch we have to do handles the leftover strip of quads in the middle of an odd quad patch after - // finishing all the concentric rings. - // This also handles the leftover strip of points in the middle of an even quad - // patch, when stitching the row of triangles up the left side (V major quad) or bottom (U major quad) of the - // inner ring - typedef struct INDEX_PATCH_CONTEXT2 - { - int baseIndexToInvert; - int indexInversionEndPoint; - int cornerCaseBadValue; - int cornerCaseReplacementValue; - } INDEX_PATCH_CONTEXT2; - void SetUsingPatchedIndices2(bool bUsingPatchedIndices) {m_bUsingPatchedIndices2 = bUsingPatchedIndices;} - bool m_bUsingPatchedIndices; - bool m_bUsingPatchedIndices2; - INDEX_PATCH_CONTEXT m_IndexPatchContext; - INDEX_PATCH_CONTEXT2 m_IndexPatchContext2; - -}; - -//================================================================================================================================= -// CHLSLTessellator: D3D11 Tessellation HLSL Tessellator Interface -// Demonstrates TessFactor preconditioning code auto-generated by HLSL. Subject to change, but this -// just represents the effect of shader code the HLSL compiler will generate in the Hull Shader, -// so it does not affect hardware design at all. 
-//================================================================================================================================= -class CHLSLTessellator : public CHWTessellator -{ -public: - void Init( D3D11_TESSELLATOR_PARTITIONING partitioning, - D3D11_TESSELLATOR_REDUCTION insideTessFactorReduction, - D3D11_TESSELLATOR_QUAD_REDUCTION_AXIS quadInsideTessFactorReductionAxis, - D3D11_TESSELLATOR_OUTPUT_PRIMITIVE outputPrimitive); - - void TessellateIsoLineDomain( float TessFactor_V_LineDensity, - float TessFactor_U_LineDetail ); - - void TessellateTriDomain( float tessFactor_Ueq0, - float TessFactor_Veq0, - float TessFactor_Weq0, - float insideTessFactorScale /*[0..1]*/ ); - - void TessellateQuadDomain( float TessFactorUeq0, - float TessFactorVeq0, - float TessFactorUeq1, - float TessFactorVeq1, - float insideTessFactorScaleU /*[0..1]*/, - float insideTessFactorScaleV /*[0..1]*/ ); - - int GetPointCount() {return CHWTessellator::GetPointCount();}; - int GetIndexCount() {return CHWTessellator::GetIndexCount();} - - DOMAIN_POINT* GetPoints() {return CHWTessellator::GetPoints();} // Get CHLSLTessellator owned pointer to vertices (UV values). - // Pointer is fixed for lifetime of CHLSLTessellator object. - int* GetIndices() {return CHWTessellator::GetIndices();} // Get CHLSLTessellator owned pointer to vertex indices. - // Pointer is fixed for lifetime of CHLSLTessellator object. - - // Retrieve TessFactors actually used by the "hardware" - // This includes clamping to valid range, and more interestingly - // if integer or pow2 partitioning is being done, the rounded TessFactors can be retrieved. - // Getting the rounded TessFactors can be useful for geomorphing of displacement maps. 
- float GetIsoLineDensityTessFactor() {return m_LastComputedTessFactors[0];} - float GetIsoLineDetailTessFactor() {return m_LastComputedTessFactors[1];} - float GetTriUeq0TessFactor() {return m_LastComputedTessFactors[0];} - float GetTriVeq0TessFactor() {return m_LastComputedTessFactors[1];} - float GetTriWeq0TessFactor() {return m_LastComputedTessFactors[2];} - float GetTriInsideTessFactor() {return m_LastComputedTessFactors[3];} - float GetQuadUeq0TessFactor() {return m_LastComputedTessFactors[0];} - float GetQuadVeq0TessFactor() {return m_LastComputedTessFactors[1];} - float GetQuadUeq1TessFactor() {return m_LastComputedTessFactors[2];} - float GetQuadVeq1TessFactor() {return m_LastComputedTessFactors[3];} - float GetQuadInsideUTessFactor() {return m_LastComputedTessFactors[4];} - float GetQuadInsideVTessFactor() {return m_LastComputedTessFactors[5];} - float GetUnRoundedIsoLineDensityTessFactor() {return m_LastUnRoundedComputedTessFactors[0];} - float GetUnRoundedIsoLineDetailTessFactor() {return m_LastUnRoundedComputedTessFactors[1];} - float GetUnRoundedTriUeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[0];} - float GetUnRoundedTriVeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[1];} - float GetUnRoundedTriWeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[2];} - float GetUnRoundedTriInsideTessFactor() {return m_LastUnRoundedComputedTessFactors[3];} - float GetUnRoundedQuadUeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[0];} - float GetUnRoundedQuadVeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[1];} - float GetUnRoundedQuadUeq1TessFactor() {return m_LastUnRoundedComputedTessFactors[2];} - float GetUnRoundedQuadVeq1TessFactor() {return m_LastUnRoundedComputedTessFactors[3];} - float GetUnRoundedQuadInsideUTessFactor() {return m_LastUnRoundedComputedTessFactors[4];} - float GetUnRoundedQuadInsideVTessFactor() {return m_LastUnRoundedComputedTessFactors[5];} - - CHLSLTessellator(); 
-//--------------------------------------------------------------------------------------------------------------------------------- -private: - TESSELLATOR_PARITY m_originalParity; // user chosen parity - TESSELLATOR_PARITY m_parity; // current parity: if allowing mix of even/odd during discrete - // tessellation, this can vary from the user defined parity - D3D11_TESSELLATOR_PARTITIONING m_originalPartitioning; // user chosen partitioning - D3D11_TESSELLATOR_PARTITIONING m_partitioning; // current partitioning. IsoLines overrides for line density - D3D11_TESSELLATOR_OUTPUT_PRIMITIVE m_outputPrimitive; - D3D11_TESSELLATOR_REDUCTION m_insideTessFactorReduction; - D3D11_TESSELLATOR_QUAD_REDUCTION_AXIS m_quadInsideTessFactorReductionAxis; - float m_LastComputedTessFactors[6]; // TessFactors used for last tessellation - float m_LastUnRoundedComputedTessFactors[6]; // TessFactors used for last tessellation (before they were rounded) - bool IntegerPartitioning() {return (m_partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER) ? true : false;} - bool Pow2Partitioning() {return (m_partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2)? true : false;} - void ClampTessFactor(float& TessFactor); - void RoundUpTessFactor(float& TessFactor); - void CleanupFloatTessFactor(float& input); // clamp float to [1.0f... +INF] (incl NaN->1.0f) - void ClampFloatTessFactorScale(float& input); // clamp float to [0.0f... +INF] (incl NaN->0.0f) - - // Tessellation parity control - bool Odd() {return (m_parity == TESSELLATOR_PARITY_ODD) ? 
true : false;} - void SetTessellationParity(TESSELLATOR_PARITY parity) {m_parity = parity;} - - // Tesselation Partitioning control - void RestorePartitioning() {m_partitioning = m_originalPartitioning;}; - void OverridePartitioning(D3D11_TESSELLATOR_PARTITIONING partitioning) {m_partitioning = partitioning;} //isoline uses this for density - - void IsoLineHLSLProcessTessFactors( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail ); - void TriHLSLProcessTessFactors( float tessFactor_Ueq0, float TessFactor_Veq0, float TessFactor_Weq0, float insideTessFactor ); - void QuadHLSLProcessTessFactors( float TessFactor_Ueq0, float TessFactor_Veq0, float TessFactor_Ueq1, float TessFactor_Veq1, - float insideTessFactor_U, float insideTessFactor_V ); - -}; - diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp deleted file mode 100644 index 8d4104f0af1..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ /dev/null @@ -1,1423 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ****************************************************************************/ - -#include <stdio.h> -#include <thread> -#include <algorithm> -#include <float.h> -#include <vector> -#include <utility> -#include <fstream> -#include <string> - -#if defined(__linux__) || defined(__gnu_linux__) || defined(__APPLE__) -#include <pthread.h> -#include <sched.h> -#include <unistd.h> -#endif - -#ifdef __APPLE__ -#include <sys/types.h> -#include <sys/sysctl.h> -#endif - -#include "common/os.h" -#include "core/api.h" -#include "context.h" -#include "frontend.h" -#include "backend.h" -#include "rasterizer.h" -#include "rdtsc_core.h" -#include "tilemgr.h" -#include "tileset.h" - - -// ThreadId -struct Core -{ - uint32_t procGroup = 0; - std::vector<uint32_t> threadIds; -}; - -struct NumaNode -{ - uint32_t numaId; - std::vector<Core> cores; -}; - -typedef std::vector<NumaNode> CPUNumaNodes; - -void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup) -{ - out_nodes.clear(); - out_numThreadsPerProcGroup = 0; - -#if defined(_WIN32) - - std::vector<KAFFINITY> threadMaskPerProcGroup; - - static std::mutex m; - std::lock_guard<std::mutex> l(m); - - DWORD bufSize = 0; - - BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize); - SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER); - - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = - (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize); - 
SWR_ASSERT(pBufferMem); - - ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize); - SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information"); - - uint32_t count = bufSize / pBufferMem->Size; - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem; - - for (uint32_t i = 0; i < count; ++i) - { - SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore); - for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g) - { - auto& gmask = pBuffer->Processor.GroupMask[g]; - uint32_t threadId = 0; - uint32_t procGroup = gmask.Group; - - Core* pCore = nullptr; - - while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask)) - { - // clear mask - KAFFINITY threadMask = KAFFINITY(1) << threadId; - gmask.Mask &= ~threadMask; - - if (procGroup >= threadMaskPerProcGroup.size()) - { - threadMaskPerProcGroup.resize(procGroup + 1); - } - - if (threadMaskPerProcGroup[procGroup] & threadMask) - { - // Already seen this mask. This means that we are in 32-bit mode and - // have seen more than 32 HW threads for this procGroup - // Don't use it -#if defined(_WIN64) - SWR_INVALID("Shouldn't get here in 64-bit mode"); -#endif - continue; - } - - threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId); - - // Find Numa Node - uint32_t numaId = 0; - PROCESSOR_NUMBER procNum = {}; - procNum.Group = WORD(procGroup); - procNum.Number = UCHAR(threadId); - - ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId); - SWR_ASSERT(ret); - - // Store data - if (out_nodes.size() <= numaId) - { - out_nodes.resize(numaId + 1); - } - auto& numaNode = out_nodes[numaId]; - numaNode.numaId = numaId; - - if (nullptr == pCore) - { - numaNode.cores.push_back(Core()); - pCore = &numaNode.cores.back(); - pCore->procGroup = procGroup; - } - pCore->threadIds.push_back(threadId); - if (procGroup == 0) - { - out_numThreadsPerProcGroup++; - } - } - } - pBuffer = PtrAdd(pBuffer, pBuffer->Size); - } - - free(pBufferMem); - -#elif defined(__linux__) 
|| defined(__gnu_linux__) - - // Parse /proc/cpuinfo to get full topology - std::ifstream input("/proc/cpuinfo"); - std::string line; - char* c; - uint32_t procId = uint32_t(-1); - uint32_t coreId = uint32_t(-1); - uint32_t physId = uint32_t(-1); - - while (std::getline(input, line)) - { - if (line.find("processor") != std::string::npos) - { - auto data_start = line.find(": ") + 2; - procId = std::strtoul(&line.c_str()[data_start], &c, 10); - continue; - } - if (line.find("core id") != std::string::npos) - { - auto data_start = line.find(": ") + 2; - coreId = std::strtoul(&line.c_str()[data_start], &c, 10); - continue; - } - if (line.find("physical id") != std::string::npos) - { - auto data_start = line.find(": ") + 2; - physId = std::strtoul(&line.c_str()[data_start], &c, 10); - continue; - } - if (line.length() == 0) - { - if (physId + 1 > out_nodes.size()) - out_nodes.resize(physId + 1); - auto& numaNode = out_nodes[physId]; - numaNode.numaId = physId; - - if (coreId + 1 > numaNode.cores.size()) - numaNode.cores.resize(coreId + 1); - auto& core = numaNode.cores[coreId]; - core.procGroup = coreId; - core.threadIds.push_back(procId); - } - } - - out_numThreadsPerProcGroup = 0; - for (auto& node : out_nodes) - { - for (auto& core : node.cores) - { - out_numThreadsPerProcGroup += core.threadIds.size(); - } - } - -#elif defined(__APPLE__) - - auto numProcessors = 0; - auto numCores = 0; - auto numPhysicalIds = 0; - - int value; - size_t size = sizeof(value); - - int result = sysctlbyname("hw.packages", &value, &size, NULL, 0); - SWR_ASSERT(result == 0); - numPhysicalIds = value; - - result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0); - SWR_ASSERT(result == 0); - numProcessors = value; - - result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0); - SWR_ASSERT(result == 0); - numCores = value; - - out_nodes.resize(numPhysicalIds); - - for (auto physId = 0; physId < numPhysicalIds; ++physId) - { - auto& numaNode = out_nodes[physId]; - auto procId = 0; 
- - numaNode.cores.resize(numCores); - - while (procId < numProcessors) - { - for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId) - { - auto& core = numaNode.cores[coreId]; - - core.procGroup = coreId; - core.threadIds.push_back(procId); - } - } - } - - out_numThreadsPerProcGroup = 0; - - for (auto& node : out_nodes) - { - for (auto& core : node.cores) - { - out_numThreadsPerProcGroup += core.threadIds.size(); - } - } - -#else - -#error Unsupported platform - -#endif - - // Prune empty cores and numa nodes - for (auto node_it = out_nodes.begin(); node_it != out_nodes.end();) - { - // Erase empty cores (first) - for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end();) - { - if (core_it->threadIds.size() == 0) - { - core_it = node_it->cores.erase(core_it); - } - else - { - ++core_it; - } - } - - // Erase empty numa nodes (second) - if (node_it->cores.size() == 0) - { - node_it = out_nodes.erase(node_it); - } - else - { - ++node_it; - } - } -} - -void bindThread(SWR_CONTEXT* pContext, - uint32_t threadId, - uint32_t procGroupId = 0, - bool bindProcGroup = false) -{ - // Only bind threads when MAX_WORKER_THREADS isn't set. - if (pContext->threadInfo.SINGLE_THREADED || - (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false)) - { - return; - } - -#if defined(_WIN32) - - GROUP_AFFINITY affinity = {}; - affinity.Group = procGroupId; - -#if !defined(_WIN64) - if (threadId >= 32) - { - // Hopefully we don't get here. Logic in CreateThreadPool should prevent this. - SWR_INVALID("Shouldn't get here"); - - // In a 32-bit process on Windows it is impossible to bind - // to logical processors 32-63 within a processor group. - // In this case set the mask to 0 and let the system assign - // the processor. Hopefully it will make smart choices. - affinity.Mask = 0; - } - else -#endif - { - // If MAX_WORKER_THREADS is set, only bind to the proc group, - // Not the individual HW thread. 
- if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS) - { - affinity.Mask = KAFFINITY(1) << threadId; - } - else - { - affinity.Mask = KAFFINITY(0); - } - } - - if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr)) - { - SWR_INVALID("Failed to set Thread Affinity"); - } - -#elif defined(__linux__) || defined(__gnu_linux__) - - cpu_set_t cpuset; - pthread_t thread = pthread_self(); - CPU_ZERO(&cpuset); - CPU_SET(threadId, &cpuset); - - int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); - if (err != 0) - { - fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", threadId, strerror(err)); - } - -#endif -} - -INLINE -uint32_t GetEnqueuedDraw(SWR_CONTEXT* pContext) -{ - return pContext->dcRing.GetHead(); -} - -INLINE -DRAW_CONTEXT* GetDC(SWR_CONTEXT* pContext, uint32_t drawId) -{ - return &pContext->dcRing[(drawId - 1) % pContext->MAX_DRAWS_IN_FLIGHT]; -} - -INLINE -bool IDComparesLess(uint32_t a, uint32_t b) -{ - // Use signed delta to ensure that wrap-around to 0 is correctly handled. - int32_t delta = int32_t(a - b); - return (delta < 0); -} - -// returns true if dependency not met -INLINE -bool CheckDependency(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw) -{ - return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1); -} - -bool CheckDependencyFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw) -{ - return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Update client stats. -INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC) -{ - if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false)) - { - return; - } - - DRAW_DYNAMIC_STATE& dynState = pDC->dynState; - OSALIGNLINE(SWR_STATS) stats{0}; - - // Sum up stats across all workers before sending to client. 
- for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) - { - stats.DepthPassCount += dynState.pStats[i].DepthPassCount; - stats.PsInvocations += dynState.pStats[i].PsInvocations; - stats.CsInvocations += dynState.pStats[i].CsInvocations; - - } - - - pContext->pfnUpdateStats(GetPrivateState(pDC), &stats); -} - -INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC) -{ - UpdateClientStats(pContext, workerId, pDC); - - if (pDC->retireCallback.pfnCallbackFunc) - { - pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData, - pDC->retireCallback.userData2, - pDC->retireCallback.userData3); - - // Callbacks to external code *could* change floating point control state - // Reset our optimal flags - SetOptimalVectorCSR(); - } -} - -// inlined-only version -INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC) -{ - int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone)); - SWR_ASSERT(result >= 0); - - AR_FLUSH(pDC->drawId); - - if (result == 0) - { - ExecuteCallbacks(pContext, workerId, pDC); - - - // Cleanup memory allocations - pDC->pArena->Reset(true); - if (!pDC->isCompute) - { - pDC->pTileMgr->initialize(); - } - if (pDC->cleanupState) - { - pDC->pState->pArena->Reset(true); - } - - _ReadWriteBarrier(); - - pContext->dcRing.Dequeue(); // Remove from tail - } - - return result; -} - -// available to other translation modules -int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) -{ - return CompleteDrawContextInl(pContext, 0, pDC); -} - -INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, - uint32_t workerId, - uint32_t& curDrawBE, - uint32_t& drawEnqueued) -{ - // increment our current draw id to the first incomplete draw - drawEnqueued = GetEnqueuedDraw(pContext); - while (IDComparesLess(curDrawBE, drawEnqueued)) - { - DRAW_CONTEXT* pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT]; - - // If its not compute and FE 
is not done then break out of loop. - if (!pDC->doneFE && !pDC->isCompute) - break; - - bool isWorkComplete = - pDC->isCompute ? pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete(); - - if (isWorkComplete) - { - curDrawBE++; - CompleteDrawContextInl(pContext, workerId, pDC); - } - else - { - break; - } - } - - // If there are no more incomplete draws then return false. - return IDComparesLess(curDrawBE, drawEnqueued); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief If there is any BE work then go work on it. -/// @param pContext - pointer to SWR context. -/// @param workerId - The unique worker ID that is assigned to this thread. -/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker -/// thread -/// has its own curDrawBE counter and this ensures that each worker processes all -/// the draws in order. -/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its -/// own set and each time it fails to lock a macrotile, because its already -/// locked, then it will add that tile to the lockedTiles set. As a worker -/// begins to work on future draws the lockedTiles ensure that it doesn't work -/// on tiles that may still have work pending in a previous draw. Additionally, -/// the lockedTiles is heuristic that can steer a worker back to the same -/// macrotile that it had been working on in a previous draw. -/// @returns true if worker thread should shutdown -bool WorkOnFifoBE(SWR_CONTEXT* pContext, - uint32_t workerId, - uint32_t& curDrawBE, - TileSet& lockedTiles, - uint32_t numaNode, - uint32_t numaMask) -{ - bool bShutdown = false; - - // Find the first incomplete draw that has pending work. If no such draw is found then - // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. 
- uint32_t drawEnqueued = 0; - if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false) - { - return false; - } - - uint32_t lastRetiredDraw = - pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1; - - // Reset our history for locked tiles. We'll have to re-learn which tiles are locked. - lockedTiles.clear(); - - // Try to work on each draw in order of the available draws in flight. - // 1. If we're on curDrawBE, we can work on any macrotile that is available. - // 2. If we're trying to work on draws after curDrawBE, we are restricted to - // working on those macrotiles that are known to be complete in the prior draw to - // maintain order. The locked tiles provides the history to ensures this. - for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i) - { - DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT]; - - if (pDC->isCompute) - return false; // We don't look at compute work. - - // First wait for FE to be finished with this draw. This keeps threading model simple - // but if there are lots of bubbles between draws then serializing FE and BE may - // need to be revisited. - if (!pDC->doneFE) - return false; - - // If this draw is dependent on a previous draw then we need to bail. - if (CheckDependency(pContext, pDC, lastRetiredDraw)) - { - return false; - } - - // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it. 
- auto& macroTiles = pDC->pTileMgr->getDirtyTiles(); - - for (auto tile : macroTiles) - { - uint32_t tileID = tile->mId; - - // Only work on tiles for this numa node - uint32_t x, y; - pDC->pTileMgr->getTileIndices(tileID, x, y); - if (((x ^ y) & numaMask) != numaNode) - { - _mm_pause(); - continue; - } - - if (!tile->getNumQueued()) - { - _mm_pause(); - continue; - } - - // can only work on this draw if it's not in use by other threads - if (lockedTiles.get(tileID)) - { - _mm_pause(); - continue; - } - - if (tile->tryLock()) - { - BE_WORK* pWork; - - RDTSC_BEGIN(pContext->pBucketMgr, WorkerFoundWork, pDC->drawId); - - uint32_t numWorkItems = tile->getNumQueued(); - SWR_ASSERT(numWorkItems); - - pWork = tile->peek(); - SWR_ASSERT(pWork); - if (pWork->type == DRAW) - { - pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID); - } - else if (pWork->type == SHUTDOWN) - { - bShutdown = true; - } - - while ((pWork = tile->peek()) != nullptr) - { - pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); - tile->dequeue(); - } - RDTSC_END(pContext->pBucketMgr, WorkerFoundWork, numWorkItems); - - _ReadWriteBarrier(); - - pDC->pTileMgr->markTileComplete(tileID); - - // Optimization: If the draw is complete and we're the last one to have worked on it - // then we can reset the locked list as we know that all previous draws before the - // next are guaranteed to be complete. - if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete())) - { - // We can increment the current BE and safely move to next draw since we know - // this draw is complete. - curDrawBE++; - CompleteDrawContextInl(pContext, workerId, pDC); - - lastRetiredDraw++; - - lockedTiles.clear(); - break; - } - - if (bShutdown) - { - break; - } - } - else - { - // This tile is already locked. So let's add it to our locked tiles set. This way we - // don't try locking this one again. 
- lockedTiles.set(tileID); - _mm_pause(); - } - } - } - - return bShutdown; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Called when FE work is complete for this DC. -INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC) -{ - if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE) - { - SWR_STATS_FE& stats = pDC->dynState.statsFE; - - AR_EVENT(FrontendStatsEvent(pDC->drawId, - stats.IaVertices, - stats.IaPrimitives, - stats.VsInvocations, - stats.HsInvocations, - stats.DsInvocations, - stats.GsInvocations, - stats.GsPrimitives, - stats.CInvocations, - stats.CPrimitives, - stats.SoPrimStorageNeeded[0], - stats.SoPrimStorageNeeded[1], - stats.SoPrimStorageNeeded[2], - stats.SoPrimStorageNeeded[3], - stats.SoNumPrimsWritten[0], - stats.SoNumPrimsWritten[1], - stats.SoNumPrimsWritten[2], - stats.SoNumPrimsWritten[3])); - AR_EVENT(FrontendDrawEndEvent(pDC->drawId)); - - pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats); - } - - if (pContext->pfnUpdateSoWriteOffset) - { - for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i) - { - if ((pDC->dynState.SoWriteOffsetDirty[i]) && - (pDC->pState->state.soBuffer[i].soWriteEnable)) - { - pContext->pfnUpdateSoWriteOffset( - GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]); - } - } - } - - if (pContext->pfnUpdateStreamOut) - pContext->pfnUpdateStreamOut(GetPrivateState(pDC), pDC->dynState.soPrims); - - // Ensure all streaming writes are globally visible before marking this FE done - _mm_mfence(); - pDC->doneFE = true; - - InterlockedDecrement(&pContext->drawsOutstandingFE); -} - -void WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE) -{ - // Try to grab the next DC from the ring - uint32_t drawEnqueued = GetEnqueuedDraw(pContext); - while (IDComparesLess(curDrawFE, drawEnqueued)) - { - uint32_t dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT; - DRAW_CONTEXT* pDC = &pContext->dcRing[dcSlot]; - if 
(pDC->isCompute || pDC->doneFE) - { - CompleteDrawContextInl(pContext, workerId, pDC); - curDrawFE++; - } - else - { - break; - } - } - - uint32_t lastRetiredFE = curDrawFE - 1; - uint32_t curDraw = curDrawFE; - while (IDComparesLess(curDraw, drawEnqueued)) - { - uint32_t dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT; - DRAW_CONTEXT* pDC = &pContext->dcRing[dcSlot]; - - if (!pDC->FeLock && !pDC->isCompute) - { - if (CheckDependencyFE(pContext, pDC, lastRetiredFE)) - { - return; - } - - uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0); - if (initial == 0) - { - // successfully grabbed the DC, now run the FE - pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc); - - CompleteDrawFE(pContext, workerId, pDC); - } - else - { - _mm_pause(); - } - } - else - { - _mm_pause(); - } - - curDraw++; - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief If there is any compute work then go work on it. -/// @param pContext - pointer to SWR context. -/// @param workerId - The unique worker ID that is assigned to this thread. -/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker -/// thread -/// has its own curDrawBE counter and this ensures that each worker processes all -/// the draws in order. 
-void WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE)
-{
-    uint32_t drawEnqueued = 0;
-    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
-    {
-        return;
-    }
-
-    uint32_t lastRetiredDraw =
-        pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
-
-    // Draw ids are 32-bit and compared with wrap-around semantics, so the loop index is
-    // 32-bit as well (as in WorkOnFifoBE) rather than a wider type that gets narrowed on
-    // every IDComparesLess call.
-    for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
-    {
-        DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
-        if (pDC->isCompute == false)
-            return;
-
-        // check dependencies
-        if (CheckDependency(pContext, pDC, lastRetiredDraw))
-        {
-            return;
-        }
-
-        SWR_ASSERT(pDC->pDispatch != nullptr);
-        DispatchQueue& queue = *pDC->pDispatch;
-
-        // Is there any work remaining?
-        if (queue.getNumQueued() > 0)
-        {
-            void*    pSpillFillBuffer = nullptr;
-            void*    pScratchSpace    = nullptr;
-            uint32_t threadGroupId    = 0;
-            while (queue.getWork(threadGroupId))
-            {
-                queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
-                queue.finishedWork();
-            }
-
-            // Ensure all streaming writes are globally visible before moving onto the next draw
-            _mm_mfence();
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bind the calling API thread using the thread pool's reserved-thread data.
-/// @param pContext - pointer to SWR context; a null context is ignored.
-/// @param apiThreadId - index into pApiThreadData. Ids at or beyond numReservedThreads
-///                      are bound to the processor group of API thread 0, if any
-///                      reserved thread exists.
-void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId)
-{
-    if (nullptr == pContext)
-    {
-        return;
-    }
-
-    if (apiThreadId >= pContext->threadPool.numReservedThreads)
-    {
-        if (pContext->threadPool.numReservedThreads)
-        {
-            const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[0];
-            // Just bind to the process group used for API thread 0
-            bindThread(pContext, 0, threadData.procGroupId, true);
-        }
-        return;
-    }
-
-    const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[apiThreadId];
-
-    bindThread(
-        pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Worker thread main loop.
-/// @tparam IsFEThread - the thread runs WorkOnFifoFE.
-/// @tparam IsBEThread - the thread runs WorkOnFifoBE and WorkOnCompute.
-/// @param pData - pointer to this worker's THREAD_DATA.
-/// @returns 0 once a shutdown was seen and the thread has no outstanding work.
-template <bool IsFEThread, bool IsBEThread>
-DWORD workerThreadMain(LPVOID pData)
-{
-    THREAD_DATA* pThreadData = (THREAD_DATA*)pData;
-    SWR_CONTEXT* pContext    = pThreadData->pContext;
-    uint32_t     threadId    = pThreadData->threadId;
-    uint32_t     workerId    = pThreadData->workerId;
-
-    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
-
-    {
-        char threadName[64];
-        sprintf_s(threadName,
-#if defined(_WIN32)
-                  "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
-#else
-                  // linux pthread name limited to 16 chars (including \0)
-                  "w%03d-n%d-c%03d-t%d",
-#endif
-                  workerId,
-                  pThreadData->numaId,
-                  pThreadData->coreId,
-                  pThreadData->htId);
-        SetCurrentThreadName(threadName);
-    }
-
-    RDTSC_INIT(pContext->pBucketMgr, threadId);
-
-    // Only need offset numa index from base for correct masking
-    uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
-    uint32_t numaMask = pContext->threadPool.numaMask;
-
-    SetOptimalVectorCSR();
-
-    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
-    // locked then we'll add it to this list so that we don't try and lock it again.
-    TileSet lockedTiles;
-
-    // each worker has the ability to work on any of the queued draws as long as certain
-    // conditions are met. the data associated
-    // with a draw is guaranteed to be active as long as a worker hasn't signaled that he
-    // has moved on to the next draw when he determines there is no more work to do. The api
-    // thread will not increment the head of the dc ring until all workers have moved past the
-    // current head.
-    // the logic to determine what to work on is:
-    // 1- try to work on the FE any draw that is queued. For now there are no dependencies
-    //    on the FE work, so any worker can grab any FE and process in parallel.  Eventually
-    //    we'll need dependency tracking to force serialization on FEs.  The worker will try
-    //    to pick an FE by atomically incrementing a counter in the swr context.  he'll keep
-    //    trying until he reaches the tail.
-    // 2- BE work must be done in strict order. we accomplish this today by pulling work off
-    //    the oldest draw (ie the head) of the dcRing. the worker can determine if there is
-    //    any work left by comparing the total # of binned work items and the total # of completed
-    //    work items. If they are equal, then there is no more work to do for this draw, and
-    //    the worker can safely increment its oldestDraw counter and move on to the next draw.
-    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
-
-    // A thread has work whenever its draw counter has not caught up with the ring head.
-    auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
-
-    uint32_t curDrawBE = 0;
-    uint32_t curDrawFE = 0;
-
-    bool bShutdown = false;
-
-    while (true)
-    {
-        if (bShutdown && !threadHasWork(curDrawBE))
-        {
-            break;
-        }
-
-        // Spin briefly before falling back to the condition variable.
-        uint32_t loop = 0;
-        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
-        {
-            _mm_pause();
-        }
-
-        if (!threadHasWork(curDrawBE))
-        {
-            lock.lock();
-
-            // check for thread idle condition again under lock
-            if (threadHasWork(curDrawBE))
-            {
-                lock.unlock();
-                continue;
-            }
-
-            pContext->FifosNotEmpty.wait(lock);
-            lock.unlock();
-        }
-
-        if (IsBEThread)
-        {
-            RDTSC_BEGIN(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
-            bShutdown |=
-                WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
-            RDTSC_END(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
-
-            WorkOnCompute(pContext, workerId, curDrawBE);
-        }
-
-        if (IsFEThread)
-        {
-            WorkOnFifoFE(pContext, workerId, curDrawFE);
-
-            // An FE-only thread never advances curDrawBE itself, so mirror the FE counter
-            // to keep threadHasWork() meaningful.
-            if (!IsBEThread)
-            {
-                curDrawBE = curDrawFE;
-            }
-        }
-    }
-
-    return 0;
-}
-// A worker that is neither FE nor BE is not allowed.
-template <>
-DWORD workerThreadMain<false, false>(LPVOID) = delete;
-
-// Thread entry point: runs workerThreadMain, inside a structured exception handling block
-// when built with MSVC.
-template <bool IsFEThread, bool IsBEThread>
-DWORD workerThreadInit(LPVOID pData)
-{
-#if defined(_MSC_VER)
-    __try
-#endif // _MSC_VER
-    {
-        return workerThreadMain<IsFEThread, IsBEThread>(pData);
-    }
-
-#if defined(_MSC_VER)
-    __except (EXCEPTION_CONTINUE_SEARCH)
-    {
-    }
-
-#endif // _MSC_VER
-
-    return 1;
-}
-template <>
-DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
-
-static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
-{
-    // 
Initialize DRAW_CONTEXT's per-thread stats - for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc) - { - pContext->dcRing[dc].dynState.pStats = - (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64); - memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads); - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Creates thread pool info but doesn't launch threads. -/// @param pContext - pointer to context -/// @param pPool - pointer to thread pool object. -void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) -{ - CPUNumaNodes nodes; - uint32_t numThreadsPerProcGroup = 0; - CalculateProcessorTopology(nodes, numThreadsPerProcGroup); - assert(numThreadsPerProcGroup > 0); - - // Assumption, for asymmetric topologies, multi-threaded cores will appear - // in the list before single-threaded cores. This appears to be true for - // Windows when the total HW threads is limited to 64. - uint32_t numHWNodes = (uint32_t)nodes.size(); - uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size(); - uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size(); - -#if defined(_WIN32) && !defined(_WIN64) - if (!pContext->threadInfo.MAX_WORKER_THREADS) - { - // Limit 32-bit windows to bindable HW threads only - if ((numHWCoresPerNode * numHWHyperThreads) > 32) - { - numHWCoresPerNode = 32 / numHWHyperThreads; - } - } -#endif - - // Calculate num HW threads. Due to asymmetric topologies, this is not - // a trivial multiplication. 
- uint32_t numHWThreads = 0; - for (auto const& node : nodes) - { - for (auto const& core : node.cores) - { - numHWThreads += (uint32_t)core.threadIds.size(); - } - } - - uint32_t numNodes = numHWNodes; - uint32_t numCoresPerNode = numHWCoresPerNode; - uint32_t numHyperThreads = numHWHyperThreads; - - // Calc used threads per-core - if (numHyperThreads > pContext->threadInfo.BASE_THREAD) - { - numHyperThreads -= pContext->threadInfo.BASE_THREAD; - } - else - { - SWR_ASSERT(false, - "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0", - pContext->threadInfo.BASE_THREAD, - numHyperThreads); - pContext->threadInfo.BASE_THREAD = 0; - } - - if (pContext->threadInfo.MAX_THREADS_PER_CORE) - { - numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE); - } - - // Prune any cores that don't support the number of threads - if (numHyperThreads > 1) - { - for (auto& node : nodes) - { - uint32_t numUsableCores = 0; - for (auto& core : node.cores) - { - numUsableCores += (core.threadIds.size() >= numHyperThreads); - } - numCoresPerNode = std::min(numCoresPerNode, numUsableCores); - } - } - - // Calc used cores per NUMA node - if (numCoresPerNode > pContext->threadInfo.BASE_CORE) - { - numCoresPerNode -= pContext->threadInfo.BASE_CORE; - } - else - { - SWR_ASSERT(false, - "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0", - pContext->threadInfo.BASE_CORE, - numCoresPerNode); - pContext->threadInfo.BASE_CORE = 0; - } - - if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE) - { - numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE); - } - - // Calc used NUMA nodes - if (numNodes > pContext->threadInfo.BASE_NUMA_NODE) - { - numNodes -= pContext->threadInfo.BASE_NUMA_NODE; - } - else - { - SWR_ASSERT( - false, - "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0", - pContext->threadInfo.BASE_NUMA_NODE, - numNodes); - 
pContext->threadInfo.BASE_NUMA_NODE = 0; - } - - if (pContext->threadInfo.MAX_NUMA_NODES) - { - numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES); - } - - // Calculate numThreads - at this point everything should be symmetric - uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads; - SWR_REL_ASSERT(numThreads <= numHWThreads); - - uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads; - uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore; - uint32_t numRemovedThreads = 0; - - if (pContext->threadInfo.SINGLE_THREADED) - { - numAPIReservedThreads = 0; - numThreads = 1; - pContext->NumWorkerThreads = 1; - pContext->NumFEThreads = 1; - pContext->NumBEThreads = 1; - pPool->numThreads = 0; - } - else if (pContext->threadInfo.MAX_WORKER_THREADS) - { - numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads); - pContext->threadInfo.BASE_NUMA_NODE = 0; - pContext->threadInfo.BASE_CORE = 0; - pContext->threadInfo.BASE_THREAD = 0; - numAPIReservedThreads = 0; - } - else - { - if (numAPIReservedThreads >= numThreads) - { - numAPIReservedThreads = 0; - } - else if (numAPIReservedThreads) - { - numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads); - - if (0 == numAPIThreadsPerCore) - { - numAPIThreadsPerCore = numHWHyperThreads; - } - - numRemovedThreads = numAPIReservedThreads; - if (numAPIThreadsPerCore == 2 && numHyperThreads == 1) - { - // Adjust removed threads to make logic below work - numRemovedThreads = - std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2); - } - - numThreads -= numRemovedThreads; - } - } - - InitPerThreadStats(pContext, numThreads); - - if (pContext->threadInfo.SINGLE_THREADED) - { - numAPIReservedThreads = 0; - numThreads = 1; - } - - if (numAPIReservedThreads) - { - pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads]; - SWR_ASSERT(pPool->pApiThreadData); - if (!pPool->pApiThreadData) - { - 
numAPIReservedThreads = 0; - } - else - { - memset(pPool->pApiThreadData, 0, sizeof(THREAD_DATA) * numAPIReservedThreads); - } - } - pPool->numReservedThreads = numAPIReservedThreads; - - pPool->numThreads = numThreads; - pContext->NumWorkerThreads = pPool->numThreads; - - pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads]; - assert(pPool->pThreadData); - memset(pPool->pThreadData, 0, sizeof(THREAD_DATA) * pPool->numThreads); - pPool->numaMask = 0; - - // Allocate worker private data - pPool->pWorkerPrivateDataArray = nullptr; - if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0) - { - pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA); - pContext->workerPrivateState.pfnInitWorkerData = nullptr; - pContext->workerPrivateState.pfnFinishWorkerData = nullptr; - } - - // initialize contents of SWR_WORKER_DATA - size_t perWorkerSize = - AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64); - size_t totalSize = perWorkerSize * pPool->numThreads; - if (totalSize) - { - pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64); - SWR_ASSERT(pPool->pWorkerPrivateDataArray); - - void* pWorkerData = pPool->pWorkerPrivateDataArray; - for (uint32_t i = 0; i < pPool->numThreads; ++i) - { - pPool->pThreadData[i].pWorkerPrivateData = pWorkerData; - if (pContext->workerPrivateState.pfnInitWorkerData) - { - pContext->workerPrivateState.pfnInitWorkerData(pContext, pWorkerData, i); - } - pWorkerData = PtrAdd(pWorkerData, perWorkerSize); - } - } - - if (pContext->threadInfo.SINGLE_THREADED) - { - return; - } - - pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads]; - assert(pPool->pThreads); - - if (pContext->threadInfo.MAX_WORKER_THREADS) - { - bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup); - uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup; - // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads - 
// But Windows will still require binding to specific process groups - for (uint32_t workerId = 0; workerId < numThreads; ++workerId) - { - pPool->pThreadData[workerId].workerId = workerId; - pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups; - pPool->pThreadData[workerId].threadId = 0; - pPool->pThreadData[workerId].numaId = 0; - pPool->pThreadData[workerId].coreId = 0; - pPool->pThreadData[workerId].htId = 0; - pPool->pThreadData[workerId].pContext = pContext; - pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup; - - pContext->NumBEThreads++; - pContext->NumFEThreads++; - } - } - else - { - // numa distribution assumes workers on all nodes - bool useNuma = true; - if (numCoresPerNode * numHyperThreads == 1) - { - useNuma = false; - } - - if (useNuma) - { - pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.) - } - else - { - pPool->numaMask = 0; - } - - uint32_t workerId = 0; - uint32_t numReservedThreads = numAPIReservedThreads; - for (uint32_t n = 0; n < numNodes; ++n) - { - if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size()) - { - break; - } - auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE]; - uint32_t numCores = numCoresPerNode; - for (uint32_t c = 0; c < numCores; ++c) - { - if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size()) - { - break; - } - - auto& core = node.cores[c + pContext->threadInfo.BASE_CORE]; - for (uint32_t t = 0; t < numHyperThreads; ++t) - { - if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size()) - { - break; - } - - if (numRemovedThreads) - { - --numRemovedThreads; - assert(numReservedThreads); - --numReservedThreads; - pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU; - pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup; - pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t]; - pPool->pApiThreadData[numReservedThreads].numaId = - useNuma ? 
(n + pContext->threadInfo.BASE_NUMA_NODE) : 0; - pPool->pApiThreadData[numReservedThreads].coreId = - c + pContext->threadInfo.BASE_CORE; - pPool->pApiThreadData[numReservedThreads].htId = - t + pContext->threadInfo.BASE_THREAD; - pPool->pApiThreadData[numReservedThreads].pContext = pContext; - pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false; - - if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads) - { - --numReservedThreads; - pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU; - pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup; - pPool->pApiThreadData[numReservedThreads].threadId = - core.threadIds[t + 1]; - pPool->pApiThreadData[numReservedThreads].numaId = - useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0; - pPool->pApiThreadData[numReservedThreads].coreId = - c + pContext->threadInfo.BASE_CORE; - pPool->pApiThreadData[numReservedThreads].htId = - t + pContext->threadInfo.BASE_THREAD; - pPool->pApiThreadData[numReservedThreads].pContext = pContext; - pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false; - } - - continue; - } - - SWR_ASSERT(workerId < numThreads); - - pPool->pThreadData[workerId].workerId = workerId; - pPool->pThreadData[workerId].procGroupId = core.procGroup; - pPool->pThreadData[workerId].threadId = - core.threadIds[t + pContext->threadInfo.BASE_THREAD]; - pPool->pThreadData[workerId].numaId = - useNuma ? 
(n + pContext->threadInfo.BASE_NUMA_NODE) : 0; - pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE; - pPool->pThreadData[workerId].htId = t + pContext->threadInfo.BASE_THREAD; - pPool->pThreadData[workerId].pContext = pContext; - pPool->pThreadData[workerId].forceBindProcGroup = false; - - pContext->NumBEThreads++; - pContext->NumFEThreads++; - - ++workerId; - } - } - } - SWR_ASSERT(workerId == pContext->NumWorkerThreads); - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Launches worker threads in thread pool. -/// @param pContext - pointer to context -/// @param pPool - pointer to thread pool object. -void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) -{ - if (pContext->threadInfo.SINGLE_THREADED) - { - return; - } - - for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId) - { - pPool->pThreads[workerId] = - new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]); - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Destroys thread pool. -/// @param pContext - pointer to context -/// @param pPool - pointer to thread pool object. -void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) -{ - // Wait for all threads to finish - SwrWaitForIdle(pContext); - - // Wait for threads to finish and destroy them - for (uint32_t t = 0; t < pPool->numThreads; ++t) - { - if (!pContext->threadInfo.SINGLE_THREADED) - { - // Detach from thread. Cannot join() due to possibility (in Windows) of code - // in some DLLMain(THREAD_DETACH case) blocking the thread until after this returns. 
- pPool->pThreads[t]->detach(); - delete (pPool->pThreads[t]); - } - - if (pContext->workerPrivateState.pfnFinishWorkerData) - { - pContext->workerPrivateState.pfnFinishWorkerData( - pContext, pPool->pThreadData[t].pWorkerPrivateData, t); - } - } - - delete[] pPool->pThreads; - - // Clean up data used by threads - delete[] pPool->pThreadData; - delete[] pPool->pApiThreadData; - - AlignedFree(pPool->pWorkerPrivateDataArray); -} diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h deleted file mode 100644 index 3072bbc835d..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ /dev/null @@ -1,82 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file threads.h - * - * @brief Definitions for SWR threading model. - * - ******************************************************************************/ -#pragma once - -#include "knobs.h" - -#include <unordered_set> -#include <thread> -typedef std::thread* THREAD_PTR; - -struct SWR_CONTEXT; -struct DRAW_CONTEXT; -struct SWR_WORKER_PRIVATE_STATE; - -struct THREAD_DATA -{ - void* pWorkerPrivateData; // Pointer to per-worker private data - uint32_t procGroupId; // Will always be 0 for non-Windows OS - uint32_t threadId; // within the procGroup for Windows - uint32_t numaId; // NUMA node id - uint32_t coreId; // Core id - uint32_t htId; // Hyperthread id - uint32_t workerId; // index of worker in total thread data - void* clipperData; // pointer to hang clipper-private data on - SWR_CONTEXT* pContext; - bool forceBindProcGroup; // Only useful when MAX_WORKER_THREADS is set. -}; - -struct THREAD_POOL -{ - THREAD_PTR* pThreads; - uint32_t numThreads; - uint32_t numaMask; - THREAD_DATA* pThreadData; - void* pWorkerPrivateDataArray; // All memory for worker private data - uint32_t numReservedThreads; // Number of threads reserved for API use - THREAD_DATA* pApiThreadData; -}; - -struct TileSet; - -void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool); -void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool); -void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool); - -// Expose FE and BE worker functions to the API thread if single threaded -void WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE); -bool WorkOnFifoBE(SWR_CONTEXT* pContext, - uint32_t workerId, - uint32_t& curDrawBE, - TileSet& usedTiles, - uint32_t numaNode, - uint32_t numaMask); -void WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE); -int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC); - -void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId); diff --git 
a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp deleted file mode 100644 index a02fa336277..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp +++ /dev/null @@ -1,454 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file tilemgr.cpp - * - * @brief Implementation for Macro Tile Manager which provides the facilities - * for threads to work on an macro tile. 
- * - ******************************************************************************/ -#include <unordered_map> - -#include "fifo.hpp" -#include "core/tilemgr.h" -#include "core/multisample.h" -#include "rdtsc_core.h" - -MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) {} - -void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK* pWork) -{ - // Should not enqueue more then what we have backing for in the hot tile manager. - SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); - SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); - - if ((x & ~(KNOB_NUM_HOT_TILES_X - 1)) | (y & ~(KNOB_NUM_HOT_TILES_Y - 1))) - { - return; - } - - uint32_t id = getTileId(x, y); - - if (id >= mTiles.size()) - { - mTiles.resize((16 + id) * 2); - } - - MacroTileQueue* pTile = mTiles[id]; - if (!pTile) - { - pTile = mTiles[id] = new MacroTileQueue(); - } - pTile->mWorkItemsFE++; - pTile->mId = id; - - if (pTile->mWorkItemsFE == 1) - { - pTile->clear(mArena); - mDirtyTiles.push_back(pTile); - } - - mWorkItemsProduced++; - pTile->enqueue_try_nosync(mArena, pWork); -} - -void MacroTileMgr::markTileComplete(uint32_t id) -{ - SWR_ASSERT(mTiles.size() > id); - MacroTileQueue& tile = *mTiles[id]; - uint32_t numTiles = tile.mWorkItemsFE; - InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles); - - _ReadWriteBarrier(); - tile.mWorkItemsBE += numTiles; - SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE); - - // clear out tile, but defer fifo clear until the next DC first queues to it. 
- // this prevents worker threads from constantly locking a completed macro tile - tile.mWorkItemsFE = 0; - tile.mWorkItemsBE = 0; -} - -HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, - DRAW_CONTEXT* pDC, - HANDLE hWorkerPrivateData, - uint32_t macroID, - SWR_RENDERTARGET_ATTACHMENT attachment, - bool create, - uint32_t numSamples, - uint32_t renderTargetArrayIndex) -{ - uint32_t x, y; - MacroTileMgr::getTileIndices(macroID, x, y); - - SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); - SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); - - HotTileSet& tile = mHotTiles[x][y]; - HOTTILE& hotTile = tile.Attachment[attachment]; - if (hotTile.pBuffer == NULL) - { - if (create) - { - uint32_t size = numSamples * mHotTileSize[attachment]; - uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); - hotTile.pBuffer = - (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE); - hotTile.state = HOTTILE_INVALID; - hotTile.numSamples = numSamples; - hotTile.renderTargetArrayIndex = renderTargetArrayIndex; - } - else - { - return NULL; - } - } - else - { - // free the old tile and create a new one with enough space to hold all samples - if (numSamples > hotTile.numSamples) - { - // tile should be either uninitialized or resolved if we're deleting and switching to a - // new sample count - SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || (hotTile.state == HOTTILE_RESOLVED) || - (hotTile.state == HOTTILE_CLEAR)); - FreeHotTileMem(hotTile.pBuffer); - - uint32_t size = numSamples * mHotTileSize[attachment]; - uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); - hotTile.pBuffer = - (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE); - hotTile.state = HOTTILE_INVALID; - hotTile.numSamples = numSamples; - } - - // if requested render target array index isn't currently loaded, need to store out the - // current hottile and load the requested array slice - if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) - { - 
SWR_FORMAT format; - switch (attachment) - { - case SWR_ATTACHMENT_COLOR0: - case SWR_ATTACHMENT_COLOR1: - case SWR_ATTACHMENT_COLOR2: - case SWR_ATTACHMENT_COLOR3: - case SWR_ATTACHMENT_COLOR4: - case SWR_ATTACHMENT_COLOR5: - case SWR_ATTACHMENT_COLOR6: - case SWR_ATTACHMENT_COLOR7: - format = KNOB_COLOR_HOT_TILE_FORMAT; - break; - case SWR_ATTACHMENT_DEPTH: - format = KNOB_DEPTH_HOT_TILE_FORMAT; - break; - case SWR_ATTACHMENT_STENCIL: - format = KNOB_STENCIL_HOT_TILE_FORMAT; - break; - default: - SWR_INVALID("Unknown attachment: %d", attachment); - format = KNOB_COLOR_HOT_TILE_FORMAT; - break; - } - - if (hotTile.state == HOTTILE_CLEAR) - { - if (attachment == SWR_ATTACHMENT_STENCIL) - ClearStencilHotTile(&hotTile); - else if (attachment == SWR_ATTACHMENT_DEPTH) - ClearDepthHotTile(&hotTile); - else - ClearColorHotTile(&hotTile); - - hotTile.state = HOTTILE_DIRTY; - } - - if (hotTile.state == HOTTILE_DIRTY) - { - pContext->pfnStoreTile(pDC, - hWorkerPrivateData, - format, - attachment, - x * KNOB_MACROTILE_X_DIM, - y * KNOB_MACROTILE_Y_DIM, - hotTile.renderTargetArrayIndex, - hotTile.pBuffer); - } - - pContext->pfnLoadTile(pDC, - hWorkerPrivateData, - format, - attachment, - x * KNOB_MACROTILE_X_DIM, - y * KNOB_MACROTILE_Y_DIM, - renderTargetArrayIndex, - hotTile.pBuffer); - - hotTile.renderTargetArrayIndex = renderTargetArrayIndex; - hotTile.state = HOTTILE_RESOLVED; - } - } - return &tile.Attachment[attachment]; -} - -HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT* pContext, - DRAW_CONTEXT* pDC, - uint32_t macroID, - SWR_RENDERTARGET_ATTACHMENT attachment, - bool create, - uint32_t numSamples) -{ - uint32_t x, y; - MacroTileMgr::getTileIndices(macroID, x, y); - - SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); - SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); - - HotTileSet& tile = mHotTiles[x][y]; - HOTTILE& hotTile = tile.Attachment[attachment]; - if (hotTile.pBuffer == NULL) - { - if (create) - { - uint32_t size = numSamples * mHotTileSize[attachment]; - hotTile.pBuffer = 
(uint8_t*)AlignedMalloc(size, 64); - hotTile.state = HOTTILE_INVALID; - hotTile.numSamples = numSamples; - hotTile.renderTargetArrayIndex = 0; - } - else - { - return NULL; - } - } - - return &hotTile; -} - -void HotTileMgr::ClearColorHotTile( - const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. -{ - // Load clear color into SIMD register... - float* pClearData = (float*)(pHotTile->clearData); - simd16scalar valR = _simd16_broadcast_ss(&pClearData[0]); - simd16scalar valG = _simd16_broadcast_ss(&pClearData[1]); - simd16scalar valB = _simd16_broadcast_ss(&pClearData[2]); - simd16scalar valA = _simd16_broadcast_ss(&pClearData[3]); - - float* pfBuf = (float*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); - si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM) - { - _simd16_store_ps(pfBuf, valR); - pfBuf += KNOB_SIMD16_WIDTH; - - _simd16_store_ps(pfBuf, valG); - pfBuf += KNOB_SIMD16_WIDTH; - - _simd16_store_ps(pfBuf, valB); - pfBuf += KNOB_SIMD16_WIDTH; - - _simd16_store_ps(pfBuf, valA); - pfBuf += KNOB_SIMD16_WIDTH; - } - } - } -} - -void HotTileMgr::ClearDepthHotTile( - const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. -{ - // Load clear color into SIMD register... 
- float* pClearData = (float*)(pHotTile->clearData); - simd16scalar valZ = _simd16_broadcast_ss(&pClearData[0]); - - float* pfBuf = (float*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); - si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM) - { - _simd16_store_ps(pfBuf, valZ); - pfBuf += KNOB_SIMD16_WIDTH; - } - } - } -} - -void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) -{ - // convert from F32 to U8. - uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); - // broadcast 32x into __m256i... - simd16scalari valS = _simd16_set1_epi8(clearVal); - - simd16scalari* pBuf = (simd16scalari*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); - si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM * 4) - { - _simd16_store_si(pBuf, valS); - pBuf += 1; - } - } - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief InitializeHotTiles -/// for draw calls, we initialize the active hot tiles and perform deferred -/// load on them if tile is in invalid state. we do this in the outer thread -/// loop instead of inside the draw routine itself mainly for performance, -/// to avoid unnecessary setup every triangle -/// @todo support deferred clear -/// @param pCreateInfo - pointer to creation info. 
-void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, - DRAW_CONTEXT* pDC, - uint32_t workerId, - uint32_t macroID) -{ - const API_STATE& state = GetApiState(pDC); - HANDLE hWorkerPrivateData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - - uint32_t x, y; - MacroTileMgr::getTileIndices(macroID, x, y); - x *= KNOB_MACROTILE_X_DIM; - y *= KNOB_MACROTILE_Y_DIM; - - uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); - - // check RT if enabled - unsigned long rtSlot = 0; - uint32_t colorHottileEnableMask = state.colorHottileEnable; - while (_BitScanForward(&rtSlot, colorHottileEnableMask)) - { - HOTTILE* pHotTile = - GetHotTile(pContext, - pDC, - hWorkerPrivateData, - macroID, - (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), - true, - numSamples); - - if (pHotTile->state == HOTTILE_INVALID) - { - RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId); - // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(pDC, - hWorkerPrivateData, - KNOB_COLOR_HOT_TILE_FORMAT, - (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), - x, - y, - pHotTile->renderTargetArrayIndex, - pHotTile->pBuffer); - pHotTile->state = HOTTILE_RESOLVED; - RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0); - } - else if (pHotTile->state == HOTTILE_CLEAR) - { - RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId); - // Clear the tile. 
- ClearColorHotTile(pHotTile); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0); - } - colorHottileEnableMask &= ~(1 << rtSlot); - } - - // check depth if enabled - if (state.depthHottileEnable) - { - HOTTILE* pHotTile = GetHotTile( - pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); - if (pHotTile->state == HOTTILE_INVALID) - { - RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId); - // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(pDC, - hWorkerPrivateData, - KNOB_DEPTH_HOT_TILE_FORMAT, - SWR_ATTACHMENT_DEPTH, - x, - y, - pHotTile->renderTargetArrayIndex, - pHotTile->pBuffer); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0); - } - else if (pHotTile->state == HOTTILE_CLEAR) - { - RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId); - // Clear the tile. - ClearDepthHotTile(pHotTile); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0); - } - } - - // check stencil if enabled - if (state.stencilHottileEnable) - { - HOTTILE* pHotTile = GetHotTile( - pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); - if (pHotTile->state == HOTTILE_INVALID) - { - RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId); - // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(pDC, - hWorkerPrivateData, - KNOB_STENCIL_HOT_TILE_FORMAT, - SWR_ATTACHMENT_STENCIL, - x, - y, - pHotTile->renderTargetArrayIndex, - pHotTile->pBuffer); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0); - } - else if (pHotTile->state == HOTTILE_CLEAR) - { - RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId); - // Clear the tile. 
- ClearStencilHotTile(pHotTile); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0); - } - } -} diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h deleted file mode 100644 index fb8a4a14881..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h +++ /dev/null @@ -1,354 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file tilemgr.h - * - * @brief Definitions for Macro Tile Manager which provides the facilities - * for threads to work on an macro tile. 
- * - ******************************************************************************/ -#pragma once - -#include <set> -#include <unordered_map> -#include "common/formats.h" -#include "common/intrin.h" -#include "fifo.hpp" -#include "context.h" -#include "format_traits.h" - -////////////////////////////////////////////////////////////////////////// -/// MacroTile - work queue for a tile. -////////////////////////////////////////////////////////////////////////// -struct MacroTileQueue -{ - MacroTileQueue() {} - ~MacroTileQueue() { destroy(); } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Returns number of work items queued for this tile. - uint32_t getNumQueued() { return mFifo.getNumQueued(); } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Attempt to lock the work fifo. If already locked then return false. - bool tryLock() { return mFifo.tryLock(); } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Clear fifo and unlock it. - template <typename ArenaT> - void clear(ArenaT& arena) - { - mFifo.clear(arena); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Peek at work sitting at the front of the fifo. - BE_WORK* peek() { return mFifo.peek(); } - - template <typename ArenaT> - bool enqueue_try_nosync(ArenaT& arena, const BE_WORK* entry) - { - return mFifo.enqueue_try_nosync(arena, entry); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Move to next work item - void dequeue() { mFifo.dequeue_noinc(); } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Destroy fifo - void destroy() { mFifo.destroy(); } - - ///@todo This will all be private. 
- uint32_t mWorkItemsFE = 0; - uint32_t mWorkItemsBE = 0; - uint32_t mId = 0; - -private: - QUEUE<BE_WORK> mFifo; -}; - -////////////////////////////////////////////////////////////////////////// -/// MacroTileMgr - Manages macrotiles for a draw. -////////////////////////////////////////////////////////////////////////// -class MacroTileMgr -{ -public: - MacroTileMgr(CachingArena& arena); - ~MacroTileMgr() - { - for (auto* pTile : mTiles) - { - delete pTile; - } - } - - INLINE void initialize() - { - mWorkItemsProduced = 0; - mWorkItemsConsumed = 0; - - mDirtyTiles.clear(); - } - - INLINE std::vector<MacroTileQueue*>& getDirtyTiles() { return mDirtyTiles; } - void markTileComplete(uint32_t id); - - INLINE bool isWorkComplete() { return mWorkItemsProduced == mWorkItemsConsumed; } - - void enqueue(uint32_t x, uint32_t y, BE_WORK* pWork); - - static INLINE void getTileIndices(uint32_t tileID, uint32_t& x, uint32_t& y) - { - // Morton / Z order of tiles - x = pext_u32(tileID, 0x55555555); - y = pext_u32(tileID, 0xAAAAAAAA); - } - - static INLINE uint32_t getTileId(uint32_t x, uint32_t y) - { - // Morton / Z order of tiles - return pdep_u32(x, 0x55555555) | pdep_u32(y, 0xAAAAAAAA); - } - -private: - CachingArena& mArena; - std::vector<MacroTileQueue*> mTiles; - - // Any tile that has work queued to it is a dirty tile. 
- std::vector<MacroTileQueue*> mDirtyTiles; - - OSALIGNLINE(long) mWorkItemsProduced{0}; - OSALIGNLINE(volatile long) mWorkItemsConsumed{0}; -}; - -typedef void (*PFN_DISPATCH)(DRAW_CONTEXT* pDC, - uint32_t workerId, - uint32_t threadGroupId, - void*& pSpillFillBuffer, - void*& pScratchSpace); - -////////////////////////////////////////////////////////////////////////// -/// DispatchQueue - work queue for dispatch -////////////////////////////////////////////////////////////////////////// -class DispatchQueue -{ -public: - DispatchQueue() {} - - ////////////////////////////////////////////////////////////////////////// - /// @brief Setup the producer consumer counts. - void initialize(uint32_t totalTasks, void* pTaskData, PFN_DISPATCH pfnDispatch) - { - // The available and outstanding counts start with total tasks. - // At the start there are N tasks available and outstanding. - // When both the available and outstanding counts have reached 0 then all work has - // completed. When a worker starts on a threadgroup then it decrements the available count. - // When a worker completes a threadgroup then it decrements the outstanding count. - - mTasksAvailable = totalTasks; - mTasksOutstanding = totalTasks; - - mpTaskData = pTaskData; - mPfnDispatch = pfnDispatch; - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Returns number of tasks available for this dispatch. - uint32_t getNumQueued() { return (mTasksAvailable > 0) ? mTasksAvailable : 0; } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Atomically decrement the work available count. If the result - // is greater than 0 then we can on the associated thread group. - // Otherwise, there is no more work to do. 
- bool getWork(uint32_t& groupId) - { - long result = InterlockedDecrement(&mTasksAvailable); - - if (result >= 0) - { - groupId = result; - return true; - } - - return false; - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Atomically decrement the outstanding count. A worker is notifying - /// us that he just finished some work. Also, return true if we're - /// the last worker to complete this dispatch. - bool finishedWork() - { - long result = InterlockedDecrement(&mTasksOutstanding); - SWR_ASSERT(result >= 0, "Should never oversubscribe work"); - - return (result == 0) ? true : false; - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Work is complete once both the available/outstanding counts have reached 0. - bool isWorkComplete() { return ((mTasksAvailable <= 0) && (mTasksOutstanding <= 0)); } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Return pointer to task data. - const void* GetTasksData() { return mpTaskData; } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Dispatches a unit of work - void dispatch(DRAW_CONTEXT* pDC, - uint32_t workerId, - uint32_t threadGroupId, - void*& pSpillFillBuffer, - void*& pScratchSpace) - { - SWR_ASSERT(mPfnDispatch != nullptr); - mPfnDispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace); - } - - void* mpTaskData{nullptr}; // The API thread will set this up and the callback task function - // will interpet this. - PFN_DISPATCH mPfnDispatch{nullptr}; // Function to call per dispatch - - OSALIGNLINE(volatile long) mTasksAvailable{0}; - OSALIGNLINE(volatile long) mTasksOutstanding{0}; -}; - -/// @note this enum needs to be kept in sync with SWR_TILE_STATE! 
-enum HOTTILE_STATE -{ - HOTTILE_INVALID, // tile is in uninitialized state and should be loaded with surface contents - // before rendering - HOTTILE_CLEAR, // tile should be cleared - HOTTILE_DIRTY, // tile has been rendered to - HOTTILE_RESOLVED, // tile is consistent with memory (either loaded or stored) -}; - -struct HOTTILE -{ - uint8_t* pBuffer; - HOTTILE_STATE state; - uint32_t clearData[4]; // May need to change based on pfnClearTile implementation. Reorder for - // alignment? - uint32_t numSamples; - uint32_t renderTargetArrayIndex; // current render target array index loaded -}; - -union HotTileSet -{ - struct - { - HOTTILE Color[SWR_NUM_RENDERTARGETS]; - HOTTILE Depth; - HOTTILE Stencil; - }; - HOTTILE Attachment[SWR_NUM_ATTACHMENTS]; -}; - -class HotTileMgr -{ -public: - HotTileMgr() - { - memset(mHotTiles, 0, sizeof(mHotTiles)); - - // cache hottile size - for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i) - { - mHotTileSize[i] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * - FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8; - } - mHotTileSize[SWR_ATTACHMENT_DEPTH] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * - FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8; - mHotTileSize[SWR_ATTACHMENT_STENCIL] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * - FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8; - } - - ~HotTileMgr() - { - for (int x = 0; x < KNOB_NUM_HOT_TILES_X; ++x) - { - for (int y = 0; y < KNOB_NUM_HOT_TILES_Y; ++y) - { - for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a) - { - FreeHotTileMem(mHotTiles[x][y].Attachment[a].pBuffer); - } - } - } - } - - void InitializeHotTiles(SWR_CONTEXT* pContext, - DRAW_CONTEXT* pDC, - uint32_t workerId, - uint32_t macroID); - - HOTTILE* GetHotTile(SWR_CONTEXT* pContext, - DRAW_CONTEXT* pDC, - HANDLE hWorkerData, - uint32_t macroID, - SWR_RENDERTARGET_ATTACHMENT attachment, - bool create, - uint32_t numSamples = 1, - uint32_t renderTargetArrayIndex = 0); - - HOTTILE* 
GetHotTileNoLoad(SWR_CONTEXT* pContext, - DRAW_CONTEXT* pDC, - uint32_t macroID, - SWR_RENDERTARGET_ATTACHMENT attachment, - bool create, - uint32_t numSamples = 1); - - static void ClearColorHotTile(const HOTTILE* pHotTile); - static void ClearDepthHotTile(const HOTTILE* pHotTile); - static void ClearStencilHotTile(const HOTTILE* pHotTile); - -private: - HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y]; - uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS]; - - void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode) - { - void* p = nullptr; -#if defined(_WIN32) - HANDLE hProcess = GetCurrentProcess(); - p = VirtualAllocExNuma( - hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode); -#else - p = AlignedMalloc(size, align); -#endif - - return p; - } - - void FreeHotTileMem(void* pBuffer) - { - if (pBuffer) - { -#if defined(_WIN32) - VirtualFree(pBuffer, 0, MEM_RELEASE); -#else - AlignedFree(pBuffer); -#endif - } - } -}; diff --git a/src/gallium/drivers/swr/rasterizer/core/tileset.h b/src/gallium/drivers/swr/rasterizer/core/tileset.h deleted file mode 100644 index e28c84d789f..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/tileset.h +++ /dev/null @@ -1,102 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2018 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file tileset.h - * - * @brief Custom bitset class for managing locked tiles - * - ******************************************************************************/ -#pragma once - -struct TileSet -{ - ~TileSet() - { - if (m_bits) - { - AlignedFree(m_bits); - } - } - INLINE void set(size_t idx) - { - _grow(idx); - size_t& word = _get_word(idx); - word |= (size_t(1) << (idx & BITS_OFFSET)); - m_maxSet = std::max(m_maxSet, idx + 1); - } - INLINE bool get(size_t idx) - { - if (idx >= m_size) - { - return false; - } - size_t word = _get_word(idx); - return 0 != (word & (size_t(1) << (idx & BITS_OFFSET))); - } - - INLINE void clear() - { - if (m_maxSet) - { - size_t num_words = (m_maxSet + BITS_OFFSET) / BITS_PER_WORD; - memset(m_bits, 0, sizeof(size_t) * num_words); - m_maxSet = 0; - } - } - -private: - static const size_t BITS_PER_WORD = sizeof(size_t) * 8; - static const size_t BITS_OFFSET = BITS_PER_WORD - 1; - - size_t m_size = 0; - size_t m_maxSet = 0; - size_t* m_bits = nullptr; - - INLINE size_t& _get_word(size_t idx) { return m_bits[idx / BITS_PER_WORD]; } - - void _grow(size_t idx) - { - if (idx < m_size) - { - return; - } - - size_t new_size = (1 + idx + BITS_OFFSET) & ~BITS_OFFSET; - size_t num_words = new_size / BITS_PER_WORD; - size_t* newBits = (size_t*)AlignedMalloc(sizeof(size_t) * num_words, 64); - size_t copy_words = 0; - - if (m_bits) - { - copy_words = (m_size + BITS_OFFSET) / BITS_PER_WORD; - num_words -= copy_words; - memcpy(newBits, m_bits, copy_words * sizeof(size_t)); - - AlignedFree(m_bits); - } - - m_bits = newBits; - m_size = new_size; - - memset(&m_bits[copy_words], 0, sizeof(size_t) * num_words); - } -}; diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h deleted file mode 100644 index 9b483776be9..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ /dev/null @@ -1,392 +0,0 @@ -/**************************************************************************** - * 
Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file utils.h - * - * @brief Utilities used by SWR core. 
- * - ******************************************************************************/ -#pragma once - -#include <string.h> -#include <type_traits> -#include <algorithm> -#include <array> -#include "common/os.h" -#include "common/intrin.h" -#include "common/swr_assert.h" -#include "core/api.h" - -struct simdBBox -{ - simdscalari ymin; - simdscalari ymax; - simdscalari xmin; - simdscalari xmax; -}; - -struct simd16BBox -{ - simd16scalari ymin; - simd16scalari ymax; - simd16scalari xmin; - simd16scalari xmax; -}; - -template <typename SIMD_T> -struct SIMDBBOX_T -{ - typename SIMD_T::Integer ymin; - typename SIMD_T::Integer ymax; - typename SIMD_T::Integer xmin; - typename SIMD_T::Integer xmax; -}; - -// helper function to unroll loops -template <int Begin, int End, int Step = 1> -struct UnrollerL -{ - template <typename Lambda> - INLINE static void step(Lambda& func) - { - func(Begin); - UnrollerL<Begin + Step, End, Step>::step(func); - } -}; - -template <int End, int Step> -struct UnrollerL<End, End, Step> -{ - template <typename Lambda> - static void step(Lambda& func) - { - } -}; - -// helper function to unroll loops, with mask to skip specific iterations -template <int Begin, int End, int Step = 1, int Mask = 0x7f> -struct UnrollerLMask -{ - template <typename Lambda> - INLINE static void step(Lambda& func) - { - if (Mask & (1 << Begin)) - { - func(Begin); - } - UnrollerL<Begin + Step, End, Step>::step(func); - } -}; - -template <int End, int Step, int Mask> -struct UnrollerLMask<End, End, Step, Mask> -{ - template <typename Lambda> - static void step(Lambda& func) - { - } -}; - -// general CRC compute -INLINE -uint32_t ComputeCRC(uint32_t crc, const void* pData, uint32_t size) -{ -#if defined(_WIN64) || defined(__x86_64__) - uint32_t sizeInQwords = size / sizeof(uint64_t); - uint32_t sizeRemainderBytes = size % sizeof(uint64_t); - uint64_t* pDataWords = (uint64_t*)pData; - for (uint32_t i = 0; i < sizeInQwords; ++i) - { - crc = (uint32_t)_mm_crc32_u64(crc, 
*pDataWords++); - } -#else - uint32_t sizeInDwords = size / sizeof(uint32_t); - uint32_t sizeRemainderBytes = size % sizeof(uint32_t); - uint32_t* pDataWords = (uint32_t*)pData; - for (uint32_t i = 0; i < sizeInDwords; ++i) - { - crc = _mm_crc32_u32(crc, *pDataWords++); - } -#endif - - uint8_t* pRemainderBytes = (uint8_t*)pDataWords; - for (uint32_t i = 0; i < sizeRemainderBytes; ++i) - { - crc = _mm_crc32_u8(crc, *pRemainderBytes++); - } - - return crc; -} - -////////////////////////////////////////////////////////////////////////// -/// Check specified bit within a data word -////////////////////////////////////////////////////////////////////////// -template <typename T> -INLINE static bool CheckBit(T word, uint32_t bit) -{ - return 0 != (word & (T(1) << bit)); -} - -////////////////////////////////////////////////////////////////////////// -/// Add byte offset to any-type pointer -////////////////////////////////////////////////////////////////////////// -template <typename T> -INLINE static T* PtrAdd(T* p, intptr_t offset) -{ - intptr_t intp = reinterpret_cast<intptr_t>(p); - return reinterpret_cast<T*>(intp + offset); -} - -////////////////////////////////////////////////////////////////////////// -/// Is a power-of-2? 
-////////////////////////////////////////////////////////////////////////// -template <typename T> -INLINE static bool IsPow2(T value) -{ - return value == (value & (T(0) - value)); -} - -////////////////////////////////////////////////////////////////////////// -/// Align down to specified alignment -/// Note: IsPow2(alignment) MUST be true -////////////////////////////////////////////////////////////////////////// -template <typename T1, typename T2> -INLINE static T1 AlignDownPow2(T1 value, T2 alignment) -{ - SWR_ASSERT(IsPow2(alignment)); - return value & ~T1(alignment - 1); -} - -////////////////////////////////////////////////////////////////////////// -/// Align up to specified alignment -/// Note: IsPow2(alignment) MUST be true -////////////////////////////////////////////////////////////////////////// -template <typename T1, typename T2> -INLINE static T1 AlignUpPow2(T1 value, T2 alignment) -{ - return AlignDownPow2(value + T1(alignment - 1), alignment); -} - -////////////////////////////////////////////////////////////////////////// -/// Align up ptr to specified alignment -/// Note: IsPow2(alignment) MUST be true -////////////////////////////////////////////////////////////////////////// -template <typename T1, typename T2> -INLINE static T1* AlignUpPow2(T1* value, T2 alignment) -{ - return reinterpret_cast<T1*>( - AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment)); -} - -////////////////////////////////////////////////////////////////////////// -/// Align down to specified alignment -////////////////////////////////////////////////////////////////////////// -template <typename T1, typename T2> -INLINE static T1 AlignDown(T1 value, T2 alignment) -{ - if (IsPow2(alignment)) - { - return AlignDownPow2(value, alignment); - } - return value - T1(value % alignment); -} - -////////////////////////////////////////////////////////////////////////// -/// Align down to specified alignment 
-////////////////////////////////////////////////////////////////////////// -template <typename T1, typename T2> -INLINE static T1* AlignDown(T1* value, T2 alignment) -{ - return (T1*)AlignDown(uintptr_t(value), alignment); -} - -////////////////////////////////////////////////////////////////////////// -/// Align up to specified alignment -/// Note: IsPow2(alignment) MUST be true -////////////////////////////////////////////////////////////////////////// -template <typename T1, typename T2> -INLINE static T1 AlignUp(T1 value, T2 alignment) -{ - return AlignDown(value + T1(alignment - 1), alignment); -} - -////////////////////////////////////////////////////////////////////////// -/// Align up to specified alignment -/// Note: IsPow2(alignment) MUST be true -////////////////////////////////////////////////////////////////////////// -template <typename T1, typename T2> -INLINE static T1* AlignUp(T1* value, T2 alignment) -{ - return AlignDown(PtrAdd(value, alignment - 1), alignment); -} - -////////////////////////////////////////////////////////////////////////// -/// Helper structure used to access an array of elements that don't -/// correspond to a typical word size. 
-////////////////////////////////////////////////////////////////////////// -template <typename T, size_t BitsPerElementT, size_t ArrayLenT> -class BitsArray -{ -private: - static const size_t BITS_PER_WORD = sizeof(size_t) * 8; - static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT; - static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD; - static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1; - - static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD, - "Element size must an integral fraction of pointer size"); - - size_t m_words[NUM_WORDS] = {}; - -public: - T operator[](size_t elementIndex) const - { - size_t word = m_words[elementIndex / ELEMENTS_PER_WORD]; - word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT); - return T(word & ELEMENT_MASK); - } -}; - -// Ranged integer argument for TemplateArgUnroller -template <typename T, T TMin, T TMax> -struct RangedArg -{ - T val; -}; - -template <uint32_t TMin, uint32_t TMax> -using IntArg = RangedArg<uint32_t, TMin, TMax>; - -// Recursive template used to auto-nest conditionals. Converts dynamic boolean function -// arguments to static template arguments. -template <typename TermT, typename... ArgsB> -struct TemplateArgUnroller -{ - //----------------------------------------- - // Boolean value - //----------------------------------------- - - // Last Arg Terminator - static typename TermT::FuncType GetFunc(bool bArg) - { - if (bArg) - { - return TermT::template GetFunc<ArgsB..., std::true_type>(); - } - - return TermT::template GetFunc<ArgsB..., std::false_type>(); - } - - // Recursively parse args - template <typename... TArgsT> - static typename TermT::FuncType GetFunc(bool bArg, TArgsT... 
remainingArgs) - { - if (bArg) - { - return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...); - } - - return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...); - } - - //----------------------------------------- - // Ranged value (within specified range) - //----------------------------------------- - - // Last Arg Terminator - template <typename T, T TMin, T TMax> - static typename TermT::FuncType GetFunc(RangedArg<T, TMin, TMax> iArg) - { - if (iArg.val == TMax) - { - return TermT::template GetFunc<ArgsB..., std::integral_constant<T, TMax>>(); - } - if (TMax > TMin) - { - return TemplateArgUnroller<TermT, ArgsB...>::GetFunc( - RangedArg<T, TMin, (T)(int(TMax) - 1)>{iArg.val}); - } - SWR_ASSUME(false); - return nullptr; - } - template <typename T, T TVal> - static typename TermT::FuncType GetFunc(RangedArg<T, TVal, TVal> iArg) - { - SWR_ASSERT(iArg.val == TVal); - return TermT::template GetFunc<ArgsB..., std::integral_constant<T, TVal>>(); - } - - // Recursively parse args - template <typename T, T TMin, T TMax, typename... TArgsT> - static typename TermT::FuncType GetFunc(RangedArg<T, TMin, TMax> iArg, TArgsT... remainingArgs) - { - if (iArg.val == TMax) - { - return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<T, TMax>>::GetFunc( - remainingArgs...); - } - if (TMax > TMin) - { - return TemplateArgUnroller<TermT, ArgsB...>::GetFunc( - RangedArg<T, TMin, (T)(int(TMax) - 1)>{iArg.val}, remainingArgs...); - } - SWR_ASSUME(false); - return nullptr; - } - template <typename T, T TVal, typename... TArgsT> - static typename TermT::FuncType GetFunc(RangedArg<T, TVal, TVal> iArg, TArgsT... 
remainingArgs) - { - SWR_ASSERT(iArg.val == TVal); - return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<T, TVal>>::GetFunc( - remainingArgs...); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// Helpers used to get / set environment variable -////////////////////////////////////////////////////////////////////////// -static INLINE std::string GetEnv(const std::string& variableName) -{ - std::string output; -#if defined(_WIN32) - uint32_t valueSize = GetEnvironmentVariableA(variableName.c_str(), nullptr, 0); - if (!valueSize) - return output; - output.resize(valueSize - 1); // valueSize includes null, output.resize() does not - GetEnvironmentVariableA(variableName.c_str(), &output[0], valueSize); -#else - char* env = getenv(variableName.c_str()); - output = env ? env : ""; -#endif - - return output; -} - -static INLINE void SetEnv(const std::string& variableName, const std::string& value) -{ -#if defined(_WIN32) - SetEnvironmentVariableA(variableName.c_str(), value.c_str()); -#else - setenv(variableName.c_str(), value.c_str(), true); -#endif -} - diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp deleted file mode 100644 index 44482939c76..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ /dev/null @@ -1,853 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file JitManager.cpp - * - * @brief Implementation if the Jit Manager. 
- * - * Notes: - * - ******************************************************************************/ -#include "jit_pch.hpp" - -#include "JitManager.h" -#include "jit_api.h" -#include "fetch_jit.h" - -#include "core/state.h" - -#include "gen_state_llvm.h" - -#include <sstream> -#if defined(_WIN32) -#include <psapi.h> -#include <cstring> - -#define INTEL_OUTPUT_DIR "c:\\Intel" -#define SWR_OUTPUT_DIR INTEL_OUTPUT_DIR "\\SWR" -#define JITTER_OUTPUT_DIR SWR_OUTPUT_DIR "\\Jitter" -#endif // _WIN32 - -#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) -#include <pwd.h> -#include <sys/stat.h> -#endif - - -using namespace llvm; -using namespace SwrJit; - -////////////////////////////////////////////////////////////////////////// -/// @brief Contructor for JitManager. -/// @param simdWidth - SIMD width to be used in generated program. -JitManager::JitManager(uint32_t simdWidth, const char* arch, const char* core) : - mContext(), mBuilder(mContext), mIsModuleFinalized(true), mJitNumber(0), mVWidth(simdWidth), - mArch(arch) -{ - mpCurrentModule = nullptr; - mpExec = nullptr; - - InitializeNativeTarget(); - InitializeNativeTargetAsmPrinter(); - InitializeNativeTargetDisassembler(); - - - // force JIT to use the same CPU arch as the rest of swr - if (mArch.AVX512F()) - { -#if USE_SIMD16_SHADERS - if (mArch.AVX512ER()) - { - mHostCpuName = StringRef("knl"); - } - else - { - mHostCpuName = StringRef("skylake-avx512"); - } - mUsingAVX512 = true; -#else - mHostCpuName = StringRef("core-avx2"); -#endif - if (mVWidth == 0) - { - mVWidth = 8; - } - } - else if (mArch.AVX2()) - { - mHostCpuName = StringRef("core-avx2"); - if (mVWidth == 0) - { - mVWidth = 8; - } - } - else if (mArch.AVX()) - { - if (mArch.F16C()) - { - mHostCpuName = StringRef("core-avx-i"); - } - else - { - mHostCpuName = StringRef("corei7-avx"); - } - if (mVWidth == 0) - { - mVWidth = 8; - } - } - else - { - SWR_INVALID("Jitting requires at least AVX ISA support"); - } - - 
- mOptLevel = CodeGenOpt::Aggressive; - - if (KNOB_JIT_OPTIMIZATION_LEVEL >= CodeGenOpt::None && - KNOB_JIT_OPTIMIZATION_LEVEL <= CodeGenOpt::Aggressive) - { - mOptLevel = CodeGenOpt::Level(KNOB_JIT_OPTIMIZATION_LEVEL); - } - - if (KNOB_JIT_ENABLE_CACHE) - { - mCache.Init(this, mHostCpuName, mOptLevel); - } - - SetupNewModule(); - mIsModuleFinalized = true; - - // fetch function signature -#if USE_SIMD16_SHADERS - // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simd16vertex& out); -#else - // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out); -#endif - std::vector<Type*> fsArgs; - - // llvm5 is picky and does not take a void * type - fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0)); - - fsArgs.push_back(Type::getInt8PtrTy(mContext)); - - fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0)); -#if USE_SIMD16_SHADERS - fsArgs.push_back(PointerType::get(Gen_simd16vertex(this), 0)); -#else - fsArgs.push_back(PointerType::get(Gen_simdvertex(this), 0)); -#endif - - mFetchShaderTy = FunctionType::get(Type::getVoidTy(mContext), fsArgs, false); - -#if defined(_MSC_VER) - // explicitly instantiate used symbols from potentially staticly linked libs - sys::DynamicLibrary::AddSymbol("exp2f", &exp2f); - sys::DynamicLibrary::AddSymbol("log2f", &log2f); - sys::DynamicLibrary::AddSymbol("sinf", &sinf); - sys::DynamicLibrary::AddSymbol("cosf", &cosf); - sys::DynamicLibrary::AddSymbol("powf", &powf); -#endif - -#if defined(_WIN32) - if (KNOB_DUMP_SHADER_IR) - { - CreateDirectoryPath(INTEL_OUTPUT_DIR); - CreateDirectoryPath(SWR_OUTPUT_DIR); - CreateDirectoryPath(JITTER_OUTPUT_DIR); - } -#endif -} - -void JitManager::CreateExecEngine(std::unique_ptr<Module> pModule) -{ - TargetOptions tOpts; - tOpts.AllowFPOpFusion = FPOpFusion::Fast; - tOpts.NoInfsFPMath = false; - tOpts.NoNaNsFPMath = false; - tOpts.UnsafeFPMath = false; - - // tOpts.PrintMachineCode = true; - - mpExec = 
EngineBuilder(std::move(pModule)) - .setTargetOptions(tOpts) - .setOptLevel(mOptLevel) - .setMCPU(mHostCpuName) - .create(); - - if (KNOB_JIT_ENABLE_CACHE) - { - mpExec->setObjectCache(&mCache); - } - -#if LLVM_USE_INTEL_JITEVENTS - JITEventListener* vTune = JITEventListener::createIntelJITEventListener(); - mpExec->RegisterJITEventListener(vTune); -#endif - - mvExecEngines.push_back(mpExec); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Create new LLVM module. -void JitManager::SetupNewModule() -{ - SWR_ASSERT(mIsModuleFinalized == true && "Current module is not finalized!"); - - std::unique_ptr<Module> newModule(new Module("", mContext)); - mpCurrentModule = newModule.get(); - mpCurrentModule->setTargetTriple(sys::getProcessTriple()); - CreateExecEngine(std::move(newModule)); - mIsModuleFinalized = false; -} - - -DIType* -JitManager::CreateDebugStructType(StructType* pType, - const std::string& name, - DIFile* pFile, - uint32_t lineNum, - const std::vector<std::pair<std::string, uint32_t>>& members) -{ - DIBuilder builder(*mpCurrentModule); - SmallVector<Metadata*, 8> ElemTypes; - DataLayout DL = DataLayout(mpCurrentModule); - uint32_t size = DL.getTypeAllocSizeInBits(pType); - uint32_t alignment = DL.getABITypeAlignment(pType); - DINode::DIFlags flags = DINode::DIFlags::FlagPublic; - - DICompositeType* pDIStructTy = builder.createStructType(pFile, - name, - pFile, - lineNum, - size, - alignment, - flags, - nullptr, - builder.getOrCreateArray(ElemTypes)); - - // Register mapping now to break loops (in case struct contains itself or pointers to itself) - mDebugStructMap[pType] = pDIStructTy; - - uint32_t idx = 0; - for (auto& elem : pType->elements()) - { - std::string name = members[idx].first; - uint32_t lineNum = members[idx].second; - size = DL.getTypeAllocSizeInBits(elem); - alignment = DL.getABITypeAlignment(elem); - uint32_t offset = DL.getStructLayout(pType)->getElementOffsetInBits(idx); - llvm::DIType* 
pDebugTy = GetDebugType(elem); - ElemTypes.push_back(builder.createMemberType( - pDIStructTy, name, pFile, lineNum, size, alignment, offset, flags, pDebugTy)); - - idx++; - } - - pDIStructTy->replaceElements(builder.getOrCreateArray(ElemTypes)); - return pDIStructTy; -} - -DIType* JitManager::GetDebugArrayType(Type* pTy) -{ - DIBuilder builder(*mpCurrentModule); - DataLayout DL = DataLayout(mpCurrentModule); - ArrayType* pArrayTy = cast<ArrayType>(pTy); - uint32_t size = DL.getTypeAllocSizeInBits(pArrayTy); - uint32_t alignment = DL.getABITypeAlignment(pArrayTy); - - SmallVector<Metadata*, 8> Elems; - Elems.push_back(builder.getOrCreateSubrange(0, pArrayTy->getNumElements())); - return builder.createArrayType( - size, alignment, GetDebugType(pArrayTy->getElementType()), builder.getOrCreateArray(Elems)); -} - -// Create a DIType from llvm Type -DIType* JitManager::GetDebugType(Type* pTy) -{ - DIBuilder builder(*mpCurrentModule); - Type::TypeID id = pTy->getTypeID(); - - switch (id) - { - case Type::VoidTyID: - return builder.createUnspecifiedType("void"); - break; - case Type::HalfTyID: - return builder.createBasicType("float16", 16, dwarf::DW_ATE_float); - break; - case Type::FloatTyID: - return builder.createBasicType("float", 32, dwarf::DW_ATE_float); - break; - case Type::DoubleTyID: - return builder.createBasicType("double", 64, dwarf::DW_ATE_float); - break; - case Type::IntegerTyID: - return GetDebugIntegerType(pTy); - break; - case Type::StructTyID: - return GetDebugStructType(pTy); - break; - case Type::ArrayTyID: - return GetDebugArrayType(pTy); - break; - case Type::PointerTyID: - return builder.createPointerType(GetDebugType(pTy->getPointerElementType()), 64, 64); - break; -#if LLVM_VERSION_MAJOR >= 11 - case Type::FixedVectorTyID: -#else - case Type::VectorTyID: -#endif - return GetDebugVectorType(pTy); - break; - case Type::FunctionTyID: - return GetDebugFunctionType(pTy); - break; - default: - SWR_ASSERT(false, "Unimplemented llvm type"); - } - return 
nullptr; -} - -// Create a DISubroutineType from an llvm FunctionType -DIType* JitManager::GetDebugFunctionType(Type* pTy) -{ - SmallVector<Metadata*, 8> ElemTypes; - FunctionType* pFuncTy = cast<FunctionType>(pTy); - DIBuilder builder(*mpCurrentModule); - - // Add result type - ElemTypes.push_back(GetDebugType(pFuncTy->getReturnType())); - - // Add arguments - for (auto& param : pFuncTy->params()) - { - ElemTypes.push_back(GetDebugType(param)); - } - - return builder.createSubroutineType(builder.getOrCreateTypeArray(ElemTypes)); -} - -DIType* JitManager::GetDebugIntegerType(Type* pTy) -{ - DIBuilder builder(*mpCurrentModule); - IntegerType* pIntTy = cast<IntegerType>(pTy); - switch (pIntTy->getBitWidth()) - { - case 1: - return builder.createBasicType("int1", 1, dwarf::DW_ATE_unsigned); - break; - case 8: - return builder.createBasicType("int8", 8, dwarf::DW_ATE_signed); - break; - case 16: - return builder.createBasicType("int16", 16, dwarf::DW_ATE_signed); - break; - case 32: - return builder.createBasicType("int", 32, dwarf::DW_ATE_signed); - break; - case 64: - return builder.createBasicType("int64", 64, dwarf::DW_ATE_signed); - break; - case 128: - return builder.createBasicType("int128", 128, dwarf::DW_ATE_signed); - break; - default: - SWR_ASSERT(false, "Unimplemented integer bit width"); - } - return nullptr; -} - -DIType* JitManager::GetDebugVectorType(Type* pTy) -{ - DIBuilder builder(*mpCurrentModule); -#if LLVM_VERSION_MAJOR >= 12 - FixedVectorType* pVecTy = cast<FixedVectorType>(pTy); -#elif LLVM_VERSION_MAJOR >= 11 - VectorType* pVecTy = cast<VectorType>(pTy); -#else - auto pVecTy = pTy; -#endif - DataLayout DL = DataLayout(mpCurrentModule); - uint32_t size = DL.getTypeAllocSizeInBits(pVecTy); - uint32_t alignment = DL.getABITypeAlignment(pVecTy); - SmallVector<Metadata*, 1> Elems; - -#if LLVM_VERSION_MAJOR >= 11 - Elems.push_back(builder.getOrCreateSubrange(0, pVecTy->getNumElements())); -#else - Elems.push_back(builder.getOrCreateSubrange(0, 
pVecTy->getVectorNumElements())); -#endif - - return builder.createVectorType(size, - alignment, -#if LLVM_VERSION_MAJOR >= 11 - GetDebugType(pVecTy->getElementType()), -#else - GetDebugType(pVecTy->getVectorElementType()), -#endif - builder.getOrCreateArray(Elems)); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Dump function x86 assembly to file. -/// @note This should only be called after the module has been jitted to x86 and the -/// module will not be further accessed. -void JitManager::DumpAsm(Function* pFunction, const char* fileName) -{ - if (KNOB_DUMP_SHADER_IR) - { -#if defined(_WIN32) - DWORD pid = GetCurrentProcessId(); - char procname[MAX_PATH]; - GetModuleFileNameA(NULL, procname, MAX_PATH); - const char* pBaseName = strrchr(procname, '\\'); - std::stringstream outDir; - outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends; - CreateDirectoryPath(outDir.str().c_str()); -#endif - - std::error_code EC; - Module* pModule = pFunction->getParent(); - const char* funcName = pFunction->getName().data(); - char fName[256]; -#if defined(_WIN32) - sprintf(fName, "%s\\%s.%s.asm", outDir.str().c_str(), funcName, fileName); -#else - sprintf(fName, "%s.%s.asm", funcName, fileName); -#endif - - raw_fd_ostream filestream(fName, EC, llvm::sys::fs::F_None); - - legacy::PassManager* pMPasses = new legacy::PassManager(); - auto* pTarget = mpExec->getTargetMachine(); - pTarget->Options.MCOptions.AsmVerbose = true; -#if LLVM_VERSION_MAJOR >= 10 - pTarget->addPassesToEmitFile( - *pMPasses, filestream, nullptr, CGFT_AssemblyFile); -#elif LLVM_VERSION_MAJOR >= 7 - pTarget->addPassesToEmitFile( - *pMPasses, filestream, nullptr, TargetMachine::CGFT_AssemblyFile); -#else - pTarget->addPassesToEmitFile(*pMPasses, filestream, TargetMachine::CGFT_AssemblyFile); -#endif - pMPasses->run(*pModule); - delete pMPasses; - pTarget->Options.MCOptions.AsmVerbose = false; - } -} - -std::string JitManager::GetOutputDir() -{ -#if 
defined(_WIN32) - DWORD pid = GetCurrentProcessId(); - char procname[MAX_PATH]; - GetModuleFileNameA(NULL, procname, MAX_PATH); - const char* pBaseName = strrchr(procname, '\\'); - std::stringstream outDir; - outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid; - CreateDirectoryPath(outDir.str().c_str()); - return outDir.str(); -#endif - return ""; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Dump function to file. -void JitManager::DumpToFile(Module* M, - const char* fileName, - llvm::AssemblyAnnotationWriter* annotater) -{ - if (KNOB_DUMP_SHADER_IR) - { - std::string outDir = GetOutputDir(); - - std::error_code EC; - const char* funcName = M->getName().data(); - char fName[256]; -#if defined(_WIN32) - sprintf(fName, "%s\\%s.%s.ll", outDir.c_str(), funcName, fileName); -#else - sprintf(fName, "%s.%s.ll", funcName, fileName); -#endif - raw_fd_ostream fd(fName, EC, llvm::sys::fs::F_None); - M->print(fd, annotater); - fd.flush(); - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Dump function to file. 
-void JitManager::DumpToFile(Function* f, const char* fileName) -{ - if (KNOB_DUMP_SHADER_IR) - { - std::string outDir = GetOutputDir(); - - std::error_code EC; - const char* funcName = f->getName().data(); - char fName[256]; -#if defined(_WIN32) - sprintf(fName, "%s\\%s.%s.ll", outDir.c_str(), funcName, fileName); -#else - sprintf(fName, "%s.%s.ll", funcName, fileName); -#endif - raw_fd_ostream fd(fName, EC, llvm::sys::fs::F_None); - f->print(fd, nullptr); - -#if defined(_WIN32) - sprintf(fName, "%s\\cfg.%s.%s.dot", outDir.c_str(), funcName, fileName); -#else - sprintf(fName, "cfg.%s.%s.dot", funcName, fileName); -#endif - fd.flush(); - - raw_fd_ostream fd_cfg(fName, EC, llvm::sys::fs::F_Text); - WriteGraph(fd_cfg, (const Function*)f); - - fd_cfg.flush(); - } -} - -extern "C" { -bool g_DllActive = true; - -////////////////////////////////////////////////////////////////////////// -/// @brief Create JIT context. -/// @param simdWidth - SIMD width to be used in generated program. -HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch, const char* core) -{ - return new JitManager(targetSimdWidth, arch, core); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Destroy JIT context. 
-void JITCALL JitDestroyContext(HANDLE hJitContext) -{ - if (g_DllActive) - { - delete reinterpret_cast<JitManager*>(hJitContext); - } -} -} - -////////////////////////////////////////////////////////////////////////// -/// JitCache -////////////////////////////////////////////////////////////////////////// - -////////////////////////////////////////////////////////////////////////// -/// JitCacheFileHeader -////////////////////////////////////////////////////////////////////////// -struct JitCacheFileHeader -{ - void Init(uint32_t llCRC, - uint32_t objCRC, - const std::string& moduleID, - const std::string& cpu, - uint32_t optLevel, - uint64_t objSize) - { - m_objSize = objSize; - m_llCRC = llCRC; - m_objCRC = objCRC; - strncpy(m_ModuleID, moduleID.c_str(), JC_STR_MAX_LEN - 1); - m_ModuleID[JC_STR_MAX_LEN - 1] = 0; - strncpy(m_Cpu, cpu.c_str(), JC_STR_MAX_LEN - 1); - m_Cpu[JC_STR_MAX_LEN - 1] = 0; - m_optLevel = optLevel; - } - - - bool - IsValid(uint32_t llCRC, const std::string& moduleID, const std::string& cpu, uint32_t optLevel) - { - if ((m_MagicNumber != JC_MAGIC_NUMBER) || (m_llCRC != llCRC) || - (m_platformKey != JC_PLATFORM_KEY) || (m_optLevel != optLevel)) - { - return false; - } - - m_ModuleID[JC_STR_MAX_LEN - 1] = 0; - if (strncmp(moduleID.c_str(), m_ModuleID, JC_STR_MAX_LEN - 1)) - { - return false; - } - - m_Cpu[JC_STR_MAX_LEN - 1] = 0; - if (strncmp(cpu.c_str(), m_Cpu, JC_STR_MAX_LEN - 1)) - { - return false; - } - - return true; - } - - uint64_t GetObjectSize() const { return m_objSize; } - uint64_t GetObjectCRC() const { return m_objCRC; } - -private: - static const uint64_t JC_MAGIC_NUMBER = 0xfedcba9876543210ULL + 7; - static const size_t JC_STR_MAX_LEN = 32; - static const uint32_t JC_PLATFORM_KEY = (LLVM_VERSION_MAJOR << 24) | - (LLVM_VERSION_MINOR << 16) | (LLVM_VERSION_PATCH << 8) | - ((sizeof(void*) > sizeof(uint32_t)) ? 
1 : 0); - - uint64_t m_MagicNumber = JC_MAGIC_NUMBER; - uint64_t m_objSize = 0; - uint32_t m_llCRC = 0; - uint32_t m_platformKey = JC_PLATFORM_KEY; - uint32_t m_objCRC = 0; - uint32_t m_optLevel = 0; - char m_ModuleID[JC_STR_MAX_LEN] = {}; - char m_Cpu[JC_STR_MAX_LEN] = {}; -}; - -static inline uint32_t ComputeModuleCRC(const llvm::Module* M) -{ - std::string bitcodeBuffer; - raw_string_ostream bitcodeStream(bitcodeBuffer); - -#if LLVM_VERSION_MAJOR >= 7 - llvm::WriteBitcodeToFile(*M, bitcodeStream); -#else - llvm::WriteBitcodeToFile(M, bitcodeStream); -#endif - // M->print(bitcodeStream, nullptr, false); - - bitcodeStream.flush(); - - return ComputeCRC(0, bitcodeBuffer.data(), bitcodeBuffer.size()); -} - -/// constructor -JitCache::JitCache() -{ -#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) - if (strncmp(KNOB_JIT_CACHE_DIR.c_str(), "~/", 2) == 0) - { - char* homedir; - if (!(homedir = getenv("HOME"))) - { - homedir = getpwuid(getuid())->pw_dir; - } - mCacheDir = homedir; - mCacheDir += (KNOB_JIT_CACHE_DIR.c_str() + 1); - } - else -#endif - { - mCacheDir = KNOB_JIT_CACHE_DIR; - } - - // Create cache dir at startup to allow jitter to write debug.ll files - // to that directory. - if (!llvm::sys::fs::exists(mCacheDir.str()) && - llvm::sys::fs::create_directories(mCacheDir.str())) - { - SWR_INVALID("Unable to create directory: %s", mCacheDir.c_str()); - } - -} - -int ExecUnhookedProcess(const std::string& CmdLine, std::string* pStdOut, std::string* pStdErr) -{ - - return ExecCmd(CmdLine, nullptr, pStdOut, pStdErr); -} - -/// Calculate actual directory where module will be cached. -/// This is always a subdirectory of mCacheDir. 
Full absolute -/// path name will be stored in mCurrentModuleCacheDir -void JitCache::CalcModuleCacheDir() -{ - mModuleCacheDir.clear(); - - llvm::SmallString<MAX_PATH> moduleDir = mCacheDir; - - // Create 4 levels of directory hierarchy based on CRC, 256 entries each - uint8_t* pCRC = (uint8_t*)&mCurrentModuleCRC; - for (uint32_t i = 0; i < 4; ++i) - { - llvm::sys::path::append(moduleDir, std::to_string((int)pCRC[i])); - } - - mModuleCacheDir = moduleDir; -} - -/// notifyObjectCompiled - Provides a pointer to compiled code for Module M. -void JitCache::notifyObjectCompiled(const llvm::Module* M, llvm::MemoryBufferRef Obj) -{ - const std::string& moduleID = M->getModuleIdentifier(); - if (!moduleID.length()) - { - return; - } - - if (!mModuleCacheDir.size()) - { - SWR_INVALID("Unset module cache directory"); - return; - } - - if (!llvm::sys::fs::exists(mModuleCacheDir.str()) && - llvm::sys::fs::create_directories(mModuleCacheDir.str())) - { - SWR_INVALID("Unable to create directory: %s", mModuleCacheDir.c_str()); - return; - } - - JitCacheFileHeader header; - - llvm::SmallString<MAX_PATH> filePath = mModuleCacheDir; - llvm::sys::path::append(filePath, moduleID); - - llvm::SmallString<MAX_PATH> objPath = filePath; - objPath += JIT_OBJ_EXT; - - { - std::error_code err; - llvm::raw_fd_ostream fileObj(objPath.c_str(), err, llvm::sys::fs::F_None); - fileObj << Obj.getBuffer(); - fileObj.flush(); - } - - - { - std::error_code err; - llvm::raw_fd_ostream fileObj(filePath.c_str(), err, llvm::sys::fs::F_None); - - uint32_t objcrc = ComputeCRC(0, Obj.getBufferStart(), Obj.getBufferSize()); - - header.Init(mCurrentModuleCRC, objcrc, moduleID, mCpu, mOptLevel, Obj.getBufferSize()); - - fileObj.write((const char*)&header, sizeof(header)); - fileObj.flush(); - } -} - -/// Returns a pointer to a newly allocated MemoryBuffer that contains the -/// object which corresponds with Module M, or 0 if an object is not -/// available. 
-std::unique_ptr<llvm::MemoryBuffer> JitCache::getObject(const llvm::Module* M) -{ - const std::string& moduleID = M->getModuleIdentifier(); - mCurrentModuleCRC = ComputeModuleCRC(M); - - if (!moduleID.length()) - { - return nullptr; - } - - CalcModuleCacheDir(); - - if (!llvm::sys::fs::exists(mModuleCacheDir)) - { - return nullptr; - } - - llvm::SmallString<MAX_PATH> filePath = mModuleCacheDir; - llvm::sys::path::append(filePath, moduleID); - - llvm::SmallString<MAX_PATH> objFilePath = filePath; - objFilePath += JIT_OBJ_EXT; - - FILE* fpObjIn = nullptr; - FILE* fpIn = fopen(filePath.c_str(), "rb"); - if (!fpIn) - { - return nullptr; - } - - std::unique_ptr<llvm::MemoryBuffer> pBuf = nullptr; - do - { - JitCacheFileHeader header; - if (!fread(&header, sizeof(header), 1, fpIn)) - { - break; - } - - if (!header.IsValid(mCurrentModuleCRC, moduleID, mCpu, mOptLevel)) - { - break; - } - - fpObjIn = fopen(objFilePath.c_str(), "rb"); - if (!fpObjIn) - { - break; - } - -#if LLVM_VERSION_MAJOR < 6 - pBuf = llvm::MemoryBuffer::getNewUninitMemBuffer(size_t(header.GetObjectSize())); -#else - pBuf = llvm::WritableMemoryBuffer::getNewUninitMemBuffer(size_t(header.GetObjectSize())); -#endif - if (!fread(const_cast<char*>(pBuf->getBufferStart()), header.GetObjectSize(), 1, fpObjIn)) - { - pBuf = nullptr; - break; - } - - if (header.GetObjectCRC() != ComputeCRC(0, pBuf->getBufferStart(), pBuf->getBufferSize())) - { - SWR_TRACE("Invalid object cache file, ignoring: %s", filePath.c_str()); - pBuf = nullptr; - break; - } - - } while (0); - - fclose(fpIn); - - if (fpObjIn) - { - fclose(fpObjIn); - } - - - return pBuf; -} - -void InterleaveAssemblyAnnotater::emitInstructionAnnot(const llvm::Instruction* pInst, - llvm::formatted_raw_ostream& OS) -{ - auto dbgLoc = pInst->getDebugLoc(); - if (dbgLoc) - { - unsigned int line = dbgLoc.getLine(); - if (line != mCurrentLineNo) - { - if (line > 0 && line <= mAssembly.size()) - { - // HACK: here we assume that OS is a 
formatted_raw_ostream(ods()) - // and modify the color accordingly. We can't do the color - // modification on OS because formatted_raw_ostream strips - // the color information. The only way to fix this behavior - // is to patch LLVM. - OS << "\n; " << line << ": " << mAssembly[line - 1] << "\n"; - } - mCurrentLineNo = line; - } - } -} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h deleted file mode 100644 index d96d22e1b95..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h +++ /dev/null @@ -1,212 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file JitManager.h - * - * @brief JitManager contains the LLVM data structures used for JIT generation - * - * Notes: - * - ******************************************************************************/ -#pragma once - -#include "jit_pch.hpp" -#include "common/isa.hpp" -#include <llvm/IR/AssemblyAnnotationWriter.h> - - -////////////////////////////////////////////////////////////////////////// -/// JitInstructionSet -/// @brief Subclass of InstructionSet that allows users to override -/// the reporting of support for certain ISA features. This allows capping -/// the jitted code to a certain feature level, e.g. jit AVX level code on -/// a platform that supports AVX2. -////////////////////////////////////////////////////////////////////////// -class JitInstructionSet : public InstructionSet -{ -public: - JitInstructionSet(const char* requestedIsa) : isaRequest(requestedIsa) - { - std::transform(isaRequest.begin(), isaRequest.end(), isaRequest.begin(), ::tolower); - - if (isaRequest == "avx") - { - bForceAVX = true; - bForceAVX2 = false; - bForceAVX512 = false; - } - else if (isaRequest == "avx2") - { - bForceAVX = false; - bForceAVX2 = true; - bForceAVX512 = false; - } - else if (isaRequest == "avx512") - { - bForceAVX = false; - bForceAVX2 = false; - bForceAVX512 = true; - } - }; - - bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); } - bool AVX512F(void) { return (bForceAVX | bForceAVX2) ? 0 : InstructionSet::AVX512F(); } - bool AVX512ER(void) { return (bForceAVX | bForceAVX2) ? 0 : InstructionSet::AVX512ER(); } - bool BMI2(void) { return bForceAVX ? 
0 : InstructionSet::BMI2(); } - -private: - bool bForceAVX = false; - bool bForceAVX2 = false; - bool bForceAVX512 = false; - std::string isaRequest; -}; - -struct JitLLVMContext : llvm::LLVMContext -{ -}; - -////////////////////////////////////////////////////////////////////////// -/// JitCache -////////////////////////////////////////////////////////////////////////// -struct JitManager; // Forward Decl -class JitCache : public llvm::ObjectCache -{ -public: - /// constructor - JitCache(); - virtual ~JitCache() {} - - void Init(JitManager* pJitMgr, const llvm::StringRef& cpu, llvm::CodeGenOpt::Level level) - { - mCpu = cpu.str(); - mpJitMgr = pJitMgr; - mOptLevel = level; - } - - /// notifyObjectCompiled - Provides a pointer to compiled code for Module M. - void notifyObjectCompiled(const llvm::Module* M, llvm::MemoryBufferRef Obj) override; - - /// Returns a pointer to a newly allocated MemoryBuffer that contains the - /// object which corresponds with Module M, or 0 if an object is not - /// available. - std::unique_ptr<llvm::MemoryBuffer> getObject(const llvm::Module* M) override; - - const char* GetModuleCacheDir() { return mModuleCacheDir.c_str(); } - -private: - std::string mCpu; - llvm::SmallString<MAX_PATH> mCacheDir; - llvm::SmallString<MAX_PATH> mModuleCacheDir; - uint32_t mCurrentModuleCRC = 0; - JitManager* mpJitMgr = nullptr; - llvm::CodeGenOpt::Level mOptLevel = llvm::CodeGenOpt::None; - - /// Calculate actual directory where module will be cached. - /// This is always a subdirectory of mCacheDir. 
Full absolute - /// path name will be stored in mCurrentModuleCacheDir - void CalcModuleCacheDir(); -}; - -////////////////////////////////////////////////////////////////////////// -/// JitManager -////////////////////////////////////////////////////////////////////////// -struct JitManager -{ - JitManager(uint32_t w, const char* arch, const char* core); - ~JitManager() - { - for (auto* pExec : mvExecEngines) - { - delete pExec; - } - } - - JitLLVMContext mContext; ///< LLVM compiler - llvm::IRBuilder<> mBuilder; ///< LLVM IR Builder - llvm::ExecutionEngine* mpExec; - std::vector<llvm::ExecutionEngine*> mvExecEngines; - JitCache mCache; - llvm::StringRef mHostCpuName; - llvm::CodeGenOpt::Level mOptLevel; - - // Need to be rebuilt after a JIT and before building new IR - llvm::Module* mpCurrentModule; - bool mIsModuleFinalized; - uint32_t mJitNumber; - - uint32_t mVWidth; - - bool mUsingAVX512 = false; - - // fetch shader types - llvm::FunctionType* mFetchShaderTy; - - JitInstructionSet mArch; - - // Debugging support - std::unordered_map<llvm::StructType*, llvm::DIType*> mDebugStructMap; - - void CreateExecEngine(std::unique_ptr<llvm::Module> M); - void SetupNewModule(); - - void DumpAsm(llvm::Function* pFunction, const char* fileName); - static void DumpToFile(llvm::Function* f, const char* fileName); - static void DumpToFile(llvm::Module* M, - const char* fileName, - llvm::AssemblyAnnotationWriter* annotater = nullptr); - static std::string GetOutputDir(); - - // Debugging support methods - llvm::DIType* GetDebugType(llvm::Type* pTy); - llvm::DIType* GetDebugIntegerType(llvm::Type* pTy); - llvm::DIType* GetDebugArrayType(llvm::Type* pTy); - llvm::DIType* GetDebugVectorType(llvm::Type* pTy); - llvm::DIType* GetDebugFunctionType(llvm::Type* pTy); - - llvm::DIType* GetDebugStructType(llvm::Type* pType) - { - llvm::StructType* pStructTy = llvm::cast<llvm::StructType>(pType); - if (mDebugStructMap.find(pStructTy) == mDebugStructMap.end()) - { - return nullptr; - } - 
return mDebugStructMap[pStructTy]; - } - - llvm::DIType* - CreateDebugStructType(llvm::StructType* pType, - const std::string& name, - llvm::DIFile* pFile, - uint32_t lineNum, - const std::vector<std::pair<std::string, uint32_t>>& members); -}; - -class InterleaveAssemblyAnnotater : public llvm::AssemblyAnnotationWriter -{ -public: - void emitInstructionAnnot(const llvm::Instruction* pInst, - llvm::formatted_raw_ostream& OS) override; - std::vector<std::string> mAssembly; - -private: - uint32_t mCurrentLineNo = 0; -}; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp deleted file mode 100644 index 80959809806..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp +++ /dev/null @@ -1,924 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file blend_jit.cpp - * - * @brief Implementation of the blend jitter - * - * Notes: - * - ******************************************************************************/ -#include "jit_pch.hpp" -#include "builder.h" -#include "jit_api.h" -#include "blend_jit.h" -#include "gen_state_llvm.h" -#include "functionpasses/passes.h" - -#include "util/compiler.h" - -// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized -#define QUANTIZE_THRESHOLD 2 - -using namespace llvm; -using namespace SwrJit; - -////////////////////////////////////////////////////////////////////////// -/// Interface to Jitting a blend shader -////////////////////////////////////////////////////////////////////////// -struct BlendJit : public Builder -{ - BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){}; - - template <bool Color, bool Alpha> - void GenerateBlendFactor(SWR_BLEND_FACTOR factor, - Value* constColor[4], - Value* src[4], - Value* src1[4], - Value* dst[4], - Value* result[4]) - { - Value* out[4]; - - switch (factor) - { - case BLENDFACTOR_ONE: - out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f); - break; - case BLENDFACTOR_SRC_COLOR: - out[0] = src[0]; - out[1] = src[1]; - out[2] = src[2]; - out[3] = src[3]; - break; - case BLENDFACTOR_SRC_ALPHA: - out[0] = out[1] = out[2] = out[3] = src[3]; - break; - case BLENDFACTOR_DST_ALPHA: - out[0] = out[1] = out[2] = out[3] = dst[3]; - break; - case BLENDFACTOR_DST_COLOR: - out[0] = dst[0]; - out[1] = dst[1]; - out[2] = dst[2]; - out[3] = dst[3]; - break; - case BLENDFACTOR_SRC_ALPHA_SATURATE: - out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3])); - out[3] = VIMMED1(1.0f); - break; - case BLENDFACTOR_CONST_COLOR: - out[0] = 
constColor[0]; - out[1] = constColor[1]; - out[2] = constColor[2]; - out[3] = constColor[3]; - break; - case BLENDFACTOR_CONST_ALPHA: - out[0] = out[1] = out[2] = out[3] = constColor[3]; - break; - case BLENDFACTOR_SRC1_COLOR: - out[0] = src1[0]; - out[1] = src1[1]; - out[2] = src1[2]; - out[3] = src1[3]; - break; - case BLENDFACTOR_SRC1_ALPHA: - out[0] = out[1] = out[2] = out[3] = src1[3]; - break; - case BLENDFACTOR_ZERO: - out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); - break; - case BLENDFACTOR_INV_SRC_COLOR: - out[0] = FSUB(VIMMED1(1.0f), src[0]); - out[1] = FSUB(VIMMED1(1.0f), src[1]); - out[2] = FSUB(VIMMED1(1.0f), src[2]); - out[3] = FSUB(VIMMED1(1.0f), src[3]); - break; - case BLENDFACTOR_INV_SRC_ALPHA: - out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]); - break; - case BLENDFACTOR_INV_DST_ALPHA: - out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]); - break; - case BLENDFACTOR_INV_DST_COLOR: - out[0] = FSUB(VIMMED1(1.0f), dst[0]); - out[1] = FSUB(VIMMED1(1.0f), dst[1]); - out[2] = FSUB(VIMMED1(1.0f), dst[2]); - out[3] = FSUB(VIMMED1(1.0f), dst[3]); - break; - case BLENDFACTOR_INV_CONST_COLOR: - out[0] = FSUB(VIMMED1(1.0f), constColor[0]); - out[1] = FSUB(VIMMED1(1.0f), constColor[1]); - out[2] = FSUB(VIMMED1(1.0f), constColor[2]); - out[3] = FSUB(VIMMED1(1.0f), constColor[3]); - break; - case BLENDFACTOR_INV_CONST_ALPHA: - out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]); - break; - case BLENDFACTOR_INV_SRC1_COLOR: - out[0] = FSUB(VIMMED1(1.0f), src1[0]); - out[1] = FSUB(VIMMED1(1.0f), src1[1]); - out[2] = FSUB(VIMMED1(1.0f), src1[2]); - out[3] = FSUB(VIMMED1(1.0f), src1[3]); - break; - case BLENDFACTOR_INV_SRC1_ALPHA: - out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]); - break; - default: - SWR_INVALID("Unsupported blend factor: %d", factor); - out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); - break; - } - - if (Color) - { - result[0] = out[0]; - result[1] = out[1]; - result[2] = 
out[2]; - } - - if (Alpha) - { - result[3] = out[3]; - } - } - - void Clamp(SWR_FORMAT format, Value* src[4]) - { - const SWR_FORMAT_INFO& info = GetFormatInfo(format); - SWR_TYPE type = info.type[0]; - - switch (type) - { - default: - break; - - case SWR_TYPE_UNORM: - src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f)); - src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f)); - src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f)); - src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f)); - break; - - case SWR_TYPE_SNORM: - src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f)); - src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f)); - src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f)); - src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f)); - break; - - case SWR_TYPE_UNKNOWN: - SWR_INVALID("Unsupported format type: %d", type); - } - } - - void ApplyDefaults(SWR_FORMAT format, Value* src[4]) - { - const SWR_FORMAT_INFO& info = GetFormatInfo(format); - - bool valid[] = {false, false, false, false}; - for (uint32_t c = 0; c < info.numComps; ++c) - { - valid[info.swizzle[c]] = true; - } - - for (uint32_t c = 0; c < 4; ++c) - { - if (!valid[c]) - { - src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty); - } - } - } - - void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4]) - { - const SWR_FORMAT_INFO& info = GetFormatInfo(format); - - for (uint32_t c = 0; c < info.numComps; ++c) - { - if (info.type[c] == SWR_TYPE_UNUSED) - { - src[info.swizzle[c]] = - BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty); - } - } - } - - void Quantize(SWR_FORMAT format, Value* src[4]) - { - const SWR_FORMAT_INFO& info = GetFormatInfo(format); - for (uint32_t c = 0; c < info.numComps; ++c) - { - if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED) - { - uint32_t swizComp = info.swizzle[c]; - float factor = (float)((1 << info.bpc[c]) - 1); - switch (info.type[c]) - { - 
case SWR_TYPE_UNORM: - src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f)); - src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO)); - src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor)); - break; - default: - SWR_INVALID("Unsupported format type: %d", info.type[c]); - } - } - } - } - - template <bool Color, bool Alpha> - void BlendFunc(SWR_BLEND_OP blendOp, - Value* src[4], - Value* srcFactor[4], - Value* dst[4], - Value* dstFactor[4], - Value* result[4]) - { - Value* out[4]; - Value* srcBlend[4]; - Value* dstBlend[4]; - for (uint32_t i = 0; i < 4; ++i) - { - srcBlend[i] = FMUL(src[i], srcFactor[i]); - dstBlend[i] = FMUL(dst[i], dstFactor[i]); - } - - switch (blendOp) - { - case BLENDOP_ADD: - out[0] = FADD(srcBlend[0], dstBlend[0]); - out[1] = FADD(srcBlend[1], dstBlend[1]); - out[2] = FADD(srcBlend[2], dstBlend[2]); - out[3] = FADD(srcBlend[3], dstBlend[3]); - break; - - case BLENDOP_SUBTRACT: - out[0] = FSUB(srcBlend[0], dstBlend[0]); - out[1] = FSUB(srcBlend[1], dstBlend[1]); - out[2] = FSUB(srcBlend[2], dstBlend[2]); - out[3] = FSUB(srcBlend[3], dstBlend[3]); - break; - - case BLENDOP_REVSUBTRACT: - out[0] = FSUB(dstBlend[0], srcBlend[0]); - out[1] = FSUB(dstBlend[1], srcBlend[1]); - out[2] = FSUB(dstBlend[2], srcBlend[2]); - out[3] = FSUB(dstBlend[3], srcBlend[3]); - break; - - case BLENDOP_MIN: - out[0] = VMINPS(src[0], dst[0]); - out[1] = VMINPS(src[1], dst[1]); - out[2] = VMINPS(src[2], dst[2]); - out[3] = VMINPS(src[3], dst[3]); - break; - - case BLENDOP_MAX: - out[0] = VMAXPS(src[0], dst[0]); - out[1] = VMAXPS(src[1], dst[1]); - out[2] = VMAXPS(src[2], dst[2]); - out[3] = VMAXPS(src[3], dst[3]); - break; - - default: - SWR_INVALID("Unsupported blend operation: %d", blendOp); - out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); - break; - } - - if (Color) - { - result[0] = out[0]; - result[1] = out[1]; - result[2] = out[2]; - } - - if (Alpha) - { - result[3] = out[3]; - } - } - - void LogicOpFunc(SWR_LOGIC_OP logicOp, 
Value* src[4], Value* dst[4], Value* result[4]) - { - // Op: (s == PS output, d = RT contents) - switch (logicOp) - { - case LOGICOP_CLEAR: - result[0] = VIMMED1(0); - result[1] = VIMMED1(0); - result[2] = VIMMED1(0); - result[3] = VIMMED1(0); - break; - - case LOGICOP_NOR: - // ~(s | d) - result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); - result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); - result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); - result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); - break; - - case LOGICOP_AND_INVERTED: - // ~s & d - // todo: use avx andnot instr when I can find the intrinsic to call - result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]); - result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]); - result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]); - result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]); - break; - - case LOGICOP_COPY_INVERTED: - // ~s - result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF)); - result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF)); - result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF)); - result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF)); - break; - - case LOGICOP_AND_REVERSE: - // s & ~d - // todo: use avx andnot instr when I can find the intrinsic to call - result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]); - result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]); - result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]); - result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]); - break; - - case LOGICOP_INVERT: - // ~d - result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF)); - result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF)); - result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF)); - result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF)); - break; - - case LOGICOP_XOR: - // s ^ d - result[0] = XOR(src[0], dst[0]); - result[1] = XOR(src[1], dst[1]); - result[2] = XOR(src[2], dst[2]); - result[3] = XOR(src[3], dst[3]); - break; - - case LOGICOP_NAND: - // ~(s & d) - result[0] = XOR(AND(src[0], 
dst[0]), VIMMED1(0xFFFFFFFF)); - result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); - result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); - result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); - break; - - case LOGICOP_AND: - // s & d - result[0] = AND(src[0], dst[0]); - result[1] = AND(src[1], dst[1]); - result[2] = AND(src[2], dst[2]); - result[3] = AND(src[3], dst[3]); - break; - - case LOGICOP_EQUIV: - // ~(s ^ d) - result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); - result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); - result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); - result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); - break; - - case LOGICOP_NOOP: - result[0] = dst[0]; - result[1] = dst[1]; - result[2] = dst[2]; - result[3] = dst[3]; - break; - - case LOGICOP_OR_INVERTED: - // ~s | d - result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]); - result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]); - result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]); - result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]); - break; - - case LOGICOP_COPY: - result[0] = src[0]; - result[1] = src[1]; - result[2] = src[2]; - result[3] = src[3]; - break; - - case LOGICOP_OR_REVERSE: - // s | ~d - result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]); - result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]); - result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]); - result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]); - break; - - case LOGICOP_OR: - // s | d - result[0] = OR(src[0], dst[0]); - result[1] = OR(src[1], dst[1]); - result[2] = OR(src[2], dst[2]); - result[3] = OR(src[3], dst[3]); - break; - - case LOGICOP_SET: - result[0] = VIMMED1(0xFFFFFFFF); - result[1] = VIMMED1(0xFFFFFFFF); - result[2] = VIMMED1(0xFFFFFFFF); - result[3] = VIMMED1(0xFFFFFFFF); - break; - - default: - SWR_INVALID("Unsupported logic operation: %d", logicOp); - result[0] = result[1] = result[2] = result[3] = 
VIMMED1(0.0f); - break; - } - } - - void - AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask) - { - // load uint32_t reference - Value* pRef = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_alphaTestReference})); - - // load alpha - Value* pAlpha = LOAD(ppAlpha, {0, 0}); - - Value* pTest = nullptr; - if (state.alphaTestFormat == ALPHA_TEST_UNORM8) - { - // convert float alpha to unorm8 - Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f)); - pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty); - - // compare - switch (state.alphaTestFunction) - { - case ZFUNC_ALWAYS: - pTest = VIMMED1(true); - break; - case ZFUNC_NEVER: - pTest = VIMMED1(false); - break; - case ZFUNC_LT: - pTest = ICMP_ULT(pAlphaU8, pRef); - break; - case ZFUNC_EQ: - pTest = ICMP_EQ(pAlphaU8, pRef); - break; - case ZFUNC_LE: - pTest = ICMP_ULE(pAlphaU8, pRef); - break; - case ZFUNC_GT: - pTest = ICMP_UGT(pAlphaU8, pRef); - break; - case ZFUNC_NE: - pTest = ICMP_NE(pAlphaU8, pRef); - break; - case ZFUNC_GE: - pTest = ICMP_UGE(pAlphaU8, pRef); - break; - default: - SWR_INVALID("Invalid alpha test function"); - break; - } - } - else - { - // cast ref to float - pRef = BITCAST(pRef, mSimdFP32Ty); - - // compare - switch (state.alphaTestFunction) - { - case ZFUNC_ALWAYS: - pTest = VIMMED1(true); - break; - case ZFUNC_NEVER: - pTest = VIMMED1(false); - break; - case ZFUNC_LT: - pTest = FCMP_OLT(pAlpha, pRef); - break; - case ZFUNC_EQ: - pTest = FCMP_OEQ(pAlpha, pRef); - break; - case ZFUNC_LE: - pTest = FCMP_OLE(pAlpha, pRef); - break; - case ZFUNC_GT: - pTest = FCMP_OGT(pAlpha, pRef); - break; - case ZFUNC_NE: - pTest = FCMP_ONE(pAlpha, pRef); - break; - case ZFUNC_GE: - pTest = FCMP_OGE(pAlpha, pRef); - break; - default: - SWR_INVALID("Invalid alpha test function"); - break; - } - } - - // load current mask - Value* pMask = LOAD(ppMask); - - // convert to int1 mask - pMask = MASK(pMask); - - // and with alpha test result - pMask = AND(pMask, pTest); - - // convert back 
to vector mask - pMask = VMASK(pMask); - - // store new mask - STORE(pMask, ppMask); - } - - Function* Create(const BLEND_COMPILE_STATE& state) - { - std::stringstream fnName("BLND_", - std::ios_base::in | std::ios_base::out | std::ios_base::ate); - fnName << ComputeCRC(0, &state, sizeof(state)); - - // blend function signature - // typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_CONTEXT*); - - std::vector<Type*> args{ - PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0) // SWR_BLEND_CONTEXT* - }; - - // std::vector<Type*> args{ - // PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0), // SWR_BLEND_CONTEXT* - //}; - - FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); - Function* blendFunc = Function::Create( - fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); - blendFunc->getParent()->setModuleIdentifier(blendFunc->getName()); - - BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc); - - IRB()->SetInsertPoint(entry); - - // arguments - auto argitr = blendFunc->arg_begin(); - Value* pBlendContext = &*argitr++; - pBlendContext->setName("pBlendContext"); - Value* pBlendState = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pBlendState}); - pBlendState->setName("pBlendState"); - Value* pSrc = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src}); - pSrc->setName("src"); - Value* pSrc1 = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src1}); - pSrc1->setName("src1"); - Value* pSrc0Alpha = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src0alpha}); - pSrc0Alpha->setName("src0alpha"); - Value* sampleNum = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_sampleNum}); - sampleNum->setName("sampleNum"); - Value* pDst = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pDst}); - pDst->setName("pDst"); - Value* pResult = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_result}); - pResult->setName("result"); - Value* ppoMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_oMask}); - ppoMask->setName("ppoMask"); - Value* ppMask = LOAD(pBlendContext, {0, 
SWR_BLEND_CONTEXT_pMask}); - ppMask->setName("pMask"); - - static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, - "Unsupported hot tile format"); - Value* dst[4]; - Value* constantColor[4]; - Value* src[4]; - Value* src1[4]; - Value* result[4]; - for (uint32_t i = 0; i < 4; ++i) - { - // load hot tile - dst[i] = LOAD(pDst, {0, i}); - - // load constant color - constantColor[i] = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_constantColor, i})); - - // load src - src[i] = LOAD(pSrc, {0, i}); - - // load src1 - src1[i] = LOAD(pSrc1, {0, i}); - } - Value* currentSampleMask = VIMMED1(-1); - if (state.desc.alphaToCoverageEnable) - { - Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f); - uint32_t bits = (1 << state.desc.numSamples) - 1; - currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits))); - currentSampleMask = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty); - } - - // alpha test - if (state.desc.alphaTestEnable) - { - // Gather for archrast stats - STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested}); - AlphaTest(state, pBlendState, pSrc0Alpha, ppMask); - } - else - { - // Gather for archrast stats - STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested}); - } - - // color blend - if (state.blendState.blendEnable) - { - // Gather for archrast stats - STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended}); - - // clamp sources - Clamp(state.format, src); - Clamp(state.format, src1); - Clamp(state.format, dst); - Clamp(state.format, constantColor); - - // apply defaults to hottile contents to take into account missing components - ApplyDefaults(state.format, dst); - - // Force defaults for unused 'X' components - ApplyUnusedDefaults(state.format, dst); - - // Quantize low precision components - Quantize(state.format, dst); - - // special case clamping for R11G11B10_float which has no sign bit - if (state.format == R11G11B10_FLOAT) - { - dst[0] = VMAXPS(dst[0], VIMMED1(0.0f)); - dst[1] = 
VMAXPS(dst[1], VIMMED1(0.0f)); - dst[2] = VMAXPS(dst[2], VIMMED1(0.0f)); - dst[3] = VMAXPS(dst[3], VIMMED1(0.0f)); - } - - Value* srcFactor[4]; - Value* dstFactor[4]; - if (state.desc.independentAlphaBlendEnable) - { - GenerateBlendFactor<true, false>( - state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); - GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, - constantColor, - src, - src1, - dst, - srcFactor); - - GenerateBlendFactor<true, false>( - state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); - GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, - constantColor, - src, - src1, - dst, - dstFactor); - - BlendFunc<true, false>( - state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); - BlendFunc<false, true>( - state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result); - } - else - { - GenerateBlendFactor<true, true>( - state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); - GenerateBlendFactor<true, true>( - state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); - - BlendFunc<true, true>( - state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); - } - - // store results out - for (uint32_t i = 0; i < 4; ++i) - { - STORE(result[i], pResult, {0, i}); - } - } - else - { - // Gather for archrast stats - STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended}); - } - - if (state.blendState.logicOpEnable) - { - const SWR_FORMAT_INFO& info = GetFormatInfo(state.format); - Value* vMask[4]; - float scale[4]; - - if (!state.blendState.blendEnable) - { - Clamp(state.format, src); - Clamp(state.format, dst); - } - - for (uint32_t i = 0; i < 4; i++) - { - if (info.type[i] == SWR_TYPE_UNUSED) - { - continue; - } - - if (info.bpc[i] >= 32) - { - vMask[i] = VIMMED1(0xFFFFFFFF); - scale[i] = 0xFFFFFFFF; - } - else - { - vMask[i] = VIMMED1((1 << info.bpc[i]) - 1); - if 
(info.type[i] == SWR_TYPE_SNORM) - scale[i] = (1 << (info.bpc[i] - 1)) - 1; - else - scale[i] = (1 << info.bpc[i]) - 1; - } - - switch (info.type[i]) - { - default: - SWR_INVALID("Unsupported type for logic op: %d", info.type[i]); - break; - - case SWR_TYPE_UNKNOWN: - case SWR_TYPE_UNUSED: - FALLTHROUGH; - - case SWR_TYPE_UINT: - case SWR_TYPE_SINT: - src[i] = BITCAST(src[i], mSimdInt32Ty); - dst[i] = BITCAST(dst[i], mSimdInt32Ty); - break; - case SWR_TYPE_SNORM: - src[i] = FP_TO_SI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty); - dst[i] = FP_TO_SI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty); - break; - case SWR_TYPE_UNORM: - src[i] = FP_TO_UI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty); - dst[i] = FP_TO_UI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty); - break; - } - } - - LogicOpFunc(state.blendState.logicOpFunc, src, dst, result); - - // store results out - for (uint32_t i = 0; i < 4; ++i) - { - if (info.type[i] == SWR_TYPE_UNUSED) - { - continue; - } - - // clear upper bits from PS output not in RT format after doing logic op - result[i] = AND(result[i], vMask[i]); - - switch (info.type[i]) - { - default: - SWR_INVALID("Unsupported type for logic op: %d", info.type[i]); - break; - - case SWR_TYPE_UNKNOWN: - case SWR_TYPE_UNUSED: - FALLTHROUGH; - - case SWR_TYPE_UINT: - case SWR_TYPE_SINT: - result[i] = BITCAST(result[i], mSimdFP32Ty); - break; - case SWR_TYPE_SNORM: - result[i] = SHL(result[i], C(32 - info.bpc[i])); - result[i] = ASHR(result[i], C(32 - info.bpc[i])); - result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i])); - break; - case SWR_TYPE_UNORM: - result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i])); - break; - } - - STORE(result[i], pResult, {0, i}); - } - } - - if (state.desc.oMaskEnable) - { - assert(!(state.desc.alphaToCoverageEnable)); - // load current mask - Value* oMask = LOAD(ppoMask); - currentSampleMask = AND(oMask, currentSampleMask); - } - - if (state.desc.sampleMaskEnable) - { - 
Value* sampleMask = LOAD(pBlendState, {0, SWR_BLEND_STATE_sampleMask}); - currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask); - } - - if (state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable || - state.desc.oMaskEnable) - { - // load coverage mask and mask off any lanes with no samples - Value* pMask = LOAD(ppMask); - Value* sampleMasked = SHL(C(1), sampleNum); - currentSampleMask = AND(currentSampleMask, VBROADCAST(sampleMasked)); - currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty); - Value* outputMask = AND(pMask, currentSampleMask); - // store new mask - STORE(outputMask, GEP(ppMask, C(0))); - } - - RET_VOID(); - - JitManager::DumpToFile(blendFunc, ""); - - ::FunctionPassManager passes(JM()->mpCurrentModule); - - passes.add(createBreakCriticalEdgesPass()); - passes.add(createCFGSimplificationPass()); - passes.add(createEarlyCSEPass()); - passes.add(createPromoteMemoryToRegisterPass()); - passes.add(createCFGSimplificationPass()); - passes.add(createEarlyCSEPass()); - passes.add(createInstructionCombiningPass()); -#if LLVM_VERSION_MAJOR <= 11 - passes.add(createConstantPropagationPass()); -#endif - passes.add(createSCCPPass()); - passes.add(createAggressiveDCEPass()); - - passes.add(createLowerX86Pass(this)); - - passes.run(*blendFunc); - - JitManager::DumpToFile(blendFunc, "optimized"); - - return blendFunc; - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief JITs from fetch shader IR -/// @param hJitMgr - JitManager handle -/// @param func - LLVM function IR -/// @return PFN_FETCH_FUNC - pointer to fetch code -PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc) -{ - const llvm::Function* func = (const llvm::Function*)hFunc; - JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); - PFN_BLEND_JIT_FUNC pfnBlend; - pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); - // MCJIT finalizes 
modules the first time you JIT code from them. After finalized, you cannot - // add new IR to the module - pJitMgr->mIsModuleFinalized = true; - - return pfnBlend; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief JIT compiles blend shader -/// @param hJitMgr - JitManager handle -/// @param state - blend state to build function from -extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, - const BLEND_COMPILE_STATE& state) -{ - JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); - - pJitMgr->SetupNewModule(); - - BlendJit theJit(pJitMgr); - HANDLE hFunc = theJit.Create(state); - - return JitBlendFunc(hJitMgr, hFunc); -} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h deleted file mode 100644 index 3e78054eced..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h +++ /dev/null @@ -1,129 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file blend_jit.h - * - * @brief Definition of the blend jitter - * - * Notes: - * - ******************************************************************************/ -#pragma once - -#include "common/formats.h" -#include "core/state.h" - -struct RENDER_TARGET_BLEND_COMPILE_STATE -{ - bool blendEnable; - bool logicOpEnable; - SWR_BLEND_FACTOR sourceAlphaBlendFactor; - SWR_BLEND_FACTOR destAlphaBlendFactor; - SWR_BLEND_FACTOR sourceBlendFactor; - SWR_BLEND_FACTOR destBlendFactor; - SWR_BLEND_OP colorBlendFunc; - SWR_BLEND_OP alphaBlendFunc; - SWR_LOGIC_OP logicOpFunc; -}; - -enum ALPHA_TEST_FORMAT -{ - ALPHA_TEST_UNORM8, - ALPHA_TEST_FLOAT32 -}; - -////////////////////////////////////////////////////////////////////////// -/// BLEND_DESC -////////////////////////////////////////////////////////////////////////// -struct BLEND_DESC -{ - union - { - struct - { - uint32_t alphaTestEnable : 1; - uint32_t independentAlphaBlendEnable : 1; - uint32_t alphaToCoverageEnable : 1; - uint32_t oMaskEnable : 1; - uint32_t inputCoverageEnable : 1; - uint32_t sampleMaskEnable : 1; - uint32_t numSamples : 5; - uint32_t _reserved : 21; - }; - uint32_t bits; - }; -}; -#define BLEND_ENABLE_MASK 0x3D // a2c | oMaskEnable | inputCoverageEnable | sampleMaskEnable -////////////////////////////////////////////////////////////////////////// -/// State required for blend jit -////////////////////////////////////////////////////////////////////////// -struct BLEND_COMPILE_STATE -{ - SWR_FORMAT format; // format of render target being blended - RENDER_TARGET_BLEND_COMPILE_STATE blendState; - BLEND_DESC desc; - - SWR_ZFUNCTION alphaTestFunction; - ALPHA_TEST_FORMAT alphaTestFormat; - - bool operator==(const 
BLEND_COMPILE_STATE& other) const - { - return memcmp(this, &other, sizeof(BLEND_COMPILE_STATE)) == 0; - } - - // Canonicalize state to reduce unnecessary JIT compiles - void Canonicalize() - { - if (!desc.alphaTestEnable) - { - alphaTestFormat = (ALPHA_TEST_FORMAT)0; - alphaTestFunction = (SWR_ZFUNCTION)0; - } - - if (!blendState.blendEnable) - { - blendState.sourceAlphaBlendFactor = (SWR_BLEND_FACTOR)0; - blendState.destAlphaBlendFactor = (SWR_BLEND_FACTOR)0; - blendState.sourceBlendFactor = (SWR_BLEND_FACTOR)0; - blendState.destBlendFactor = (SWR_BLEND_FACTOR)0; - blendState.colorBlendFunc = (SWR_BLEND_OP)0; - blendState.alphaBlendFunc = (SWR_BLEND_OP)0; - } - - if (!blendState.logicOpEnable) - { - blendState.logicOpFunc = (SWR_LOGIC_OP)0; - } - - if (!blendState.blendEnable && !blendState.logicOpEnable) - { - format = (SWR_FORMAT)0; - } - - if (!desc.independentAlphaBlendEnable) - { - blendState.sourceAlphaBlendFactor = (SWR_BLEND_FACTOR)0; - blendState.destAlphaBlendFactor = (SWR_BLEND_FACTOR)0; - blendState.alphaBlendFunc = (SWR_BLEND_OP)0; - } - } -}; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp deleted file mode 100644 index cd4b5f31ea3..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ /dev/null @@ -1,219 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file builder.h - * - * @brief Includes all the builder related functionality - * - * Notes: - * - ******************************************************************************/ - -#include "jit_pch.hpp" -#include "builder.h" - -namespace SwrJit -{ - using namespace llvm; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Contructor for Builder. - /// @param pJitMgr - JitManager which contains modules, function passes, etc. 
- Builder::Builder(JitManager* pJitMgr) : mpJitMgr(pJitMgr), mpPrivateContext(nullptr) - { - mVWidth = pJitMgr->mVWidth; - mVWidth16 = 16; - - mpIRBuilder = &pJitMgr->mBuilder; - - // Built in types: scalar - - mVoidTy = Type::getVoidTy(pJitMgr->mContext); - mFP16Ty = Type::getHalfTy(pJitMgr->mContext); - mFP32Ty = Type::getFloatTy(pJitMgr->mContext); - mFP32PtrTy = PointerType::get(mFP32Ty, 0); - mDoubleTy = Type::getDoubleTy(pJitMgr->mContext); - mInt1Ty = Type::getInt1Ty(pJitMgr->mContext); - mInt8Ty = Type::getInt8Ty(pJitMgr->mContext); - mInt16Ty = Type::getInt16Ty(pJitMgr->mContext); - mInt32Ty = Type::getInt32Ty(pJitMgr->mContext); - mInt64Ty = Type::getInt64Ty(pJitMgr->mContext); - mInt8PtrTy = PointerType::get(mInt8Ty, 0); - mInt16PtrTy = PointerType::get(mInt16Ty, 0); - mInt32PtrTy = PointerType::get(mInt32Ty, 0); - mInt64PtrTy = PointerType::get(mInt64Ty, 0); - mHandleTy = mInt8PtrTy; - - mSimd4FP64Ty = getVectorType(mDoubleTy, 4); - - // Built in types: target simd - SetTargetWidth(pJitMgr->mVWidth); - - // Built in types: simd16 - - mSimd16Int1Ty = getVectorType(mInt1Ty, mVWidth16); - mSimd16Int16Ty = getVectorType(mInt16Ty, mVWidth16); - mSimd16Int32Ty = getVectorType(mInt32Ty, mVWidth16); - mSimd16Int64Ty = getVectorType(mInt64Ty, mVWidth16); - mSimd16FP16Ty = getVectorType(mFP16Ty, mVWidth16); - mSimd16FP32Ty = getVectorType(mFP32Ty, mVWidth16); - mSimd16VectorTy = ArrayType::get(mSimd16FP32Ty, 4); - mSimd16VectorTRTy = ArrayType::get(mSimd16FP32Ty, 5); - - mSimd32Int8Ty = getVectorType(mInt8Ty, 32); - - if (sizeof(uint32_t*) == 4) - { - mIntPtrTy = mInt32Ty; - mSimdIntPtrTy = mSimdInt32Ty; - mSimd16IntPtrTy = mSimd16Int32Ty; - } - else - { - SWR_ASSERT(sizeof(uint32_t*) == 8); - - mIntPtrTy = mInt64Ty; - mSimdIntPtrTy = mSimdInt64Ty; - mSimd16IntPtrTy = mSimd16Int64Ty; - } - } - - void Builder::SetTargetWidth(uint32_t width) - { - mVWidth = width; - - mSimdInt1Ty = getVectorType(mInt1Ty, mVWidth); - mSimdInt16Ty = getVectorType(mInt16Ty, mVWidth); 
- mSimdInt32Ty = getVectorType(mInt32Ty, mVWidth); - mSimdInt64Ty = getVectorType(mInt64Ty, mVWidth); - mSimdFP16Ty = getVectorType(mFP16Ty, mVWidth); - mSimdFP32Ty = getVectorType(mFP32Ty, mVWidth); - mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4); - mSimdVectorIntTy = ArrayType::get(mSimdInt32Ty, 4); - mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5); - mSimdVectorTRIntTy = ArrayType::get(mSimdInt32Ty, 5); - } - - /// @brief Mark this alloca as temporary to avoid hoisting later on - void Builder::SetTempAlloca(Value* inst) - { - AllocaInst* pAlloca = dyn_cast<AllocaInst>(inst); - SWR_ASSERT(pAlloca, "Unexpected non-alloca instruction"); - MDNode* N = MDNode::get(JM()->mContext, MDString::get(JM()->mContext, "is_temp_alloca")); - pAlloca->setMetadata("is_temp_alloca", N); - } - - bool Builder::IsTempAlloca(Value* inst) - { - AllocaInst* pAlloca = dyn_cast<AllocaInst>(inst); - SWR_ASSERT(pAlloca, "Unexpected non-alloca instruction"); - - return (pAlloca->getMetadata("is_temp_alloca") != nullptr); - } - - // Returns true if able to find a call instruction to mark - bool Builder::SetNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName) - { - CallInst* pCallInstr = dyn_cast<CallInst>(inst); - if (pCallInstr) - { - MDNode* N = MDNode::get(JM()->mContext, MDString::get(JM()->mContext, mdName)); - pCallInstr->setMetadata(mdName, N); - return true; - } - else - { - // Follow use def chain back up - for (Use& u : inst->operands()) - { - Instruction* srcInst = dyn_cast<Instruction>(u.get()); - if (srcInst) - { - if (SetNamedMetaDataOnCallInstr(srcInst, mdName)) - { - return true; - } - } - } - } - - return false; - } - - bool Builder::HasNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName) - { - CallInst* pCallInstr = dyn_cast<CallInst>(inst); - - if (!pCallInstr) - { - return false; - } - - return (pCallInstr->getMetadata(mdName) != nullptr); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Packetizes the 
type. Assumes SOA conversion. - Type* Builder::GetVectorType(Type* pType) - { - if (pType->isVectorTy()) - { - return pType; - } - - // [N x float] should packetize to [N x <8 x float>] - if (pType->isArrayTy()) - { - uint32_t arraySize = pType->getArrayNumElements(); - Type* pArrayType = pType->getArrayElementType(); - Type* pVecArrayType = GetVectorType(pArrayType); - Type* pVecType = ArrayType::get(pVecArrayType, arraySize); - return pVecType; - } - - // {float,int} should packetize to {<8 x float>, <8 x int>} - if (pType->isAggregateType()) - { - uint32_t numElems = pType->getStructNumElements(); - SmallVector<Type*, 8> vecTypes; - for (uint32_t i = 0; i < numElems; ++i) - { - Type* pElemType = pType->getStructElementType(i); - Type* pVecElemType = GetVectorType(pElemType); - vecTypes.push_back(pVecElemType); - } - Type* pVecType = StructType::get(JM()->mContext, vecTypes); - return pVecType; - } - - // [N x float]* should packetize to [N x <8 x float>]* - if (pType->isPointerTy() && pType->getPointerElementType()->isArrayTy()) - { - return PointerType::get(GetVectorType(pType->getPointerElementType()), - pType->getPointerAddressSpace()); - } - - // <ty> should packetize to <8 x <ty>> - Type* vecType = getVectorType(pType, JM()->mVWidth); - return vecType; - } -} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h deleted file mode 100644 index 9f2c199464d..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ /dev/null @@ -1,181 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file builder.h - * - * @brief Includes all the builder related functionality - * - * Notes: - * - ******************************************************************************/ -#pragma once - -#include "JitManager.h" -#include "common/formats.h" - -namespace SwrJit -{ - ///@todo Move this to better place - enum SHADER_STATS_COUNTER_TYPE - { - STATS_INST_EXECUTED = 0, - STATS_SAMPLE_EXECUTED = 1, - STATS_SAMPLE_L_EXECUTED = 2, - STATS_SAMPLE_B_EXECUTED = 3, - STATS_SAMPLE_C_EXECUTED = 4, - STATS_SAMPLE_C_LZ_EXECUTED = 5, - STATS_SAMPLE_C_D_EXECUTED = 6, - STATS_LOD_EXECUTED = 7, - STATS_GATHER4_EXECUTED = 8, - STATS_GATHER4_C_EXECUTED = 9, - STATS_GATHER4_C_PO_EXECUTED = 10, - STATS_GATHER4_C_PO_C_EXECUTED = 11, - STATS_LOAD_RAW_UAV = 12, - STATS_LOAD_RAW_RESOURCE = 13, - STATS_STORE_RAW_UAV = 14, - STATS_STORE_TGSM = 15, - STATS_DISCARD = 16, - STATS_BARRIER = 17, - - // ------------------ - STATS_TOTAL_COUNTERS - }; - - using namespace llvm; - struct Builder - { - Builder(JitManager* pJitMgr); - virtual ~Builder() {} - - IRBuilder<>* IRB() { return mpIRBuilder; }; - JitManager* JM() { return mpJitMgr; } - - JitManager* mpJitMgr; - IRBuilder<>* mpIRBuilder; - - uint32_t mVWidth; // vector width target simd - uint32_t mVWidth16; // vector width simd16 - - // Built in types: scalar - - Type* mVoidTy; - Type* mHandleTy; - Type* mInt1Ty; - Type* mInt8Ty; - Type* mInt16Ty; - Type* mInt32Ty; - Type* mInt64Ty; - Type* mIntPtrTy; - Type* mFP16Ty; - Type* mFP32Ty; - Type* mFP32PtrTy; - Type* mDoubleTy; - Type* mInt8PtrTy; - Type* mInt16PtrTy; - Type* mInt32PtrTy; - Type* mInt64PtrTy; - - Type* mSimd4FP64Ty; - - // Built in types: target SIMD - - Type* mSimdFP16Ty; - Type* mSimdFP32Ty; - Type* mSimdInt1Ty; - Type* mSimdInt16Ty; - Type* mSimdInt32Ty; - Type* mSimdInt64Ty; - Type* mSimdIntPtrTy; - Type* mSimdVectorTy; - Type* mSimdVectorTRTy; - Type* mSimdVectorIntTy; - Type* mSimdVectorTRIntTy; - - // Built in types: simd16 - - Type* mSimd16FP16Ty; - Type* 
mSimd16FP32Ty; - Type* mSimd16Int1Ty; - Type* mSimd16Int16Ty; - Type* mSimd16Int32Ty; - Type* mSimd16Int64Ty; - Type* mSimd16IntPtrTy; - Type* mSimd16VectorTy; - Type* mSimd16VectorTRTy; - - Type* mSimd32Int8Ty; - - void SetTargetWidth(uint32_t width); - void SetTempAlloca(Value* inst); - bool IsTempAlloca(Value* inst); - bool SetNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName); - bool HasNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName); - Type* GetVectorType(Type* pType); - void SetMetadata(StringRef s, uint32_t val) - { - llvm::NamedMDNode* metaData = mpJitMgr->mpCurrentModule->getOrInsertNamedMetadata(s); - Constant* cval = mpIRBuilder->getInt32(val); - llvm::MDNode* mdNode = llvm::MDNode::get(mpJitMgr->mpCurrentModule->getContext(), - llvm::ConstantAsMetadata::get(cval)); - if (metaData->getNumOperands()) - { - metaData->setOperand(0, mdNode); - } - else - { - metaData->addOperand(mdNode); - } - } - uint32_t GetMetadata(StringRef s) - { - NamedMDNode* metaData = mpJitMgr->mpCurrentModule->getNamedMetadata(s); - if (metaData) - { - MDNode* mdNode = metaData->getOperand(0); - Metadata* val = mdNode->getOperand(0); - return mdconst::dyn_extract<ConstantInt>(val)->getZExtValue(); - } - else - { - return 0; - } - } - -#include "gen_builder.hpp" -#include "gen_builder_meta.hpp" -#include "gen_builder_intrin.hpp" -#include "builder_misc.h" -#include "builder_math.h" -#include "builder_mem.h" - - void SetPrivateContext(Value* pPrivateContext) - { - mpPrivateContext = pPrivateContext; - NotifyPrivateContextSet(); - } - virtual void NotifyPrivateContextSet() {} - inline Value* GetPrivateContext() { return mpPrivateContext; } - - private: - Value* mpPrivateContext; - }; -} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp deleted file mode 100644 index b67ffbfa7aa..00000000000 --- 
a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp +++ /dev/null @@ -1,396 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file builder_gfx_mem.cpp - * - * @brief Definition of the gfx mem builder - * - * Notes: - * - ******************************************************************************/ -#include "jit_pch.hpp" -#include "builder.h" -#include "common/rdtsc_buckets.h" -#include "builder_gfx_mem.h" - -namespace SwrJit -{ - using namespace llvm; - - BuilderGfxMem::BuilderGfxMem(JitManager* pJitMgr) : Builder(pJitMgr) - { - mpTranslationFuncTy = nullptr; - mpfnTranslateGfxAddressForRead = nullptr; - mpfnTranslateGfxAddressForWrite = nullptr; - mpfnTrackMemAccess = nullptr; - mpParamSimDC = nullptr; - mpWorkerData = nullptr; - - } - - void BuilderGfxMem::NotifyPrivateContextSet() - { - } - - void BuilderGfxMem::AssertGFXMemoryParams(Value* ptr, MEM_CLIENT usage) - { - SWR_ASSERT(!(ptr->getType() == mInt64Ty && usage == MEM_CLIENT::MEM_CLIENT_INTERNAL), - "Internal memory should not be gfxptr_t."); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a masked gather operation in LLVM IR. If not - /// supported on the underlying platform, emulate it with loads - /// @param vSrc - SIMD wide value that will be loaded if mask is invalid - /// @param pBase - Int8* base VB address pointer value - /// @param vIndices - SIMD wide value of VB byte offsets - /// @param vMask - SIMD wide mask that controls whether to access memory or the src values - /// @param scale - value to scale indices by - Value* BuilderGfxMem::GATHERPS(Value* vSrc, - Value* pBase, - Value* vIndices, - Value* vMask, - uint8_t scale, - MEM_CLIENT usage) - { - // address may be coming in as 64bit int now so get the pointer - if (pBase->getType() == mInt64Ty) - { - pBase = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0)); - } - - Value* vGather = Builder::GATHERPS(vSrc, pBase, vIndices, vMask, scale); - return vGather; - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a masked gather operation in LLVM IR. 
If not - /// supported on the underlying platform, emulate it with loads - /// @param vSrc - SIMD wide value that will be loaded if mask is invalid - /// @param pBase - Int8* base VB address pointer value - /// @param vIndices - SIMD wide value of VB byte offsets - /// @param vMask - SIMD wide mask that controls whether to access memory or the src values - /// @param scale - value to scale indices by - Value* BuilderGfxMem::GATHERDD(Value* vSrc, - Value* pBase, - Value* vIndices, - Value* vMask, - uint8_t scale, - MEM_CLIENT usage) - { - - // address may be coming in as 64bit int now so get the pointer - if (pBase->getType() == mInt64Ty) - { - pBase = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0)); - } - - Value* vGather = Builder::GATHERDD(vSrc, pBase, vIndices, vMask, scale); - return vGather; - } - - void BuilderGfxMem::SCATTERPS( - Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, MEM_CLIENT usage) - { - - // address may be coming in as 64bit int now so get the pointer - if (pDst->getType() == mInt64Ty) - { - pDst = INT_TO_PTR(pDst, PointerType::get(mInt8Ty, 0)); - } - - Builder::SCATTERPS(pDst, BITCAST(vSrc, mSimdFP32Ty), vOffsets, vMask, usage); - } - - Value* BuilderGfxMem::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset) - { - return ADD(base, offset); - } - - Value* BuilderGfxMem::GEP(Value* Ptr, Value* Idx, Type* Ty, bool isReadOnly, const Twine& Name) - { - bool xlate = (Ptr->getType() == mInt64Ty); - if (xlate) - { - Ptr = INT_TO_PTR(Ptr, Ty); - Ptr = Builder::GEP(Ptr, Idx, nullptr, isReadOnly, Name); - Ptr = PTR_TO_INT(Ptr, mInt64Ty); - if (isReadOnly) - { - Ptr = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead); - } - else - { - Ptr = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForWrite); - } - } - else - { - Ptr = Builder::GEP(Ptr, Idx, nullptr, isReadOnly, Name); - } - return Ptr; - } - - Value* BuilderGfxMem::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name) - { - bool xlate = (Ptr->getType() == mInt64Ty); - if 
(xlate) - { - Ptr = INT_TO_PTR(Ptr, Ty); - Ptr = Builder::GEP(Ty, Ptr, Idx, Name); - Ptr = PTR_TO_INT(Ptr, mInt64Ty); - Ptr = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead); - } - else - { - Ptr = Builder::GEP(Ty, Ptr, Idx, Name); - } - return Ptr; - } - - Value* BuilderGfxMem::GEP(Value* Ptr, const std::initializer_list<Value*>& indexList, Type* Ty) - { - bool xlate = (Ptr->getType() == mInt64Ty); - if (xlate) - { - Ptr = INT_TO_PTR(Ptr, Ty); - Ptr = Builder::GEP(Ptr, indexList); - Ptr = PTR_TO_INT(Ptr, mInt64Ty); - Ptr = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead); - } - else - { - Ptr = Builder::GEP(Ptr, indexList); - } - return Ptr; - } - - Value* - BuilderGfxMem::GEP(Value* Ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty) - { - bool xlate = (Ptr->getType() == mInt64Ty); - if (xlate) - { - Ptr = INT_TO_PTR(Ptr, Ty); - Ptr = Builder::GEP(Ptr, indexList); - Ptr = PTR_TO_INT(Ptr, mInt64Ty); - Ptr = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead); - } - else - { - Ptr = Builder::GEP(Ptr, indexList); - } - return Ptr; - } - - Value* BuilderGfxMem::TranslationHelper(Value* Ptr, Type* Ty, Value* pfnTranslateGfxAddress) - { - SWR_ASSERT(!(Ptr->getType() == mInt64Ty && Ty == nullptr), - "Access of GFX pointers must have non-null type specified."); - - // address may be coming in as 64bit int now so get the pointer - if (Ptr->getType() == mInt64Ty) - { - Ptr = INT_TO_PTR(Ptr, Ty); - } - - return Ptr; - } - - void BuilderGfxMem::TrackerHelper(Value* Ptr, Type* Ty, MEM_CLIENT usage, bool isRead) - { -#if defined(KNOB_ENABLE_AR) - if (!KNOB_AR_ENABLE_MEMORY_EVENTS) - { - return; - } - - Value* tmpPtr; - // convert actual pointers to int64. 
- uint32_t size = 0; - - if (Ptr->getType() == mInt64Ty) - { - DataLayout dataLayout(JM()->mpCurrentModule); - size = (uint32_t)dataLayout.getTypeAllocSize(Ty); - - tmpPtr = Ptr; - } - else - { - DataLayout dataLayout(JM()->mpCurrentModule); - size = (uint32_t)dataLayout.getTypeAllocSize(Ptr->getType()); - - tmpPtr = PTR_TO_INT(Ptr, mInt64Ty); - } - - // There are some shader compile setups where there's no translation functions set up. - // This would be a situation where the accesses are to internal rasterizer memory and won't - // be logged. - // TODO: we may wish to revisit this for URB reads/writes, though. - if (mpfnTrackMemAccess) - { - SWR_ASSERT(mpWorkerData != nullptr); - CALL(mpfnTrackMemAccess, - {mpParamSimDC, - mpWorkerData, - tmpPtr, - C((uint32_t)size), - C((uint8_t)isRead), - C((uint32_t)usage)}); - } -#endif - - return; - } - - LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const char* Name, Type* Ty, MEM_CLIENT usage) - { - AssertGFXMemoryParams(Ptr, usage); - TrackerHelper(Ptr, Ty, usage, true); - - Ptr = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead); - return Builder::LOAD(Ptr, Name); - } - - LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const Twine& Name, Type* Ty, MEM_CLIENT usage) - { - AssertGFXMemoryParams(Ptr, usage); - TrackerHelper(Ptr, Ty, usage, true); - - Ptr = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead); - return Builder::LOAD(Ptr, Name); - } - - LoadInst* BuilderGfxMem::LOAD( - Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, MEM_CLIENT usage) - { - AssertGFXMemoryParams(Ptr, usage); - TrackerHelper(Ptr, Ty, usage, true); - - Ptr = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead); - return Builder::LOAD(Ptr, isVolatile, Name); - } - - LoadInst* BuilderGfxMem::LOAD(Value* BasePtr, - const std::initializer_list<uint32_t>& offset, - const llvm::Twine& name, - Type* Ty, - MEM_CLIENT usage) - { - AssertGFXMemoryParams(BasePtr, usage); - - bool bNeedTranslation = false; - if (BasePtr->getType() == 
mInt64Ty) - { - SWR_ASSERT(Ty); - BasePtr = INT_TO_PTR(BasePtr, Ty, name); - bNeedTranslation = true; - } - std::vector<Value*> valIndices; - for (auto i : offset) - { - valIndices.push_back(C(i)); - } - BasePtr = Builder::GEPA(BasePtr, valIndices, name); - if (bNeedTranslation) - { - BasePtr = PTR_TO_INT(BasePtr, mInt64Ty, name); - } - - return LOAD(BasePtr, name, Ty, usage); - } - - CallInst* BuilderGfxMem::MASKED_LOAD(Value* Ptr, - unsigned Align, - Value* Mask, - Value* PassThru, - const Twine& Name, - Type* Ty, - MEM_CLIENT usage) - { - AssertGFXMemoryParams(Ptr, usage); - TrackerHelper(Ptr, Ty, usage, true); - - Ptr = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead); - return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, usage); - } - - StoreInst* - BuilderGfxMem::STORE(Value* Val, Value* Ptr, bool isVolatile, Type* Ty, MEM_CLIENT usage) - { - AssertGFXMemoryParams(Ptr, usage); - TrackerHelper(Ptr, Ty, usage, false); - - Ptr = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead); - return Builder::STORE(Val, Ptr, isVolatile, Ty, usage); - } - - StoreInst* BuilderGfxMem::STORE(Value* Val, - Value* BasePtr, - const std::initializer_list<uint32_t>& offset, - Type* Ty, - MEM_CLIENT usage) - { - AssertGFXMemoryParams(BasePtr, usage); - TrackerHelper(BasePtr, Ty, usage, false); - - BasePtr = TranslationHelper(BasePtr, Ty, mpfnTranslateGfxAddressForRead); - return Builder::STORE(Val, BasePtr, offset, Ty, usage); - } - - CallInst* BuilderGfxMem::MASKED_STORE( - Value* Val, Value* Ptr, unsigned Align, Value* Mask, Type* Ty, MEM_CLIENT usage) - { - AssertGFXMemoryParams(Ptr, usage); - - TrackerHelper(Ptr, Ty, usage, false); - - Ptr = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead); - return Builder::MASKED_STORE(Val, Ptr, Align, Mask, Ty, usage); - } - - Value* BuilderGfxMem::TranslateGfxAddressForRead(Value* xpGfxAddress, - Type* PtrTy, - const Twine& Name, - MEM_CLIENT /* usage */) - { - if (PtrTy == nullptr) - { - PtrTy = 
mInt8PtrTy; - } - return INT_TO_PTR(xpGfxAddress, PtrTy, Name); - } - - Value* BuilderGfxMem::TranslateGfxAddressForWrite(Value* xpGfxAddress, - Type* PtrTy, - const Twine& Name, - MEM_CLIENT /* usage */) - { - if (PtrTy == nullptr) - { - PtrTy = mInt8PtrTy; - } - return INT_TO_PTR(xpGfxAddress, PtrTy, Name); - } - -} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h deleted file mode 100644 index c361959b76f..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h +++ /dev/null @@ -1,136 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file builder_gfx_mem.h - * - * @brief Definition of the builder to support different translation types for gfx memory access - * - * Notes: - * - ******************************************************************************/ -#pragma once - -#include "builder.h" - -namespace SwrJit -{ - using namespace llvm; - - class BuilderGfxMem : public Builder - { - public: - BuilderGfxMem(JitManager* pJitMgr); - virtual ~BuilderGfxMem() {} - - virtual Value* GEP(Value* Ptr, Value* Idx, Type* Ty = nullptr, bool isReadOnly = true, const Twine& Name = ""); - virtual Value* GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name = ""); - virtual Value* - GEP(Value* Ptr, const std::initializer_list<Value*>& indexList, Type* Ty = nullptr); - virtual Value* - GEP(Value* Ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty = nullptr); - - virtual LoadInst* LOAD(Value* Ptr, - const char* Name, - Type* Ty = nullptr, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - virtual LoadInst* LOAD(Value* Ptr, - const Twine& Name = "", - Type* Ty = nullptr, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - virtual LoadInst* LOAD(Value* Ptr, - bool isVolatile, - const Twine& Name = "", - Type* Ty = nullptr, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - virtual LoadInst* LOAD(Value* BasePtr, - const std::initializer_list<uint32_t>& offset, - const llvm::Twine& Name = "", - Type* Ty = nullptr, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - - virtual CallInst* MASKED_LOAD(Value* Ptr, - unsigned Align, - Value* Mask, - Value* PassThru = nullptr, - const Twine& Name = "", - Type* Ty = nullptr, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - - virtual StoreInst* STORE(Value *Val, Value *Ptr, bool isVolatile = false, Type* Ty = nullptr, MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - - virtual StoreInst* STORE(Value* Val, Value* BasePtr, const std::initializer_list<uint32_t>& offset, Type* Ty = nullptr, MEM_CLIENT usage = 
MEM_CLIENT::MEM_CLIENT_INTERNAL); - - virtual CallInst* MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask, Type* Ty = nullptr, MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - - virtual Value* GATHERPS(Value* src, - Value* pBase, - Value* indices, - Value* mask, - uint8_t scale = 1, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - virtual Value* GATHERDD(Value* src, - Value* pBase, - Value* indices, - Value* mask, - uint8_t scale = 1, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - - virtual void SCATTERPS(Value* pDst, - Value* vSrc, - Value* vOffsets, - Value* vMask, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - - Value* TranslateGfxAddressForRead(Value* xpGfxAddress, - Type* PtrTy = nullptr, - const Twine& Name = "", - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - Value* TranslateGfxAddressForWrite(Value* xpGfxAddress, - Type* PtrTy = nullptr, - const Twine& Name = "", - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - - protected: - void AssertGFXMemoryParams(Value* ptr, MEM_CLIENT usage); - - virtual void NotifyPrivateContextSet(); - - virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset); - - Value* TranslationHelper(Value* Ptr, Type* Ty, Value* pfnTranslateGfxAddress); - void TrackerHelper(Value* Ptr, Type* Ty, MEM_CLIENT usage, bool isRead); - - FunctionType* GetTranslationFunctionType() { return mpTranslationFuncTy; } - Value* GetTranslationFunctionForRead() { return mpfnTranslateGfxAddressForRead; } - Value* GetTranslationFunctionForWrite() { return mpfnTranslateGfxAddressForWrite; } - Value* GetParamSimDC() { return mpParamSimDC; } - - Value* mpWorkerData; - - private: - FunctionType* mpTranslationFuncTy; - Value* mpfnTranslateGfxAddressForRead; - Value* mpfnTranslateGfxAddressForWrite; - Value* mpParamSimDC; - Value* mpfnTrackMemAccess; - }; -} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h deleted file mode 100644 index 02aa6f97cdf..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h +++ /dev/null @@ -1,34 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file builder_math.h - * - * @brief math/alu builder functions - * - * Notes: - * - ******************************************************************************/ -#pragma once - -Value* VLOG2PS(Value* src); -Value* VPOW24PS(Value* src); -Value* VEXP2PS(Value* src); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp deleted file mode 100644 index b5eb0a782b1..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp +++ /dev/null @@ -1,767 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file builder_misc.cpp - * - * @brief Implementation for miscellaneous builder functions - * - * Notes: - * - ******************************************************************************/ -#include "jit_pch.hpp" -#include "builder.h" - -#include <cstdarg> - -namespace SwrJit -{ - void Builder::AssertMemoryUsageParams(Value* ptr, MEM_CLIENT usage) - { - SWR_ASSERT( - ptr->getType() != mInt64Ty, - "Address appears to be GFX access. Requires translation through BuilderGfxMem."); - } - - Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, bool isReadOnly, const Twine& Name) - { - return IRB()->CreateGEP(Ptr, Idx, Name); - } - - Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name) - { - return IRB()->CreateGEP(Ty, Ptr, Idx, Name); - } - - Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty) - { - std::vector<Value*> indices; - for (auto i : indexList) - indices.push_back(i); - return GEPA(ptr, indices); - } - - Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty) - { - std::vector<Value*> indices; - for (auto i : indexList) - indices.push_back(C(i)); - return GEPA(ptr, indices); - } - - Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name) - { - return IRB()->CreateGEP(Ptr, IdxList, Name); - } - - Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name) - { - return IRB()->CreateGEP(Ty, Ptr, IdxList, Name); - } - - Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList) - { - std::vector<Value*> indices; - for (auto i : indexList) - indices.push_back(i); - return IN_BOUNDS_GEP(ptr, indices); - } - - Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList) - { - std::vector<Value*> indices; - for (auto i : indexList) - indices.push_back(C(i)); - return IN_BOUNDS_GEP(ptr, indices); - } - - LoadInst* Builder::LOAD(Value* Ptr, const char* Name, 
Type* Ty, MEM_CLIENT usage) - { - AssertMemoryUsageParams(Ptr, usage); - return IRB()->CreateLoad(Ptr, Name); - } - - LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, MEM_CLIENT usage) - { - AssertMemoryUsageParams(Ptr, usage); - return IRB()->CreateLoad(Ptr, Name); - } - - LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, MEM_CLIENT usage) - { - AssertMemoryUsageParams(Ptr, usage); - return IRB()->CreateLoad(Ty, Ptr, Name); - } - - LoadInst* - Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, MEM_CLIENT usage) - { - AssertMemoryUsageParams(Ptr, usage); - return IRB()->CreateLoad(Ptr, isVolatile, Name); - } - - LoadInst* Builder::LOAD(Value* basePtr, - const std::initializer_list<uint32_t>& indices, - const llvm::Twine& name, - Type* Ty, - MEM_CLIENT usage) - { - std::vector<Value*> valIndices; - for (auto i : indices) - valIndices.push_back(C(i)); - return Builder::LOAD(GEPA(basePtr, valIndices), name); - } - - LoadInst* Builder::LOADV(Value* basePtr, - const std::initializer_list<Value*>& indices, - const llvm::Twine& name) - { - std::vector<Value*> valIndices; - for (auto i : indices) - valIndices.push_back(i); - return LOAD(GEPA(basePtr, valIndices), name); - } - - StoreInst* - Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices, Type* Ty, MEM_CLIENT usage) - { - std::vector<Value*> valIndices; - for (auto i : indices) - valIndices.push_back(C(i)); - return STORE(val, GEPA(basePtr, valIndices)); - } - - StoreInst* - Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices) - { - std::vector<Value*> valIndices; - for (auto i : indices) - valIndices.push_back(i); - return STORE(val, GEPA(basePtr, valIndices)); - } - - Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset) - { - return GEP(base, offset); - } - - Value* Builder::MEM_ADD(Value* i32Incr, - Value* basePtr, - const std::initializer_list<uint32_t>& indices, - const 
llvm::Twine& name) - { - Value* i32Value = LOAD(GEP(basePtr, indices), name); - Value* i32Result = ADD(i32Value, i32Incr); - return STORE(i32Result, GEP(basePtr, indices)); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a masked gather operation in LLVM IR. If not - /// supported on the underlying platform, emulate it with loads - /// @param vSrc - SIMD wide value that will be loaded if mask is invalid - /// @param pBase - Int8* base VB address pointer value - /// @param vIndices - SIMD wide value of VB byte offsets - /// @param vMask - SIMD wide mask that controls whether to access memory or the src values - /// @param scale - value to scale indices by - Value* Builder::GATHERPS(Value* vSrc, - Value* pBase, - Value* vIndices, - Value* vMask, - uint8_t scale, - MEM_CLIENT usage) - { - AssertMemoryUsageParams(pBase, usage); - - return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale)); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a masked gather operation in LLVM IR. If not - /// supported on the underlying platform, emulate it with loads - /// @param vSrc - SIMD wide value that will be loaded if mask is invalid - /// @param pBase - Int8* base VB address pointer value - /// @param vIndices - SIMD wide value of VB byte offsets - /// @param vMask - SIMD wide mask that controls whether to access memory or the src values - /// @param scale - value to scale indices by - Value* Builder::GATHERDD(Value* vSrc, - Value* pBase, - Value* vIndices, - Value* vMask, - uint8_t scale, - MEM_CLIENT usage) - { - AssertMemoryUsageParams(pBase, usage); - - return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale)); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a masked gather operation in LLVM IR. 
If not - /// supported on the underlying platform, emulate it with loads - /// @param vSrc - SIMD wide value that will be loaded if mask is invalid - /// @param pBase - Int8* base VB address pointer value - /// @param vIndices - SIMD wide value of VB byte offsets - /// @param vMask - SIMD wide mask that controls whether to access memory or the src values - /// @param scale - value to scale indices by - Value* - Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) - { - return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale)); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Alternative masked gather where source is a vector of pointers - /// @param pVecSrcPtr - SIMD wide vector of pointers - /// @param pVecMask - SIMD active lanes - /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive - Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru) - { - return MASKED_GATHER(pVecSrcPtr, AlignType(4), pVecMask, pVecPassthru); - } - - void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask) - { - MASKED_SCATTER(pVecSrc, pVecDstPtr, AlignType(4), pVecMask); - } - - void Builder::Gather4(const SWR_FORMAT format, - Value* pSrcBase, - Value* byteOffsets, - Value* mask, - Value* vGatherComponents[], - bool bPackedOutput, - MEM_CLIENT usage) - { - const SWR_FORMAT_INFO& info = GetFormatInfo(format); - if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32) - { - GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage); - } - else - { - GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage); - } - } - - void Builder::GATHER4PS(const SWR_FORMAT_INFO& info, - Value* pSrcBase, - Value* byteOffsets, - Value* vMask, - Value* vGatherComponents[], - bool bPackedOutput, - MEM_CLIENT usage) - { - switch (info.bpp / info.numComps) - { - case 16: - { - Value* 
vGatherResult[2]; - - // TODO: vGatherMaskedVal - Value* vGatherMaskedVal = VIMMED1((float)0); - - // always have at least one component out of x or y to fetch - - vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - - // if we have at least one component out of x or y to fetch - if (info.numComps > 2) - { - // offset base to the next components(zw) in the vertex to gather - pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4)); - - vGatherResult[1] = - GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage); - // e.g. result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } - else - { - vGatherResult[1] = vGatherMaskedVal; - } - - // Shuffle gathered components into place, each row is a component - Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); - } - break; - case 32: - { - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]); - } - - for (uint32_t i = 0; i < info.numComps; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - - // Gather a SIMD of components - vGatherComponents[swizzleIndex] = GATHERPS( - vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage); - - // offset base to the next component to gather - pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4)); - } - } - break; - default: - SWR_INVALID("Invalid float format"); - break; - } - } - - void Builder::GATHER4DD(const SWR_FORMAT_INFO& info, - Value* pSrcBase, - Value* byteOffsets, - Value* vMask, - Value* vGatherComponents[], - bool bPackedOutput, - MEM_CLIENT usage) - { - switch (info.bpp / info.numComps) - { - case 8: - { - Value* vGatherMaskedVal = VIMMED1((int32_t)0); - Value* vGatherResult = - 
GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage); - // e.g. result of an 8x32bit integer gather for 8bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw - - Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); - } - break; - case 16: - { - Value* vGatherResult[2]; - - // TODO: vGatherMaskedVal - Value* vGatherMaskedVal = VIMMED1((int32_t)0); - - // always have at least one component out of x or y to fetch - - vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - - // if we have at least one component out of x or y to fetch - if (info.numComps > 2) - { - // offset base to the next components(zw) in the vertex to gather - pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4)); - - vGatherResult[1] = - GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage); - // e.g. 
result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } - else - { - vGatherResult[1] = vGatherMaskedVal; - } - - // Shuffle gathered components into place, each row is a component - Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); - } - break; - case 32: - { - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherComponents[i] = VIMMED1((int)info.defaults[i]); - } - - for (uint32_t i = 0; i < info.numComps; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - - // Gather a SIMD of components - vGatherComponents[swizzleIndex] = GATHERDD( - vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage); - - // offset base to the next component to gather - pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4)); - } - } - break; - default: - SWR_INVALID("unsupported format"); - break; - } - } - - void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info, - Value* vGatherInput[2], - Value* vGatherOutput[4], - bool bPackedOutput) - { - // cast types - Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type* v32x8Ty = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits - - // input could either be float or int vector; do shuffle work in int - vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); - vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); - - if (bPackedOutput) - { - Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), - mVWidth / 4); // vwidth is units of 32 bits - - // shuffle mask - Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); - Value* vShufResult = - BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); - // after pshufb: group components together in each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx yyyy 
yyyy xxxx xxxx yyyy yyyy - - Value* vi128XY = - BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); - // after PERMD: move and pack xy components into each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy - - // do the same for zw components - Value* vi128ZW = nullptr; - if (info.numComps > 2) - { - Value* vShufResult = - BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); - vi128ZW = - BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); - } - - for (uint32_t i = 0; i < 4; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - // todo: fixed for packed - Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); - if (i >= info.numComps) - { - // set the default component val - vGatherOutput[swizzleIndex] = vGatherMaskedVal; - continue; - } - - // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 - uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; - - // extract packed component 128 bit lanes - vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); - } - } - else - { - // pshufb masks for each component - Value* vConstMask[2]; - // x/z shuffle mask - vConstMask[0] = C<char>({ - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - }); - - // y/w shuffle mask - vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, - 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); - - // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); - } - - for (uint32_t i = 0; i < info.numComps; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - - // select correct constMask for x/z or y/w pshufb - uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - uint32_t selectedGather = (i < 2) ? 
0 : 1; - - vGatherOutput[swizzleIndex] = - BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), - vConstMask[selectedMask]), - vGatherTy); - // after pshufb mask for x channel; z uses the same shuffle from the second gather - // 256i - 0 1 2 3 4 5 6 7 - // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 - } - } - } - - void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info, - Value* vGatherInput, - Value* vGatherOutput[], - bool bPackedOutput) - { - // cast types - Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type* v32x8Ty = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits - - if (bPackedOutput) - { - Type* v128Ty = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), - mVWidth / 4); // vwidth is units of 32 bits - // shuffle mask - Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, - 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); - Value* vShufResult = - BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); - // after pshufb: group components together in each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww - - Value* vi128XY = - BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); - // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) - - // do the same for zw components - Value* vi128ZW = nullptr; - if (info.numComps > 2) - { - vi128ZW = - BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); - } - - // sign extend all enabled components. 
If we have a fill vVertexElements, output to - // current simdvertex - for (uint32_t i = 0; i < 4; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - // todo: fix for packed - Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); - if (i >= info.numComps) - { - // set the default component val - vGatherOutput[swizzleIndex] = vGatherMaskedVal; - continue; - } - - // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 - uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; - - // sign extend - vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); - } - } - // else zero extend - else - { - // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); - } - - for (uint32_t i = 0; i < info.numComps; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - - // pshufb masks for each component - Value* vConstMask; - switch (i) - { - case 0: - // x shuffle mask - vConstMask = - C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, - 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); - break; - case 1: - // y shuffle mask - vConstMask = - C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, - 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); - break; - case 2: - // z shuffle mask - vConstMask = - C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, - 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); - break; - case 3: - // w shuffle mask - vConstMask = - C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, - 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); - break; - default: - vConstMask = nullptr; - break; - } - - assert(vConstMask && "Invalid 
info.numComps value"); - vGatherOutput[swizzleIndex] = - BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); - // after pshufb for x channel - // 256i - 0 1 2 3 4 5 6 7 - // x000 x000 x000 x000 x000 x000 x000 x000 - } - } - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief emulates a scatter operation. - /// @param pDst - pointer to destination - /// @param vSrc - vector of src data to scatter - /// @param vOffsets - vector of byte offsets from pDst - /// @param vMask - mask of valid lanes - void Builder::SCATTERPS( - Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, MEM_CLIENT usage) - { - AssertMemoryUsageParams(pDst, usage); -#if LLVM_VERSION_MAJOR >= 11 - SWR_ASSERT(cast<VectorType>(vSrc->getType())->getElementType()->isFloatTy()); -#else - SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy()); -#endif - VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1)); - return; - - /* Scatter algorithm - - while(Index = BitScanForward(mask)) - srcElem = srcVector[Index] - offsetElem = offsetVector[Index] - *(pDst + offsetElem) = srcElem - Update mask (&= ~(1<<Index) - - */ - - /* - - // Reference implementation kept around for reference - - BasicBlock* pCurBB = IRB()->GetInsertBlock(); - Function* pFunc = pCurBB->getParent(); - Type* pSrcTy = vSrc->getType()->getVectorElementType(); - - // Store vectors on stack - if (pScatterStackSrc == nullptr) - { - // Save off stack allocations and reuse per scatter. Significantly reduces stack - // requirements for shaders with a lot of scatters. 
- pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty); - pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty); - } - - Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0)); - Value* pOffsetsArrayPtr = pScatterStackOffsets; - STORE(vSrc, pSrcArrayPtr); - STORE(vOffsets, pOffsetsArrayPtr); - - // Cast to pointers for random access - pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0)); - pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0)); - - Value* pMask = VMOVMSK(vMask); - - // Setup loop basic block - BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc); - - // compute first set bit - Value* pIndex = CTTZ(pMask, C(false)); - - Value* pIsUndef = ICMP_EQ(pIndex, C(32)); - - // Split current block or create new one if building inline - BasicBlock* pPostLoop; - if (pCurBB->getTerminator()) - { - pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode()); - - // Remove unconditional jump created by splitBasicBlock - pCurBB->getTerminator()->eraseFromParent(); - - // Add terminator to end of original block - IRB()->SetInsertPoint(pCurBB); - - // Add conditional branch - COND_BR(pIsUndef, pPostLoop, pLoop); - } - else - { - pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc); - - // Add conditional branch - COND_BR(pIsUndef, pPostLoop, pLoop); - } - - // Add loop basic block contents - IRB()->SetInsertPoint(pLoop); - PHINode* pIndexPhi = PHI(mInt32Ty, 2); - PHINode* pMaskPhi = PHI(mInt32Ty, 2); - - pIndexPhi->addIncoming(pIndex, pCurBB); - pMaskPhi->addIncoming(pMask, pCurBB); - - // Extract elements for this index - Value* pSrcElem = LOADV(pSrcArrayPtr, {pIndexPhi}); - Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi}); - - // GEP to this offset in dst - Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy); - pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0)); - STORE(pSrcElem, 
pCurDst); - - // Update the mask - Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi))); - - // Terminator - Value* pNewIndex = CTTZ(pNewMask, C(false)); - - pIsUndef = ICMP_EQ(pNewIndex, C(32)); - COND_BR(pIsUndef, pPostLoop, pLoop); - - // Update phi edges - pIndexPhi->addIncoming(pNewIndex, pLoop); - pMaskPhi->addIncoming(pNewMask, pLoop); - - // Move builder to beginning of post loop - IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin()); - - */ - } -} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h deleted file mode 100644 index 429d5779a4d..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h +++ /dev/null @@ -1,170 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file builder_misc.h - * - * @brief miscellaneous builder functions - * - * Notes: - * - ******************************************************************************/ -#pragma once - -public: -enum class MEM_CLIENT -{ - MEM_CLIENT_INTERNAL, - GFX_MEM_CLIENT_FETCH, - GFX_MEM_CLIENT_SAMPLER, - GFX_MEM_CLIENT_SHADER, - GFX_MEM_CLIENT_STREAMOUT, - GFX_MEM_CLIENT_URB -}; - -protected: -virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset); -void AssertMemoryUsageParams(Value* ptr, MEM_CLIENT usage); - -public: -virtual Value* GEP(Value* Ptr, Value* Idx, Type* Ty = nullptr, bool isReadOnly = true, const Twine& Name = ""); -virtual Value* GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name = ""); -virtual Value* GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty = nullptr); -virtual Value* -GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty = nullptr); - -Value* GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name = ""); -Value* GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name = ""); - -Value* IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList); -Value* IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList); - -virtual LoadInst* - LOAD(Value* Ptr, const char* Name, Type* Ty = nullptr, MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); -virtual LoadInst* LOAD(Value* Ptr, - const Twine& Name = "", - Type* Ty = nullptr, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); -virtual LoadInst* - LOAD(Type* Ty, Value* Ptr, const Twine& Name = "", MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); -virtual LoadInst* LOAD(Value* Ptr, - bool isVolatile, - 
const Twine& Name = "", - Type* Ty = nullptr, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); -virtual LoadInst* LOAD(Value* BasePtr, - const std::initializer_list<uint32_t>& offset, - const llvm::Twine& Name = "", - Type* Ty = nullptr, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - -virtual CallInst* MASKED_LOAD(Value* Ptr, - unsigned Align, - Value* Mask, - Value* PassThru = nullptr, - const Twine& Name = "", - Type* Ty = nullptr, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL) -{ - return IRB()->CreateMaskedLoad(Ptr, AlignType(Align), Mask, PassThru, Name); -} - -virtual StoreInst* STORE(Value *Val, Value *Ptr, bool isVolatile = false, Type* Ty = nullptr, MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL) -{ - return IRB()->CreateStore(Val, Ptr, isVolatile); -} - -virtual StoreInst* STORE(Value* Val, Value* BasePtr, const std::initializer_list<uint32_t>& offset, Type* Ty = nullptr, MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - -virtual CallInst* MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask, Type* Ty = nullptr, MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL) -{ - return IRB()->CreateMaskedStore(Val, Ptr, AlignType(Align), Mask); -} - -LoadInst* LOADV(Value* BasePtr, const std::initializer_list<Value*>& offset, const llvm::Twine& name = ""); -StoreInst* STOREV(Value* Val, Value* BasePtr, const std::initializer_list<Value*>& offset); - -Value* MEM_ADD(Value* i32Incr, - Value* basePtr, - const std::initializer_list<uint32_t>& indices, - const llvm::Twine& name = ""); - -void Gather4(const SWR_FORMAT format, - Value* pSrcBase, - Value* byteOffsets, - Value* mask, - Value* vGatherComponents[], - bool bPackedOutput, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - -virtual Value* GATHERPS(Value* src, - Value* pBase, - Value* indices, - Value* mask, - uint8_t scale = 1, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - -void GATHER4PS(const SWR_FORMAT_INFO& info, - Value* pSrcBase, - Value* 
byteOffsets, - Value* mask, - Value* vGatherComponents[], - bool bPackedOutput, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - -virtual Value* GATHERDD(Value* src, - Value* pBase, - Value* indices, - Value* mask, - uint8_t scale = 1, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - -void GATHER4DD(const SWR_FORMAT_INFO& info, - Value* pSrcBase, - Value* byteOffsets, - Value* mask, - Value* vGatherComponents[], - bool bPackedOutput, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - -Value* GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1); - -Value* GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru); -void SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask); - -virtual void SCATTERPS(Value* pDst, - Value* vSrc, - Value* vOffsets, - Value* vMask, - MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL); - -void Shuffle8bpcGather4(const SWR_FORMAT_INFO& info, - Value* vGatherInput, - Value* vGatherOutput[], - bool bPackedOutput); -void Shuffle16bpcGather4(const SWR_FORMAT_INFO& info, - Value* vGatherInput[], - Value* vGatherOutput[], - bool bPackedOutput); - -// Static stack allocations for scatter operations -Value* pScatterStackSrc{nullptr}; -Value* pScatterStackOffsets{nullptr}; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp deleted file mode 100644 index 8080a40a1f9..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ /dev/null @@ -1,1125 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file builder_misc.cpp - * - * @brief Implementation for miscellaneous builder functions - * - * Notes: - * - ******************************************************************************/ -#include "jit_pch.hpp" -#include "builder.h" -#include "common/rdtsc_buckets.h" - -#include <cstdarg> - -extern "C" void CallPrint(const char* fmt, ...); - -namespace SwrJit -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Convert an IEEE 754 32-bit single precision float to an - /// 16 bit float with 5 exponent bits and a variable - /// number of mantissa bits. - /// @param val - 32-bit float - /// @todo Maybe move this outside of this file into a header? 
- static uint16_t ConvertFloat32ToFloat16(float val) - { - uint32_t sign, exp, mant; - uint32_t roundBits; - - // Extract the sign, exponent, and mantissa - uint32_t uf = *(uint32_t*)&val; - sign = (uf & 0x80000000) >> 31; - exp = (uf & 0x7F800000) >> 23; - mant = uf & 0x007FFFFF; - - // Check for out of range - if (std::isnan(val)) - { - exp = 0x1F; - mant = 0x200; - sign = 1; // set the sign bit for NANs - } - else if (std::isinf(val)) - { - exp = 0x1f; - mant = 0x0; - } - else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value - { - exp = 0x1E; - mant = 0x3FF; - } - else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm - { - mant |= 0x00800000; - for (; exp <= 0x70; mant >>= 1, exp++) - ; - exp = 0; - mant = mant >> 13; - } - else if (exp < 0x66) // Too small to represent -> Zero - { - exp = 0; - mant = 0; - } - else - { - // Saves bits that will be shifted off for rounding - roundBits = mant & 0x1FFFu; - // convert exponent and mantissa to 16 bit format - exp = exp - 0x70; - mant = mant >> 13; - - // Essentially RTZ, but round up if off by only 1 lsb - if (roundBits == 0x1FFFu) - { - mant++; - // check for overflow - if ((mant & 0xC00u) != 0) - exp++; - // make sure only the needed bits are used - mant &= 0x3FF; - } - } - - uint32_t tmpVal = (sign << 15) | (exp << 10) | mant; - return (uint16_t)tmpVal; - } - - Constant* Builder::C(bool i) { return ConstantInt::get(IRB()->getInt1Ty(), (i ? 
1 : 0)); } - - Constant* Builder::C(char i) { return ConstantInt::get(IRB()->getInt8Ty(), i); } - - Constant* Builder::C(uint8_t i) { return ConstantInt::get(IRB()->getInt8Ty(), i); } - - Constant* Builder::C(int i) { return ConstantInt::get(IRB()->getInt32Ty(), i); } - - Constant* Builder::C(int64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); } - - Constant* Builder::C(uint16_t i) { return ConstantInt::get(mInt16Ty, i); } - - Constant* Builder::C(uint32_t i) { return ConstantInt::get(IRB()->getInt32Ty(), i); } - - Constant* Builder::C(uint64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); } - - Constant* Builder::C(float i) { return ConstantFP::get(IRB()->getFloatTy(), i); } - - Constant* Builder::PRED(bool pred) - { - return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0)); - } - - Value* Builder::VIMMED1(uint64_t i) - { -#if LLVM_VERSION_MAJOR <= 10 - return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); -#elif LLVM_VERSION_MAJOR == 11 - return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i))); -#else - return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i))); -#endif - } - - Value* Builder::VIMMED1_16(uint64_t i) - { -#if LLVM_VERSION_MAJOR <= 10 - return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i))); -#elif LLVM_VERSION_MAJOR == 11 - return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i))); -#else - return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i))); -#endif - } - - Value* Builder::VIMMED1(int i) - { -#if LLVM_VERSION_MAJOR <= 10 - return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); -#elif LLVM_VERSION_MAJOR == 11 - return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i))); -#else - return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i))); -#endif - } - - Value* Builder::VIMMED1_16(int i) - { 
-#if LLVM_VERSION_MAJOR <= 10 - return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i))); -#elif LLVM_VERSION_MAJOR == 11 - return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i))); -#else - return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i))); -#endif - } - - Value* Builder::VIMMED1(uint32_t i) - { -#if LLVM_VERSION_MAJOR <= 10 - return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); -#elif LLVM_VERSION_MAJOR == 11 - return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i))); -#else - return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i))); -#endif - } - - Value* Builder::VIMMED1_16(uint32_t i) - { -#if LLVM_VERSION_MAJOR <= 10 - return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i))); -#elif LLVM_VERSION_MAJOR == 11 - return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i))); -#else - return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i))); -#endif - } - - Value* Builder::VIMMED1(float i) - { -#if LLVM_VERSION_MAJOR <= 10 - return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i))); -#elif LLVM_VERSION_MAJOR == 11 - return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantFP>(C(i))); -#else - return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantFP>(C(i))); -#endif - } - - Value* Builder::VIMMED1_16(float i) - { -#if LLVM_VERSION_MAJOR <= 10 - return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i))); -#elif LLVM_VERSION_MAJOR == 11 - return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantFP>(C(i))); -#else - return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantFP>(C(i))); -#endif - } - - Value* Builder::VIMMED1(bool i) - { -#if LLVM_VERSION_MAJOR <= 10 - return ConstantVector::getSplat(mVWidth, 
cast<ConstantInt>(C(i))); -#elif LLVM_VERSION_MAJOR == 11 - return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i))); -#else - return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i))); -#endif - } - - Value* Builder::VIMMED1_16(bool i) - { -#if LLVM_VERSION_MAJOR <= 10 - return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i))); -#elif LLVM_VERSION_MAJOR == 11 - return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i))); -#else - return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i))); -#endif - } - - Value* Builder::VUNDEF_IPTR() { return UndefValue::get(getVectorType(mInt32PtrTy, mVWidth)); } - - Value* Builder::VUNDEF(Type* t) { return UndefValue::get(getVectorType(t, mVWidth)); } - - Value* Builder::VUNDEF_I() { return UndefValue::get(getVectorType(mInt32Ty, mVWidth)); } - - Value* Builder::VUNDEF_I_16() { return UndefValue::get(getVectorType(mInt32Ty, mVWidth16)); } - - Value* Builder::VUNDEF_F() { return UndefValue::get(getVectorType(mFP32Ty, mVWidth)); } - - Value* Builder::VUNDEF_F_16() { return UndefValue::get(getVectorType(mFP32Ty, mVWidth16)); } - - Value* Builder::VUNDEF(Type* ty, uint32_t size) - { - return UndefValue::get(getVectorType(ty, size)); - } - - Value* Builder::VBROADCAST(Value* src, const llvm::Twine& name) - { - // check if src is already a vector - if (src->getType()->isVectorTy()) - { - return src; - } - - return VECTOR_SPLAT(mVWidth, src, name); - } - - Value* Builder::VBROADCAST_16(Value* src) - { - // check if src is already a vector - if (src->getType()->isVectorTy()) - { - return src; - } - - return VECTOR_SPLAT(mVWidth16, src); - } - - uint32_t Builder::IMMED(Value* v) - { - SWR_ASSERT(isa<ConstantInt>(v)); - ConstantInt* pValConst = cast<ConstantInt>(v); - return pValConst->getZExtValue(); - } - - int32_t Builder::S_IMMED(Value* v) - { - SWR_ASSERT(isa<ConstantInt>(v)); - ConstantInt* 
pValConst = cast<ConstantInt>(v); - return pValConst->getSExtValue(); - } - - CallInst* Builder::CALL(Value* Callee, - const std::initializer_list<Value*>& argsList, - const llvm::Twine& name) - { - std::vector<Value*> args; - for (auto arg : argsList) - args.push_back(arg); -#if LLVM_VERSION_MAJOR >= 11 - // see comment to CALLA(Callee) function in the header - return CALLA(FunctionCallee(cast<Function>(Callee)), args, name); -#else - return CALLA(Callee, args, name); -#endif - } - - CallInst* Builder::CALL(Value* Callee, Value* arg) - { - std::vector<Value*> args; - args.push_back(arg); -#if LLVM_VERSION_MAJOR >= 11 - // see comment to CALLA(Callee) function in the header - return CALLA(FunctionCallee(cast<Function>(Callee)), args); -#else - return CALLA(Callee, args); -#endif - } - - CallInst* Builder::CALL2(Value* Callee, Value* arg1, Value* arg2) - { - std::vector<Value*> args; - args.push_back(arg1); - args.push_back(arg2); -#if LLVM_VERSION_MAJOR >= 11 - // see comment to CALLA(Callee) function in the header - return CALLA(FunctionCallee(cast<Function>(Callee)), args); -#else - return CALLA(Callee, args); -#endif - } - - CallInst* Builder::CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3) - { - std::vector<Value*> args; - args.push_back(arg1); - args.push_back(arg2); - args.push_back(arg3); -#if LLVM_VERSION_MAJOR >= 11 - // see comment to CALLA(Callee) function in the header - return CALLA(FunctionCallee(cast<Function>(Callee)), args); -#else - return CALLA(Callee, args); -#endif - } - - Value* Builder::VRCP(Value* va, const llvm::Twine& name) - { - return FDIV(VIMMED1(1.0f), va, name); // 1 / a - } - - Value* Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY) - { - Value* vOut = FMADDPS(vA, vX, vC); - vOut = FMADDPS(vB, vY, vOut); - return vOut; - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief insert a JIT call to CallPrint - /// - outputs formatted string to both stdout and 
VS output window - /// - DEBUG builds only - /// Usage example: - /// PRINT("index %d = 0x%p\n",{C(lane), pIndex}); - /// where C(lane) creates a constant value to print, and pIndex is the Value* - /// result from a GEP, printing out the pointer to memory - /// @param printStr - constant string to print, which includes format specifiers - /// @param printArgs - initializer list of Value*'s to print to std out - CallInst* Builder::PRINT(const std::string& printStr, - const std::initializer_list<Value*>& printArgs) - { - // push the arguments to CallPrint into a vector - std::vector<Value*> printCallArgs; - // save room for the format string. we still need to modify it for vectors - printCallArgs.resize(1); - - // search through the format string for special processing - size_t pos = 0; - std::string tempStr(printStr); - pos = tempStr.find('%', pos); - auto v = printArgs.begin(); - - while ((pos != std::string::npos) && (v != printArgs.end())) - { - Value* pArg = *v; - Type* pType = pArg->getType(); - - if (pType->isVectorTy()) - { - Type* pContainedType = pType->getContainedType(0); -#if LLVM_VERSION_MAJOR >= 12 - FixedVectorType* pVectorType = cast<FixedVectorType>(pType); -#elif LLVM_VERSION_MAJOR >= 11 - VectorType* pVectorType = cast<VectorType>(pType); -#endif - if (toupper(tempStr[pos + 1]) == 'X') - { - tempStr[pos] = '0'; - tempStr[pos + 1] = 'x'; - tempStr.insert(pos + 2, "%08X "); - pos += 7; - - printCallArgs.push_back(VEXTRACT(pArg, C(0))); - - std::string vectorFormatStr; -#if LLVM_VERSION_MAJOR >= 11 - for (uint32_t i = 1; i < pVectorType->getNumElements(); ++i) -#else - for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i) -#endif - { - vectorFormatStr += "0x%08X "; - printCallArgs.push_back(VEXTRACT(pArg, C(i))); - } - - tempStr.insert(pos, vectorFormatStr); - pos += vectorFormatStr.size(); - } - else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy())) - { - uint32_t i = 0; -#if LLVM_VERSION_MAJOR >= 11 - for (; i < 
pVectorType->getNumElements() - 1; i++) -#else - for (; i < pType->getVectorNumElements() - 1; i++) -#endif - { - tempStr.insert(pos, std::string("%f ")); - pos += 3; - printCallArgs.push_back( - FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); - } - printCallArgs.push_back( - FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); - } - else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy())) - { - uint32_t i = 0; -#if LLVM_VERSION_MAJOR >= 11 - for (; i < pVectorType->getNumElements() - 1; i++) -#else - for (; i < pType->getVectorNumElements() - 1; i++) -#endif - { - tempStr.insert(pos, std::string("%d ")); - pos += 3; - printCallArgs.push_back( - S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); - } - printCallArgs.push_back( - S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); - } - else if ((tempStr[pos + 1] == 'u') && (pContainedType->isIntegerTy())) - { - uint32_t i = 0; -#if LLVM_VERSION_MAJOR >= 11 - for (; i < pVectorType->getNumElements() - 1; i++) -#else - for (; i < pType->getVectorNumElements() - 1; i++) -#endif - { - tempStr.insert(pos, std::string("%d ")); - pos += 3; - printCallArgs.push_back( - Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); - } - printCallArgs.push_back( - Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); - } - } - else - { - if (toupper(tempStr[pos + 1]) == 'X') - { - tempStr[pos] = '0'; - tempStr.insert(pos + 1, "x%08"); - printCallArgs.push_back(pArg); - pos += 3; - } - // for %f we need to cast float Values to doubles so that they print out correctly - else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy())) - { - printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext))); - pos++; - } - else - { - printCallArgs.push_back(pArg); - } - } - - // advance to the next argument - v++; - pos = tempStr.find('%', ++pos); - } - - // create global variable constant string - Constant* constString = 
ConstantDataArray::getString(JM()->mContext, tempStr, true); - GlobalVariable* gvPtr = new GlobalVariable( - constString->getType(), true, GlobalValue::InternalLinkage, constString, "printStr"); - JM()->mpCurrentModule->getGlobalList().push_back(gvPtr); - - // get a pointer to the first character in the constant string array - std::vector<Constant*> geplist{C(0), C(0)}; - Constant* strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr, geplist, false); - - // insert the pointer to the format string in the argument vector - printCallArgs[0] = strGEP; - - // get pointer to CallPrint function and insert decl into the module if needed - std::vector<Type*> args; - args.push_back(PointerType::get(mInt8Ty, 0)); - FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, true); - Function* callPrintFn = -#if LLVM_VERSION_MAJOR >= 9 - cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy).getCallee()); -#else - cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy)); -#endif - - // if we haven't yet added the symbol to the symbol table - if ((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr) - { - sys::DynamicLibrary::AddSymbol("CallPrint", (void*)&CallPrint); - } - - // insert a call to CallPrint - return CALLA(callPrintFn, printCallArgs); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Wrapper around PRINT with initializer list. 
- CallInst* Builder::PRINT(const std::string& printStr) { return PRINT(printStr, {}); } - - Value* Builder::EXTRACT_16(Value* x, uint32_t imm) - { - if (imm == 0) - { - return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7}); - } - else - { - return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15}); - } - } - - Value* Builder::JOIN_16(Value* a, Value* b) - { - return VSHUFFLE(a, b, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief convert x86 <N x float> mask to llvm <N x i1> mask - Value* Builder::MASK(Value* vmask) - { - Value* src = BITCAST(vmask, mSimdInt32Ty); - return ICMP_SLT(src, VIMMED1(0)); - } - - Value* Builder::MASK_16(Value* vmask) - { - Value* src = BITCAST(vmask, mSimd16Int32Ty); - return ICMP_SLT(src, VIMMED1_16(0)); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask - Value* Builder::VMASK(Value* mask) { return S_EXT(mask, mSimdInt32Ty); } - - Value* Builder::VMASK_16(Value* mask) { return S_EXT(mask, mSimd16Int32Ty); } - - /// @brief Convert <Nxi1> llvm mask to integer - Value* Builder::VMOVMSK(Value* mask) - { -#if LLVM_VERSION_MAJOR >= 11 -#if LLVM_VERSION_MAJOR >= 12 - FixedVectorType* pVectorType = cast<FixedVectorType>(mask->getType()); -#else - VectorType* pVectorType = cast<VectorType>(mask->getType()); -#endif - SWR_ASSERT(pVectorType->getElementType() == mInt1Ty); - uint32_t numLanes = pVectorType->getNumElements(); -#else - SWR_ASSERT(mask->getType()->getVectorElementType() == mInt1Ty); - uint32_t numLanes = mask->getType()->getVectorNumElements(); -#endif - Value* i32Result; - if (numLanes == 8) - { - i32Result = BITCAST(mask, mInt8Ty); - } - else if (numLanes == 16) - { - i32Result = BITCAST(mask, mInt16Ty); - } - else - { - SWR_ASSERT("Unsupported vector width"); - i32Result = BITCAST(mask, 
mInt8Ty); - } - return Z_EXT(i32Result, mInt32Ty); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a VPSHUFB operation in LLVM IR. If not - /// supported on the underlying platform, emulate it - /// @param a - 256bit SIMD(32x8bit) of 8bit integer values - /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values - /// Byte masks in lower 128 lane of b selects 8 bit values from lower - /// 128bits of a, and vice versa for the upper lanes. If the mask - /// value is negative, '0' is inserted. - Value* Builder::PSHUFB(Value* a, Value* b) - { - Value* res; - // use avx2 pshufb instruction if available - if (JM()->mArch.AVX2()) - { - res = VPSHUFB(a, b); - } - else - { - Constant* cB = dyn_cast<Constant>(b); - assert(cB != nullptr); - // number of 8 bit elements in b -#if LLVM_VERSION_MAJOR >= 12 - uint32_t numElms = cast<FixedVectorType>(cB->getType())->getNumElements(); -#else - uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements(); -#endif - // output vector - Value* vShuf = UndefValue::get(getVectorType(mInt8Ty, numElms)); - - // insert an 8 bit value from the high and low lanes of a per loop iteration - numElms /= 2; - for (uint32_t i = 0; i < numElms; i++) - { - ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i)); - ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms)); - - // extract values from constant mask - char valLow128bLane = (char)(cLow128b->getSExtValue()); - char valHigh128bLane = (char)(cHigh128b->getSExtValue()); - - Value* insertValLow128b; - Value* insertValHigh128b; - - // if the mask value is negative, insert a '0' in the respective output position - // otherwise, lookup the value at mask position (bits 3..0 of the respective mask - // byte) in a and insert in output vector - insertValLow128b = - (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF))); - insertValHigh128b = (valHigh128bLane < 0) - ? 
C((char)0) - : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms)); - - vShuf = VINSERT(vShuf, insertValLow128b, i); - vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms)); - } - res = vShuf; - } - return res; - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32 - /// bits)in LLVM IR. If not supported on the underlying platform, emulate it - /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only - /// lower 8 values are used. - Value* Builder::PMOVSXBD(Value* a) - { - // VPMOVSXBD output type - Type* v8x32Ty = getVectorType(mInt32Ty, 8); - // Extract 8 values from 128bit lane and sign extend - return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32 - /// bits)in LLVM IR. If not supported on the underlying platform, emulate it - /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values. - Value* Builder::PMOVSXWD(Value* a) - { - // VPMOVSXWD output type - Type* v8x32Ty = getVectorType(mInt32Ty, 8); - // Extract 8 values from 128bit lane and sign extend - return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion) - /// in LLVM IR. If not supported on the underlying platform, emulate it - /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. 
- Value* Builder::CVTPH2PS(Value* a, const llvm::Twine& name) - { - // Bitcast Nxint16 to Nxhalf -#if LLVM_VERSION_MAJOR >= 12 - uint32_t numElems = cast<FixedVectorType>(a->getType())->getNumElements(); -#elif LLVM_VERSION_MAJOR >= 11 - uint32_t numElems = cast<VectorType>(a->getType())->getNumElements(); -#else - uint32_t numElems = a->getType()->getVectorNumElements(); -#endif - Value* input = BITCAST(a, getVectorType(mFP16Ty, numElems)); - - return FP_EXT(input, getVectorType(mFP32Ty, numElems), name); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion) - /// in LLVM IR. If not supported on the underlying platform, emulate it - /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. - Value* Builder::CVTPS2PH(Value* a, Value* rounding) - { - if (JM()->mArch.F16C()) - { - return VCVTPS2PH(a, rounding); - } - else - { - // call scalar C function for now - FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty); - Function* pCvtPs2Ph = cast<Function>( -#if LLVM_VERSION_MAJOR >= 9 - JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy).getCallee()); -#else - JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy)); -#endif - - if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr) - { - sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", - (void*)&ConvertFloat32ToFloat16); - } - - Value* pResult = UndefValue::get(mSimdInt16Ty); - for (uint32_t i = 0; i < mVWidth; ++i) - { - Value* pSrc = VEXTRACT(a, C(i)); - Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc}); - pResult = VINSERT(pResult, pConv, C(i)); - } - - return pResult; - } - } - - Value* Builder::PMAXSD(Value* a, Value* b) - { - Value* cmp = ICMP_SGT(a, b); - return SELECT(cmp, a, b); - } - - Value* Builder::PMINSD(Value* a, Value* b) - { - Value* cmp = ICMP_SLT(a, b); - return 
SELECT(cmp, a, b); - } - - Value* Builder::PMAXUD(Value* a, Value* b) - { - Value* cmp = ICMP_UGT(a, b); - return SELECT(cmp, a, b); - } - - Value* Builder::PMINUD(Value* a, Value* b) - { - Value* cmp = ICMP_ULT(a, b); - return SELECT(cmp, a, b); - } - - // Helper function to create alloca in entry block of function - Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType) - { - auto saveIP = IRB()->saveIP(); - IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin()); - Value* pAlloca = ALLOCA(pType); - if (saveIP.isSet()) - IRB()->restoreIP(saveIP); - return pAlloca; - } - - Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize) - { - auto saveIP = IRB()->saveIP(); - IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin()); - Value* pAlloca = ALLOCA(pType, pArraySize); - if (saveIP.isSet()) - IRB()->restoreIP(saveIP); - return pAlloca; - } - - Value* Builder::VABSPS(Value* a) - { - Value* asInt = BITCAST(a, mSimdInt32Ty); - Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); - return result; - } - - Value* Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name) - { - Value* lowCmp = ICMP_SLT(src, low); - Value* ret = SELECT(lowCmp, low, src); - - Value* highCmp = ICMP_SGT(ret, high); - ret = SELECT(highCmp, high, ret, name); - - return ret; - } - - Value* Builder::FCLAMP(Value* src, Value* low, Value* high) - { - Value* lowCmp = FCMP_OLT(src, low); - Value* ret = SELECT(lowCmp, low, src); - - Value* highCmp = FCMP_OGT(ret, high); - ret = SELECT(highCmp, high, ret); - - return ret; - } - - Value* Builder::FCLAMP(Value* src, float low, float high) - { - Value* result = VMAXPS(src, VIMMED1(low)); - result = VMINPS(result, VIMMED1(high)); - - return result; - } - - Value* Builder::FMADDPS(Value* a, Value* b, Value* c) - { - Value* vOut; - // This maps to LLVM fmuladd intrinsic - vOut = VFMADDPS(a, b, c); - return vOut; - } - - 
////////////////////////////////////////////////////////////////////////// - /// @brief pop count on vector mask (e.g. <8 x i1>) - Value* Builder::VPOPCNT(Value* a) { return POPCNT(VMOVMSK(a)); } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Float / Fixed-point conversions - ////////////////////////////////////////////////////////////////////////// - Value* Builder::VCVT_F32_FIXED_SI(Value* vFloat, - uint32_t numIntBits, - uint32_t numFracBits, - const llvm::Twine& name) - { - SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values"); - Value* fixed = nullptr; - -#if 0 // This doesn't work for negative numbers!! - { - fixed = FP_TO_SI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))), - C(_MM_FROUND_TO_NEAREST_INT)), - mSimdInt32Ty); - } - else -#endif - { - // Do round to nearest int on fractional bits first - // Not entirely perfect for negative numbers, but close enough - vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))), - C(_MM_FROUND_TO_NEAREST_INT)); - vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits))); - - // TODO: Handle INF, NAN, overflow / underflow, etc. 
- - Value* vSgn = FCMP_OLT(vFloat, VIMMED1(0.0f)); - Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty); - Value* vFixed = AND(vFloatInt, VIMMED1((1 << 23) - 1)); - vFixed = OR(vFixed, VIMMED1(1 << 23)); - vFixed = SELECT(vSgn, NEG(vFixed), vFixed); - - Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24)); - vExp = SUB(vExp, VIMMED1(127)); - - Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp); - - fixed = ASHR(vFixed, vExtraBits, name); - } - - return fixed; - } - - Value* Builder::VCVT_FIXED_SI_F32(Value* vFixed, - uint32_t numIntBits, - uint32_t numFracBits, - const llvm::Twine& name) - { - SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values"); - uint32_t extraBits = 32 - numIntBits - numFracBits; - if (numIntBits && extraBits) - { - // Sign extend - Value* shftAmt = VIMMED1(extraBits); - vFixed = ASHR(SHL(vFixed, shftAmt), shftAmt); - } - - Value* fVal = VIMMED1(0.0f); - Value* fFrac = VIMMED1(0.0f); - if (numIntBits) - { - fVal = SI_TO_FP(ASHR(vFixed, VIMMED1(numFracBits)), mSimdFP32Ty, name); - } - - if (numFracBits) - { - fFrac = UI_TO_FP(AND(vFixed, VIMMED1((1 << numFracBits) - 1)), mSimdFP32Ty); - fFrac = FDIV(fFrac, VIMMED1(float(1 << numFracBits)), name); - } - - return FADD(fVal, fFrac, name); - } - - Value* Builder::VCVT_F32_FIXED_UI(Value* vFloat, - uint32_t numIntBits, - uint32_t numFracBits, - const llvm::Twine& name) - { - SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values"); - Value* fixed = nullptr; -#if 1 // KNOB_SIM_FAST_MATH? Below works correctly from a precision - // standpoint... 
- { - fixed = FP_TO_UI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))), - C(_MM_FROUND_TO_NEAREST_INT)), - mSimdInt32Ty); - } -#else - { - // Do round to nearest int on fractional bits first - vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))), - C(_MM_FROUND_TO_NEAREST_INT)); - vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits))); - - // TODO: Handle INF, NAN, overflow / underflow, etc. - - Value* vSgn = FCMP_OLT(vFloat, VIMMED1(0.0f)); - Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty); - Value* vFixed = AND(vFloatInt, VIMMED1((1 << 23) - 1)); - vFixed = OR(vFixed, VIMMED1(1 << 23)); - - Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24)); - vExp = SUB(vExp, VIMMED1(127)); - - Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp); - - fixed = LSHR(vFixed, vExtraBits, name); - } -#endif - return fixed; - } - - Value* Builder::VCVT_FIXED_UI_F32(Value* vFixed, - uint32_t numIntBits, - uint32_t numFracBits, - const llvm::Twine& name) - { - SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values"); - uint32_t extraBits = 32 - numIntBits - numFracBits; - if (numIntBits && extraBits) - { - // Sign extend - Value* shftAmt = VIMMED1(extraBits); - vFixed = ASHR(SHL(vFixed, shftAmt), shftAmt); - } - - Value* fVal = VIMMED1(0.0f); - Value* fFrac = VIMMED1(0.0f); - if (numIntBits) - { - fVal = UI_TO_FP(LSHR(vFixed, VIMMED1(numFracBits)), mSimdFP32Ty, name); - } - - if (numFracBits) - { - fFrac = UI_TO_FP(AND(vFixed, VIMMED1((1 << numFracBits) - 1)), mSimdFP32Ty); - fFrac = FDIV(fFrac, VIMMED1(float(1 << numFracBits)), name); - } - - return FADD(fVal, fFrac, name); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief C functions called by LLVM IR - ////////////////////////////////////////////////////////////////////////// - - Value* Builder::VEXTRACTI128(Value* a, Constant* imm8) - { - bool flag = !imm8->isZeroValue(); - SmallVector<Constant*, 8> idx; - for 
(unsigned i = 0; i < mVWidth / 2; i++) - { - idx.push_back(C(flag ? i + mVWidth / 2 : i)); - } - return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); - } - - Value* Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) - { - bool flag = !imm8->isZeroValue(); - SmallVector<Constant*, 8> idx; - for (unsigned i = 0; i < mVWidth; i++) - { - idx.push_back(C(i)); - } - Value* inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); - - SmallVector<Constant*, 8> idx2; - for (unsigned i = 0; i < mVWidth / 2; i++) - { - idx2.push_back(C(flag ? i : i + mVWidth)); - } - for (unsigned i = mVWidth / 2; i < mVWidth; i++) - { - idx2.push_back(C(flag ? i + mVWidth / 2 : i)); - } - return VSHUFFLE(a, inter, ConstantVector::get(idx2)); - } - - // rdtsc buckets macros - void Builder::RDTSC_START(Value* pBucketMgr, Value* pId) - { - // @todo due to an issue with thread local storage propagation in llvm, we can only safely - // call into buckets framework when single threaded - if (KNOB_SINGLE_THREADED) - { - std::vector<Type*> args{ - PointerType::get(mInt32Ty, 0), // pBucketMgr - mInt32Ty // id - }; - - FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); - Function* pFunc = cast<Function>( -#if LLVM_VERSION_MAJOR >= 9 - JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy).getCallee()); -#else - JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy)); -#endif - if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == - nullptr) - { - sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", - (void*)&BucketManager_StartBucket); - } - - CALL(pFunc, {pBucketMgr, pId}); - } - } - - void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId) - { - // @todo due to an issue with thread local storage propagation in llvm, we can only safely - // call into buckets framework when single threaded - if (KNOB_SINGLE_THREADED) - { - std::vector<Type*> args{ - 
PointerType::get(mInt32Ty, 0), // pBucketMgr - mInt32Ty // id - }; - - FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); - Function* pFunc = cast<Function>( -#if LLVM_VERSION_MAJOR >= 9 - JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy).getCallee()); -#else - JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy)); -#endif - if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == - nullptr) - { - sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", - (void*)&BucketManager_StopBucket); - } - - CALL(pFunc, {pBucketMgr, pId}); - } - } - - uint32_t Builder::GetTypeSize(Type* pType) - { - if (pType->isStructTy()) - { - uint32_t numElems = pType->getStructNumElements(); - Type* pElemTy = pType->getStructElementType(0); - return numElems * GetTypeSize(pElemTy); - } - - if (pType->isArrayTy()) - { - uint32_t numElems = pType->getArrayNumElements(); - Type* pElemTy = pType->getArrayElementType(); - return numElems * GetTypeSize(pElemTy); - } - - if (pType->isIntegerTy()) - { - uint32_t bitSize = pType->getIntegerBitWidth(); - return bitSize / 8; - } - - if (pType->isFloatTy()) - { - return 4; - } - - if (pType->isHalfTy()) - { - return 2; - } - - if (pType->isDoubleTy()) - { - return 8; - } - - SWR_ASSERT(false, "Unimplemented type."); - return 0; - } -} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h deleted file mode 100644 index a7d69eaf9d0..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ /dev/null @@ -1,212 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file builder_misc.h - * - * @brief miscellaneous builder functions - * - * Notes: - * - ******************************************************************************/ -#pragma once - -Constant* C(bool i); -Constant* C(char i); -Constant* C(uint8_t i); -Constant* C(int i); -Constant* C(int64_t i); -Constant* C(uint64_t i); -Constant* C(uint16_t i); -Constant* C(uint32_t i); -Constant* C(float i); - -template <typename Ty> -Constant* C(const std::initializer_list<Ty>& constList) -{ - std::vector<Constant*> vConsts; - for (auto i : constList) - { - vConsts.push_back(C((Ty)i)); - } - return ConstantVector::get(vConsts); -} - -template <typename Ty> -Constant* C(const std::vector<Ty>& constList) -{ - std::vector<Constant*> vConsts; - for (auto i : constList) - { - vConsts.push_back(C((Ty)i)); - } - return ConstantVector::get(vConsts); -} - -template <typename Ty> -Constant* CA(LLVMContext& ctx, ArrayRef<Ty> constList) -{ - return ConstantDataArray::get(ctx, constList); -} - -template <typename Ty> -Constant* CInc(uint32_t base, uint32_t count) -{ - std::vector<Constant*> vConsts; - - for (uint32_t i = 0; i < count; i++) - { - vConsts.push_back(C((Ty)base)); - base++; - } - return ConstantVector::get(vConsts); -} - -Constant* PRED(bool pred); - -Value* VIMMED1(uint64_t i); -Value* VIMMED1_16(uint64_t i); - -Value* VIMMED1(int i); -Value* VIMMED1_16(int i); - -Value* VIMMED1(uint32_t i); -Value* VIMMED1_16(uint32_t i); - -Value* VIMMED1(float i); -Value* VIMMED1_16(float i); - -Value* VIMMED1(bool i); -Value* VIMMED1_16(bool i); - -Value* VUNDEF(Type* t); - -Value* VUNDEF_F(); -Value* VUNDEF_F_16(); - -Value* VUNDEF_I(); -Value* VUNDEF_I_16(); - -Value* VUNDEF(Type* ty, uint32_t size); - -Value* VUNDEF_IPTR(); - -Value* VBROADCAST(Value* src, const llvm::Twine& name = ""); -Value* VBROADCAST_16(Value* src); - -Value* VRCP(Value* va, const llvm::Twine& name = ""); -Value* VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY); - -uint32_t IMMED(Value* 
i); -int32_t S_IMMED(Value* i); - -CallInst* CALL(Value* Callee, const std::initializer_list<Value*>& args, const llvm::Twine& name = ""); -CallInst* CALL(Value* Callee) -{ -#if LLVM_VERSION_MAJOR >= 11 - // Not a great idea - we loose type info (Function) calling CALL - // and then we recast it here. Good for now, but needs to be - // more clean - optimally just always CALL a Function - return CALLA(FunctionCallee(cast<Function>(Callee))); -#else - return CALLA(Callee); -#endif -} -CallInst* CALL(Value* Callee, Value* arg); -CallInst* CALL2(Value* Callee, Value* arg1, Value* arg2); -CallInst* CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3); - -Value* MASK(Value* vmask); -Value* MASK_16(Value* vmask); - -Value* VMASK(Value* mask); -Value* VMASK_16(Value* mask); - -Value* VMOVMSK(Value* mask); - -////////////////////////////////////////////////////////////////////////// -/// @brief Float / Fixed-point conversions -////////////////////////////////////////////////////////////////////////// -// Signed -Value* VCVT_F32_FIXED_SI(Value* vFloat, - uint32_t numIntBits, - uint32_t numFracBits, - const llvm::Twine& name = ""); -Value* VCVT_FIXED_SI_F32(Value* vFixed, - uint32_t numIntBits, - uint32_t numFracBits, - const llvm::Twine& name = ""); -// Unsigned -Value* VCVT_F32_FIXED_UI(Value* vFloat, - uint32_t numIntBits, - uint32_t numFracBits, - const llvm::Twine& name = ""); -Value* VCVT_FIXED_UI_F32(Value* vFixed, - uint32_t numIntBits, - uint32_t numFracBits, - const llvm::Twine& name = ""); - -////////////////////////////////////////////////////////////////////////// -/// @brief functions that build IR to call x86 intrinsics directly, or -/// emulate them with other instructions if not available on the host -////////////////////////////////////////////////////////////////////////// - -Value* EXTRACT_16(Value* x, uint32_t imm); -Value* JOIN_16(Value* a, Value* b); - -Value* PSHUFB(Value* a, Value* b); -Value* PMOVSXBD(Value* a); -Value* PMOVSXWD(Value* a); 
-Value* CVTPH2PS(Value* a, const llvm::Twine& name = ""); -Value* CVTPS2PH(Value* a, Value* rounding); -Value* PMAXSD(Value* a, Value* b); -Value* PMINSD(Value* a, Value* b); -Value* PMAXUD(Value* a, Value* b); -Value* PMINUD(Value* a, Value* b); -Value* VABSPS(Value* a); -Value* FMADDPS(Value* a, Value* b, Value* c); - -Value* ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name = ""); -Value* FCLAMP(Value* src, Value* low, Value* high); -Value* FCLAMP(Value* src, float low, float high); - -CallInst* PRINT(const std::string& printStr); -CallInst* PRINT(const std::string& printStr, const std::initializer_list<Value*>& printArgs); - -Value* VPOPCNT(Value* a); - -Value* INT3() -{ - return DEBUGTRAP(); -} - - -Value* VEXTRACTI128(Value* a, Constant* imm8); -Value* VINSERTI128(Value* a, Value* b, Constant* imm8); - -// rdtsc buckets macros -void RDTSC_START(Value* pBucketMgr, Value* pId); -void RDTSC_STOP(Value* pBucketMgr, Value* pId); - -Value* CreateEntryAlloca(Function* pFunc, Type* pType); -Value* CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize); - -uint32_t GetTypeSize(Type* pType); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp deleted file mode 100644 index bd5f7588c91..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ /dev/null @@ -1,2332 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file fetch_jit.cpp - * - * @brief Implementation of the fetch jitter - * - * Notes: - * - ******************************************************************************/ -#include "jit_pch.hpp" -#include "builder_gfx_mem.h" -#include "jit_api.h" -#include "fetch_jit.h" -#include "gen_state_llvm.h" -#include "functionpasses/passes.h" - -//#define FETCH_DUMP_VERTEX 1 -using namespace llvm; -using namespace SwrJit; - -bool isComponentEnabled(ComponentEnable enableMask, uint8_t component); - -enum ConversionType -{ - CONVERT_NONE, - CONVERT_NORMALIZED, - CONVERT_USCALED, - CONVERT_SSCALED, - CONVERT_SFIXED, -}; - -////////////////////////////////////////////////////////////////////////// -/// Interface to Jitting a fetch shader -////////////////////////////////////////////////////////////////////////// -struct FetchJit : public BuilderGfxMem -{ - FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr), mpFetchInfo(NULL) {} - - Function* Create(const FETCH_COMPILE_STATE& fetchState); - - Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex); - Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex); - Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex); - template <typename T> - Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex); - - // package up Shuffle*bpcGatherd args into a tuple for convenience - typedef std::tuple<Value*&, - Value*, - const Instruction::CastOps, - const ConversionType, - uint32_t&, - uint32_t&, - const ComponentEnable, - const ComponentControl (&)[4], - Value* (&)[4], - const uint32_t (&)[4]> - Shuffle8bpcArgs; - - void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args); - void Shuffle8bpcGatherd(Shuffle8bpcArgs& args); - - typedef std::tuple<Value* (&)[2], - Value*, - const Instruction::CastOps, - const ConversionType, - uint32_t&, - uint32_t&, - const ComponentEnable, - const ComponentControl (&)[4], - Value* (&)[4]> - Shuffle16bpcArgs; - - void Shuffle16bpcGather16(Shuffle16bpcArgs& 
args); - void Shuffle16bpcGather(Shuffle16bpcArgs& args); - - void StoreVertexElements(Value* pVtxOut, - const uint32_t outputElt, - const uint32_t numEltsToStore, - Value* (&vVertexElements)[4]); - - Value* GenerateCompCtrlVector(const ComponentControl ctrl); - - void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState, - Value* streams, - Value* vIndices, - Value* pVtxOut); - - bool IsOddFormat(SWR_FORMAT format); - bool IsUniformFormat(SWR_FORMAT format); - void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]); - void CreateGatherOddFormats( - SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]); - void ConvertFormat(SWR_FORMAT format, Value* texels[4]); - - Value* mpFetchInfo; -}; - -Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) -{ - std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate); - fnName << ComputeCRC(0, &fetchState, sizeof(fetchState)); - - Function* fetch = Function::Create( - JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); - BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch); - - fetch->getParent()->setModuleIdentifier(fetch->getName()); - - IRB()->SetInsertPoint(entry); - - auto argitr = fetch->arg_begin(); - - // Fetch shader arguments - Value* privateContext = &*argitr; - ++argitr; - privateContext->setName("privateContext"); - SetPrivateContext(privateContext); - - mpWorkerData = &*argitr; - ++argitr; - mpWorkerData->setName("pWorkerData"); - - mpFetchInfo = &*argitr; - ++argitr; - mpFetchInfo->setName("fetchInfo"); - Value* pVtxOut = &*argitr; - pVtxOut->setName("vtxOutput"); - - uint32_t baseWidth = mVWidth; - - SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth); - - // Override builder target width to force 16-wide SIMD -#if USE_SIMD16_SHADERS - SetTargetWidth(16); -#endif - - pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0)); - - 
// SWR_FETCH_CONTEXT::pStreams - Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams}); - streams->setName("pStreams"); - - // SWR_FETCH_CONTEXT::pIndices - Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices}); - indices->setName("pIndices"); - - // SWR_FETCH_CONTEXT::pLastIndex - Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex}); - pLastIndex->setName("pLastIndex"); - - Value* vIndices; - switch (fetchState.indexType) - { - case R8_UINT: - indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0)); - if (fetchState.bDisableIndexOOBCheck) - { - vIndices = LOAD( - BITCAST(indices, PointerType::get(getVectorType(mInt8Ty, mpJitMgr->mVWidth), 0)), - {(uint32_t)0}); - vIndices = Z_EXT(vIndices, mSimdInt32Ty); - } - else - { - vIndices = GetSimdValid8bitIndices(indices, pLastIndex); - } - break; - case R16_UINT: - if (fetchState.bDisableIndexOOBCheck) - { - vIndices = LOAD( - BITCAST(indices, PointerType::get(getVectorType(mInt16Ty, mpJitMgr->mVWidth), 0)), - {(uint32_t)0}); - vIndices = Z_EXT(vIndices, mSimdInt32Ty); - } - else - { - vIndices = GetSimdValid16bitIndices(indices, pLastIndex); - } - break; - case R32_UINT: - (fetchState.bDisableIndexOOBCheck) - ? vIndices = LOAD(indices, - "", - PointerType::get(mSimdInt32Ty, 0), - MEM_CLIENT::GFX_MEM_CLIENT_FETCH) - : vIndices = GetSimdValid32bitIndices(indices, pLastIndex); - break; // incoming type is already 32bit int - default: - vIndices = nullptr; - assert(false && "Unsupported index type"); - break; - } - - if (fetchState.bForceSequentialAccessEnable) - { - Value* pOffsets = mVWidth == 8 ? 
C({0, 1, 2, 3, 4, 5, 6, 7}) - : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - - // VertexData buffers are accessed sequentially, the index is equal to the vertex number - vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex})); - vIndices = ADD(vIndices, pOffsets); - } - - Value* vVertexId = vIndices; - if (fetchState.bVertexIDOffsetEnable) - { - // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally - // correct - Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex})); - Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex})); - vVertexId = ADD(vIndices, vBaseVertex); - vVertexId = ADD(vVertexId, vStartVertex); - } - - // store out vertex IDs - if (mVWidth == 16) - { - // store out in simd8 halves until core supports 16-wide natively - auto vVertexIdLo = EXTRACT_16(vVertexId, 0); - auto vVertexIdHi = EXTRACT_16(vVertexId, 1); - STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})); - STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})); - } - else if (mVWidth == 8) - { - STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})); - } - - // store out cut mask if enabled - if (fetchState.bEnableCutIndex) - { - Value* vCutIndex = VIMMED1(fetchState.cutIndex); - Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex)); - - if (mVWidth == 16) - { - auto cutMaskLo = EXTRACT_16(cutMask, 0); - auto cutMaskHi = EXTRACT_16(cutMask, 1); - STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask})); - STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2})); - } - else if (mVWidth == 8) - { - STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask})); - } - } - - // Fetch attributes from memory and output to a simdvertex struct - JitGatherVertices(fetchState, streams, vIndices, pVtxOut); - - RET_VOID(); - - JitManager::DumpToFile(fetch, "src"); - -#if defined(_DEBUG) - 
verifyFunction(*fetch); -#endif - - ::FunctionPassManager setupPasses(JM()->mpCurrentModule); - - ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification) - setupPasses.add(createBreakCriticalEdgesPass()); - setupPasses.add(createCFGSimplificationPass()); - setupPasses.add(createEarlyCSEPass()); - setupPasses.add(createPromoteMemoryToRegisterPass()); - - setupPasses.run(*fetch); - - JitManager::DumpToFile(fetch, "se"); - - ::FunctionPassManager optPasses(JM()->mpCurrentModule); - - ///@todo Haven't touched these either. Need to remove some of these and add others. - optPasses.add(createCFGSimplificationPass()); - optPasses.add(createEarlyCSEPass()); - optPasses.add(createInstructionCombiningPass()); -#if LLVM_VERSION_MAJOR <= 11 - optPasses.add(createConstantPropagationPass()); -#endif - optPasses.add(createSCCPPass()); - optPasses.add(createAggressiveDCEPass()); - - optPasses.run(*fetch); - - optPasses.add(createLowerX86Pass(this)); - optPasses.run(*fetch); - - JitManager::DumpToFile(fetch, "opt"); - - - // Revert 16-wide override -#if USE_SIMD16_SHADERS - SetTargetWidth(baseWidth); -#endif - - return fetch; -} - -// returns true for odd formats that require special state.gather handling -bool FetchJit::IsOddFormat(SWR_FORMAT format) -{ - const SWR_FORMAT_INFO& info = GetFormatInfo(format); - if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64) - { - return true; - } - return false; -} - -// format is uniform if all components are the same size and type -bool FetchJit::IsUniformFormat(SWR_FORMAT format) -{ - const SWR_FORMAT_INFO& info = GetFormatInfo(format); - uint32_t bpc0 = info.bpc[0]; - uint32_t type0 = info.type[0]; - - for (uint32_t c = 1; c < info.numComps; ++c) - { - if (bpc0 != info.bpc[c] || type0 != info.type[c]) - { - return false; - } - } - return true; -} - -// unpacks components based on format -// foreach component in the pixel -// mask off everything but this component -// 
shift component to LSB -void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]) -{ - const SWR_FORMAT_INFO& info = GetFormatInfo(format); - - uint32_t bitOffset = 0; - for (uint32_t c = 0; c < info.numComps; ++c) - { - uint32_t swizzledIndex = info.swizzle[c]; - uint32_t compBits = info.bpc[c]; - uint32_t bitmask = ((1 << compBits) - 1) << bitOffset; - Value* comp = AND(vInput, bitmask); - comp = LSHR(comp, bitOffset); - - result[swizzledIndex] = comp; - bitOffset += compBits; - } -} - -// gather for odd component size formats -// gather SIMD full pixels per lane then shift/mask to move each component to their -// own vector -void FetchJit::CreateGatherOddFormats( - SWR_FORMAT format, Value* pMask, Value* xpBase, Value* pOffsets, Value* pResult[4]) -{ - const SWR_FORMAT_INFO& info = GetFormatInfo(format); - - // only works if pixel size is <= 32bits - SWR_ASSERT(info.bpp <= 32); - - Value* pGather; - if (info.bpp == 32) - { - pGather = - GATHERDD(VIMMED1(0), xpBase, pOffsets, pMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH); - } - else - { - // Can't use 32-bit gather for items less than 32-bits, could cause page faults. 
- Value* pMem = ALLOCA(mSimdInt32Ty); - STORE(VIMMED1(0u), pMem); - - Value* pDstMem = POINTER_CAST(pMem, mInt32PtrTy); - - for (uint32_t lane = 0; lane < mVWidth; ++lane) - { - // Get index - Value* index = VEXTRACT(pOffsets, C(lane)); - Value* mask = VEXTRACT(pMask, C(lane)); - - // use branch around load based on mask - // Needed to avoid page-faults on unmasked lanes - BasicBlock* pCurrentBB = IRB()->GetInsertBlock(); - BasicBlock* pMaskedLoadBlock = - BasicBlock::Create(JM()->mContext, "MaskedLaneLoad", pCurrentBB->getParent()); - BasicBlock* pEndLoadBB = - BasicBlock::Create(JM()->mContext, "AfterMaskedLoad", pCurrentBB->getParent()); - - COND_BR(mask, pMaskedLoadBlock, pEndLoadBB); - - JM()->mBuilder.SetInsertPoint(pMaskedLoadBlock); - - switch (info.bpp) - { - case 8: - { - Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0)); - Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType())); - STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst); - break; - } - - case 16: - { - Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0)); - Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType())); - STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst); - break; - } - break; - - case 24: - { - // First 16-bits of data - Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0)); - Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType())); - STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst); - - // Last 8-bits of data - pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0)); - xpSrc = ADD(xpSrc, C((int64_t)2)); - STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst); - break; - } - - default: - SWR_INVALID("Shouldn't have BPP = %d now", info.bpp); - break; - } - - BR(pEndLoadBB); - JM()->mBuilder.SetInsertPoint(pEndLoadBB); - } - - pGather = LOAD(pMem); - } - - for (uint32_t comp = 0; comp < 4; ++comp) - { 
- pResult[comp] = VIMMED1((int)info.defaults[comp]); - } - - UnpackComponents(format, pGather, pResult); - - // cast to fp32 - pResult[0] = BITCAST(pResult[0], mSimdFP32Ty); - pResult[1] = BITCAST(pResult[1], mSimdFP32Ty); - pResult[2] = BITCAST(pResult[2], mSimdFP32Ty); - pResult[3] = BITCAST(pResult[3], mSimdFP32Ty); -} - -void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4]) -{ - const SWR_FORMAT_INFO& info = GetFormatInfo(format); - - for (uint32_t c = 0; c < info.numComps; ++c) - { - uint32_t compIndex = info.swizzle[c]; - - // skip any conversion on UNUSED components - if (info.type[c] == SWR_TYPE_UNUSED) - { - continue; - } - - if (info.isNormalized[c]) - { - if (info.type[c] == SWR_TYPE_SNORM) - { - /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to - /// -1.0f. - - /// result = c * (1.0f / (2^(n-1) - 1); - uint32_t n = info.bpc[c]; - uint32_t pow2 = 1 << (n - 1); - float scale = 1.0f / (float)(pow2 - 1); - Value* vScale = VIMMED1(scale); - texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty); - texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty); - texels[compIndex] = FMUL(texels[compIndex], vScale); - } - else - { - SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM); - - /// result = c * (1.0f / (2^n - 1)) - uint32_t n = info.bpc[c]; - uint32_t pow2 = 1 << n; - // special case 24bit unorm format, which requires a full divide to meet ULP - // requirement - if (n == 24) - { - float scale = (float)(pow2 - 1); - Value* vScale = VIMMED1(scale); - texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty); - texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty); - texels[compIndex] = FDIV(texels[compIndex], vScale); - } - else - { - float scale = 1.0f / (float)(pow2 - 1); - Value* vScale = VIMMED1(scale); - texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty); - texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty); - texels[compIndex] = FMUL(texels[compIndex], vScale); - } - } - 
continue; - } - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Loads attributes from memory using AVX2 GATHER(s) -/// @param fetchState - info about attributes to be fetched from memory -/// @param streams - value pointer to the current vertex stream -/// @param vIndices - vector value of indices to gather -/// @param pVtxOut - value pointer to output simdvertex struct -void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState, - Value* streams, - Value* vIndices, - Value* pVtxOut) -{ - uint32_t currentVertexElement = 0; - uint32_t outputElt = 0; - Value* vVertexElements[4]; - - Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); - Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance}); - Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}); - Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex})); - curInstance->setName("curInstance"); - - for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1) - { - const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt]; - - // skip element if all components are disabled - if (ied.ComponentPacking == ComponentEnable::NONE) - { - continue; - } - - const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format); - SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices."); - uint32_t bpc = - info.bpp / - info.numComps; ///@todo Code below assumes all components are same size. Need to fix. 
- - Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData}); - - Value* stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); - Value* vStride = VBROADCAST(stride); - - // max vertex index that is fully in bounds - Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)}); - maxVertex = LOAD(maxVertex); - - Value* minVertex = NULL; - if (fetchState.bPartialVertexBuffer) - { - // min vertex index for low bounds OOB checking - minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)}); - minVertex = LOAD(minVertex); - } - - if (fetchState.bInstanceIDOffsetEnable) - { - // the InstanceID (curInstance) value is offset by StartInstanceLocation - curInstance = ADD(curInstance, startInstance); - } - - Value* vCurIndices; - Value* startOffset; - Value* vInstanceStride = VIMMED1(0); - - if (ied.InstanceEnable) - { - Value* stepRate = C(ied.InstanceAdvancementState); - - // prevent a div by 0 for 0 step rate - Value* isNonZeroStep = ICMP_UGT(stepRate, C(0)); - stepRate = SELECT(isNonZeroStep, stepRate, C(1)); - - // calc the current offset into instanced data buffer - Value* calcInstance = UDIV(curInstance, stepRate); - - // if step rate is 0, every instance gets instance 0 - calcInstance = SELECT(isNonZeroStep, calcInstance, C(0)); - - vCurIndices = VBROADCAST(calcInstance); - startOffset = startInstance; - } - else if (ied.InstanceStrideEnable) - { - // grab the instance advancement state, determines stride in bytes from one instance to - // the next - Value* stepRate = C(ied.InstanceAdvancementState); - vInstanceStride = VBROADCAST(MUL(curInstance, stepRate)); - - // offset indices by baseVertex - vCurIndices = ADD(vIndices, vBaseVertex); - - startOffset = startVertex; - SWR_ASSERT((0), "TODO: Fill out more once driver sends this down."); - } - else - { - // offset indices by baseVertex - vCurIndices = ADD(vIndices, vBaseVertex); - startOffset = startVertex; - } - - // 
All of the OOB calculations are in vertices, not VB offsets, to prevent having to - // do 64bit address offset calculations. - - // calculate byte offset to the start of the VB - Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty)); - - // VGATHER* takes an *i8 src pointer so that's what stream is - Value* pStreamBaseGFX = ADD(stream, baseOffset); - - // if we have a start offset, subtract from max vertex. Used for OOB check - maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); - Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0)); - // if we have a negative value, we're already OOB. clamp at 0. - maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty)); - - if (fetchState.bPartialVertexBuffer) - { - // similary for min vertex - minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); - Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0)); - minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty)); - } - - // Load the in bounds size of a partially valid vertex - Value* partialInboundsSize = - GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)}); - partialInboundsSize = LOAD(partialInboundsSize); - Value* vPartialVertexSize = VBROADCAST(partialInboundsSize); - Value* vBpp = VBROADCAST(C(info.Bpp)); - Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset)); - - // is the element is <= the partially valid size - Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets)); - - // override cur indices with 0 if pitch is 0 - Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); - vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices); - - // are vertices partially OOB? - Value* vMaxVertex = VBROADCAST(maxVertex); - Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex); - - // are vertices fully in bounds? 
- Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex); - - Value* vGatherMask; - if (fetchState.bPartialVertexBuffer) - { - // are vertices below minVertex limit? - Value* vMinVertex = VBROADCAST(minVertex); - Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex); - - // only fetch lanes that pass both tests - vGatherMask = AND(vMaxGatherMask, vMinGatherMask); - } - else - { - vGatherMask = vMaxGatherMask; - } - - // blend in any partially OOB indices that have valid elements - vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask); - - // calculate the actual offsets into the VB - Value* vOffsets = MUL(vCurIndices, vStride); - vOffsets = ADD(vOffsets, vAlignmentOffsets); - - // if instance stride enable is: - // true - add product of the instanceID and advancement state to the offset into the VB - // false - value of vInstanceStride has been initialized to zero - vOffsets = ADD(vOffsets, vInstanceStride); - - // Packing and component control - ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking; - const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0, - (ComponentControl)ied.ComponentControl1, - (ComponentControl)ied.ComponentControl2, - (ComponentControl)ied.ComponentControl3}; - - // Special gather/conversion for formats without equal component sizes - if (IsOddFormat((SWR_FORMAT)ied.Format)) - { - Value* pResults[4]; - CreateGatherOddFormats( - (SWR_FORMAT)ied.Format, vGatherMask, pStreamBaseGFX, vOffsets, pResults); - ConvertFormat((SWR_FORMAT)ied.Format, pResults); - - for (uint32_t c = 0; c < 4; c += 1) - { - if (isComponentEnabled(compMask, c)) - { - vVertexElements[currentVertexElement++] = pResults[c]; - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 0; - } - } - } - } - else if (info.type[0] == SWR_TYPE_FLOAT) - { - ///@todo: support 64 bit vb accesses - Value* gatherSrc 
= VIMMED1(0.0f); - - SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), - "Unsupported format for standard gather fetch."); - - // Gather components from memory to store in a simdvertex structure - switch (bpc) - { - case 16: - { - Value* vGatherResult[2]; - - // if we have at least one component out of x or y to fetch - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) - { - vGatherResult[0] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - } - - // if we have at least one component out of z or w to fetch - if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) - { - // offset base to the next components(zw) in the vertex to gather - pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4)); - - vGatherResult[1] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH); - // e.g. result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } - - // if we have at least one component to shuffle into place - if (compMask) - { - Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, - pVtxOut, - Instruction::CastOps::FPExt, - CONVERT_NONE, - currentVertexElement, - outputElt, - compMask, - compCtrl, - vVertexElements); - - // Shuffle gathered components into place in simdvertex struct - mVWidth == 16 ? 
Shuffle16bpcGather16(args) - : Shuffle16bpcGather(args); // outputs to vVertexElements ref - } - } - break; - case 32: - { - for (uint32_t i = 0; i < 4; i += 1) - { - if (isComponentEnabled(compMask, i)) - { - // if we need to gather the component - if (compCtrl[i] == StoreSrc) - { - // Gather a SIMD of vertices - // APIs allow a 4GB range for offsets - // However, GATHERPS uses signed 32-bit offsets, so +/- 2GB range :( - // Add 2GB to the base pointer and 2GB to the offsets. This makes - // "negative" (large) offsets into positive offsets and small offsets - // into negative offsets. - Value* vNewOffsets = ADD(vOffsets, VIMMED1(0x80000000)); - vVertexElements[currentVertexElement++] = - GATHERPS(gatherSrc, - ADD(pStreamBaseGFX, C((uintptr_t)0x80000000U)), - vNewOffsets, - vGatherMask, - 1, - MEM_CLIENT::GFX_MEM_CLIENT_FETCH); - } - else - { - vVertexElements[currentVertexElement++] = - GenerateCompCtrlVector(compCtrl[i]); - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 0; - } - } - - // offset base to the next component in the vertex to gather - pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4)); - } - } - break; - case 64: - { - for (uint32_t i = 0; i < 4; i += 1) - { - if (isComponentEnabled(compMask, i)) - { - // if we need to gather the component - if (compCtrl[i] == StoreSrc) - { - Value* vShufLo; - Value* vShufHi; - Value* vShufAll; - - if (mVWidth == 8) - { - vShufLo = C({0, 1, 2, 3}); - vShufHi = C({4, 5, 6, 7}); - vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7}); - } - else - { - SWR_ASSERT(mVWidth == 16); - vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7}); - vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15}); - vShufAll = - C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - } - - Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo); - Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi); - - Value* vOffsetsLo = 
VSHUFFLE(vOffsets, vOffsets, vShufLo); - Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi); - - Value* vZeroDouble = VECTOR_SPLAT( - mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f)); - - Value* pGatherLo = - GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsLo, vMaskLo); - Value* pGatherHi = - GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsHi, vMaskHi); - - Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll); - pGather = FP_TRUNC(pGather, mSimdFP32Ty); - - vVertexElements[currentVertexElement++] = pGather; - } - else - { - vVertexElements[currentVertexElement++] = - GenerateCompCtrlVector(compCtrl[i]); - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 0; - } - } - - // offset base to the next component in the vertex to gather - pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)8)); - } - } - break; - default: - SWR_INVALID("Tried to fetch invalid FP format"); - break; - } - } - else - { - Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd; - ConversionType conversionType = CONVERT_NONE; - - SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), - "Unsupported format for standard gather fetch."); - - switch (info.type[0]) - { - case SWR_TYPE_UNORM: - conversionType = CONVERT_NORMALIZED; - case SWR_TYPE_UINT: - extendCastType = Instruction::CastOps::ZExt; - break; - case SWR_TYPE_SNORM: - conversionType = CONVERT_NORMALIZED; - case SWR_TYPE_SINT: - extendCastType = Instruction::CastOps::SExt; - break; - case SWR_TYPE_USCALED: - conversionType = CONVERT_USCALED; - extendCastType = Instruction::CastOps::UIToFP; - break; - case SWR_TYPE_SSCALED: - conversionType = CONVERT_SSCALED; - extendCastType = Instruction::CastOps::SIToFP; - break; - case SWR_TYPE_SFIXED: - conversionType = CONVERT_SFIXED; - extendCastType = Instruction::CastOps::SExt; - break; - default: - break; - } - - // value substituted when 
component of gather is masked - Value* gatherSrc = VIMMED1(0); - - // Gather components from memory to store in a simdvertex structure - switch (bpc) - { - case 8: - { - // if we have at least one component to fetch - if (compMask) - { - Value* vGatherResult = GATHERDD(gatherSrc, - pStreamBaseGFX, - vOffsets, - vGatherMask, - 1, - MEM_CLIENT::GFX_MEM_CLIENT_FETCH); - // e.g. result of an 8x32bit integer gather for 8bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw - - Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, - pVtxOut, - extendCastType, - conversionType, - currentVertexElement, - outputElt, - compMask, - compCtrl, - vVertexElements, - info.swizzle); - - // Shuffle gathered components into place in simdvertex struct - mVWidth == 16 ? Shuffle8bpcGatherd16(args) - : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref - } - } - break; - case 16: - { - Value* vGatherResult[2]; - - // if we have at least one component out of x or y to fetch - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) - { - vGatherResult[0] = GATHERDD(gatherSrc, - pStreamBaseGFX, - vOffsets, - vGatherMask, - 1, - MEM_CLIENT::GFX_MEM_CLIENT_FETCH); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - } - - // if we have at least one component out of z or w to fetch - if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) - { - // offset base to the next components(zw) in the vertex to gather - pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4)); - - vGatherResult[1] = GATHERDD(gatherSrc, - pStreamBaseGFX, - vOffsets, - vGatherMask, - 1, - MEM_CLIENT::GFX_MEM_CLIENT_FETCH); - // e.g. 
result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } - - // if we have at least one component to shuffle into place - if (compMask) - { - Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, - pVtxOut, - extendCastType, - conversionType, - currentVertexElement, - outputElt, - compMask, - compCtrl, - vVertexElements); - - // Shuffle gathered components into place in simdvertex struct - mVWidth == 16 ? Shuffle16bpcGather16(args) - : Shuffle16bpcGather(args); // outputs to vVertexElements ref - } - } - break; - case 32: - { - // Gathered components into place in simdvertex struct - for (uint32_t i = 0; i < 4; i++) - { - if (isComponentEnabled(compMask, i)) - { - // if we need to gather the component - if (compCtrl[i] == StoreSrc) - { - Value* pGather = GATHERDD(gatherSrc, - pStreamBaseGFX, - vOffsets, - vGatherMask, - 1, - MEM_CLIENT::GFX_MEM_CLIENT_FETCH); - - if (conversionType == CONVERT_USCALED) - { - pGather = UI_TO_FP(pGather, mSimdFP32Ty); - } - else if (conversionType == CONVERT_SSCALED) - { - pGather = SI_TO_FP(pGather, mSimdFP32Ty); - } - else if (conversionType == CONVERT_SFIXED) - { - pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), - VBROADCAST(C(1 / 65536.0f))); - } - - vVertexElements[currentVertexElement++] = pGather; - - // e.g. 
result of a single 8x32bit integer gather for 32bit components - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx - } - else - { - vVertexElements[currentVertexElement++] = - GenerateCompCtrlVector(compCtrl[i]); - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - - // reset to the next vVertexElement to output - currentVertexElement = 0; - } - } - - // offset base to the next component in the vertex to gather - pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4)); - } - } - break; - } - } - } - - // if we have a partially filled vVertexElement struct, output it - if (currentVertexElement > 0) - { - StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements); - } -} - - -typedef void* (*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va, bool* out_pbNullTileAccessed, void* pWorkerData); - -template <typename T> -void GetSimdValidIndicesGfx(gfxptr_t indices, - gfxptr_t lastIndex, - uint32_t vWidth, - PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, - void* pdc, - uint32_t* outIndices, - void* pWorkerData) -{ - SWR_ASSERT(outIndices != nullptr); - - gfxptr_t indexPtr = indices; - for (int64_t lane = 0; lane < vWidth; lane++) - { - uint32_t index = 0; - - if (indexPtr < lastIndex) - { - // translate indexPtr and load from it - T* addr = (T*)pfnTranslate(pdc, indexPtr, nullptr, pWorkerData); - SWR_ASSERT(addr != nullptr); - index = *addr; - } - - // index to 32 bits and insert into the correct simd lane - outIndices[lane] = index; - - indexPtr += sizeof(T); - } -} - -void GetSimdValid8bitIndicesGfx(gfxptr_t indices, - gfxptr_t lastIndex, - uint32_t vWidth, - PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, - void* pdc, - uint32_t* outIndices, - void* pWorkerData) -{ - GetSimdValidIndicesGfx<uint8_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData); -} - -void GetSimdValid16bitIndicesGfx(gfxptr_t indices, - gfxptr_t lastIndex, - uint32_t vWidth, - 
PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, - void* pdc, - uint32_t* outIndices, - void* pWorkerData) -{ - GetSimdValidIndicesGfx<uint16_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData); -} - - -template <typename T> -Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex) -{ - SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty, - "Function expects gfxptr_t for both input parameters."); - - Type* Ty = nullptr; - - static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t), - "Unsupported type for use with GetSimdValidIndicesHelper<T>"); - constexpr bool bSize = (sizeof(T) == sizeof(uint16_t)); - if (bSize) - { - Ty = mInt16PtrTy; - } - else if (sizeof(T) == sizeof(uint8_t)) - { - Ty = mInt8PtrTy; - } - else - { - SWR_ASSERT(false, "This should never happen as per static_assert above."); - } - - Value* vIndices = VUNDEF_I(); - - { - // store 0 index on stack to be used to conditionally load from if index address is OOB - Value* pZeroIndex = ALLOCA(Ty->getPointerElementType()); - STORE(C((T)0), pZeroIndex); - - // Load a SIMD of index pointers - for (int64_t lane = 0; lane < mVWidth; lane++) - { - // Calculate the address of the requested index - Value* pIndex = GEP(pIndices, C(lane), Ty); - - pLastIndex = INT_TO_PTR(pLastIndex, Ty); - - // check if the address is less than the max index, - Value* mask = ICMP_ULT(pIndex, pLastIndex); - - // if valid, load the index. if not, load 0 from the stack - Value* pValid = SELECT(mask, pIndex, pZeroIndex); - Value* index = LOAD(pValid, "valid index", Ty, MEM_CLIENT::GFX_MEM_CLIENT_FETCH); - - // zero extended index to 32 bits and insert into the correct simd lane - index = Z_EXT(index, mInt32Ty); - vIndices = VINSERT(vIndices, index, lane); - } - } - - return vIndices; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Loads a simd of valid indices. 
OOB indices are set to 0 -/// *Note* have to do 8bit index checking in scalar until we have AVX-512 -/// support -/// @param pIndices - pointer to 8 bit indices -/// @param pLastIndex - pointer to last valid index -Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex) -{ - return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Loads a simd of valid indices. OOB indices are set to 0 -/// *Note* have to do 16bit index checking in scalar until we have AVX-512 -/// support -/// @param pIndices - pointer to 16 bit indices -/// @param pLastIndex - pointer to last valid index -Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex) -{ - return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Loads a simd of valid indices. OOB indices are set to 0 -/// @param pIndices - pointer to 32 bit indices -/// @param pLastIndex - pointer to last valid index -Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex) -{ - DataLayout dL(JM()->mpCurrentModule); - Value* iLastIndex = pLastIndex; - Value* iIndices = pIndices; - - // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index) - Value* numIndicesLeft = SUB(iLastIndex, iIndices); - numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty); - numIndicesLeft = SDIV(numIndicesLeft, C(4)); - - // create a vector of index counts from the base index ptr passed into the fetch - Constant* vIndexOffsets; - if (mVWidth == 8) - { - vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7}); - } - else - { - vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - } - - // compare index count to the max valid index - // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load - // vIndexOffsets 0 1 2 3 4 5 6 7 - // ------------------------------ - // 
vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass - // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0 - Value* vMaxIndex = VBROADCAST(numIndicesLeft); - Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets); - - // Load the indices; OOB loads 0 - return MASKED_LOAD(pIndices, - 4, - vIndexMask, - VIMMED1(0), - "vIndices", - PointerType::get(mSimdInt32Ty, 0), - MEM_CLIENT::GFX_MEM_CLIENT_FETCH); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends, -/// denormalizes if needed, converts to F32 if needed, and positions in -// the proper SIMD rows to be output to the simdvertex structure -/// @param args: (tuple of args, listed below) -/// @param vGatherResult - 8 gathered 8bpc vertices -/// @param pVtxOut - base pointer to output simdvertex struct -/// @param extendType - sign extend or zero extend -/// @param bNormalized - do we need to denormalize? -/// @param currentVertexElement - reference to the current vVertexElement -/// @param outputElt - reference to the current offset from simdvertex we're o -/// @param compMask - component packing mask -/// @param compCtrl - component control val -/// @param vVertexElements[4] - vertex components to output -/// @param swizzle[4] - component swizzle location -void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args) -{ - // Unpack tuple args - Value*& vGatherResult = std::get<0>(args); - Value* pVtxOut = std::get<1>(args); - const Instruction::CastOps extendType = std::get<2>(args); - const ConversionType conversionType = std::get<3>(args); - uint32_t& currentVertexElement = std::get<4>(args); - uint32_t& outputElt = std::get<5>(args); - const ComponentEnable compMask = std::get<6>(args); - const ComponentControl(&compCtrl)[4] = std::get<7>(args); - Value*(&vVertexElements)[4] = std::get<8>(args); - const uint32_t(&swizzle)[4] = std::get<9>(args); - - // cast types - Type* vGatherTy = getVectorType(mInt32Ty, 8); - 
Type* v32x8Ty = getVectorType(mInt8Ty, 32); - - // have to do extra work for sign extending - if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)) - { - Type* v16x8Ty = getVectorType(mInt8Ty, 16); // 8x16bit ints in a 128bit lane - Type* v128Ty = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2); - - // shuffle mask, including any swizzling - const char x = (char)swizzle[0]; - const char y = (char)swizzle[1]; - const char z = (char)swizzle[2]; - const char w = (char)swizzle[3]; - Value* vConstMask = C<char>( - {char(x), char(x + 4), char(x + 8), char(x + 12), char(y), char(y + 4), - char(y + 8), char(y + 12), char(z), char(z + 4), char(z + 8), char(z + 12), - char(w), char(w + 4), char(w + 8), char(w + 12), char(x), char(x + 4), - char(x + 8), char(x + 12), char(y), char(y + 4), char(y + 8), char(y + 12), - char(z), char(z + 4), char(z + 8), char(z + 12), char(w), char(w + 4), - char(w + 8), char(w + 12)}); - - // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. 
- - Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0); - Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1); - - Value* vShufResult_lo = - BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); - Value* vShufResult_hi = - BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); - - // after pshufb: group components together in each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww - - Value* vi128XY_lo = nullptr; - Value* vi128XY_hi = nullptr; - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) - { - vi128XY_lo = BITCAST( - VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), - v128Ty); - vi128XY_hi = BITCAST( - VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), - v128Ty); - - // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) - } - - // do the same for zw components - Value* vi128ZW_lo = nullptr; - Value* vi128ZW_hi = nullptr; - if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) - { - vi128ZW_lo = BITCAST( - VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), - v128Ty); - vi128ZW_hi = BITCAST( - VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), - v128Ty); - } - - // init denormalize variables if needed - Instruction::CastOps fpCast; - Value* conversionFactor; - - switch (conversionType) - { - case CONVERT_NORMALIZED: - fpCast = Instruction::CastOps::SIToFP; - conversionFactor = VIMMED1((float)(1.0 / 127.0)); - break; - case CONVERT_SSCALED: - fpCast = Instruction::CastOps::SIToFP; - conversionFactor = VIMMED1((float)(1.0)); - break; - case CONVERT_USCALED: - assert(false && "Type should not be sign extended!"); - conversionFactor = nullptr; - break; - default: - assert(conversionType == CONVERT_NONE); - 
conversionFactor = nullptr; - break; - } - - // sign extend all enabled components. If we have a fill vVertexElements, output to current - // simdvertex - for (uint32_t i = 0; i < 4; i++) - { - if (isComponentEnabled(compMask, i)) - { - if (compCtrl[i] == ComponentControl::StoreSrc) - { - // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 - uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; - Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi; - - // sign extend - Value* temp_lo = - PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty)); - Value* temp_hi = - PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty)); - - Value* temp = JOIN_16(temp_lo, temp_hi); - - // denormalize if needed - if (conversionType != CONVERT_NONE) - { - temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor); - } - - vVertexElements[currentVertexElement] = temp; - - currentVertexElement += 1; - } - else - { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 0; - } - } - } - } - // else zero extend - else if ((extendType == Instruction::CastOps::ZExt) || - (extendType == Instruction::CastOps::UIToFP)) - { - // init denormalize variables if needed - Instruction::CastOps fpCast; - Value* conversionFactor; - - switch (conversionType) - { - case CONVERT_NORMALIZED: - fpCast = Instruction::CastOps::UIToFP; - conversionFactor = VIMMED1((float)(1.0 / 255.0)); - break; - case CONVERT_USCALED: - fpCast = Instruction::CastOps::UIToFP; - conversionFactor = VIMMED1((float)(1.0)); - break; - case CONVERT_SSCALED: - assert(false && "Type should not be zero extended!"); - conversionFactor = nullptr; - break; - 
default: - assert(conversionType == CONVERT_NONE); - conversionFactor = nullptr; - break; - } - - // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits - for (uint32_t i = 0; i < 4; i++) - { - if (isComponentEnabled(compMask, i)) - { - if (compCtrl[i] == ComponentControl::StoreSrc) - { - // pshufb masks for each component - Value* vConstMask; - switch (swizzle[i]) - { - case 0: - // x shuffle mask - vConstMask = - C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, - 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); - break; - case 1: - // y shuffle mask - vConstMask = - C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, - 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); - break; - case 2: - // z shuffle mask - vConstMask = - C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, - 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); - break; - case 3: - // w shuffle mask - vConstMask = - C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, - 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); - break; - default: - assert(false && "Invalid component"); - vConstMask = nullptr; - break; - } - - Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0); - Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1); - - Value* temp_lo = - BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); - Value* temp_hi = - BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); - - // after pshufb for x channel - // 256i - 0 1 2 3 4 5 6 7 - // x000 x000 x000 x000 x000 x000 x000 x000 - - Value* temp = JOIN_16(temp_lo, temp_hi); - - // denormalize if needed - if (conversionType != CONVERT_NONE) - { - temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor); - } - - vVertexElements[currentVertexElement] = temp; - - currentVertexElement += 1; - } - else - { - 
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 0; - } - } - } - } - else - { - SWR_INVALID("Unsupported conversion type"); - } -} - -void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args) -{ - // Unpack tuple args - Value*& vGatherResult = std::get<0>(args); - Value* pVtxOut = std::get<1>(args); - const Instruction::CastOps extendType = std::get<2>(args); - const ConversionType conversionType = std::get<3>(args); - uint32_t& currentVertexElement = std::get<4>(args); - uint32_t& outputElt = std::get<5>(args); - const ComponentEnable compMask = std::get<6>(args); - const ComponentControl(&compCtrl)[4] = std::get<7>(args); - Value*(&vVertexElements)[4] = std::get<8>(args); - const uint32_t(&swizzle)[4] = std::get<9>(args); - - // cast types - Type* v32x8Ty = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits - - for (uint32_t i = 0; i < 4; i++) - { - if (!isComponentEnabled(compMask, i)) - continue; - - if (compCtrl[i] == ComponentControl::StoreSrc) - { -#if LLVM_VERSION_MAJOR >= 11 - using MaskType = int32_t; -#else - using MaskType = uint32_t; -#endif - std::vector<MaskType> vShuffleMasks[4] = { - {0, 4, 8, 12, 16, 20, 24, 28}, // x - {1, 5, 9, 13, 17, 21, 25, 29}, // y - {2, 6, 10, 14, 18, 22, 26, 30}, // z - {3, 7, 11, 15, 19, 23, 27, 31}, // w - }; - - Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty), - UndefValue::get(v32x8Ty), - vShuffleMasks[swizzle[i]]); - - if ((extendType == Instruction::CastOps::SExt) || - (extendType == Instruction::CastOps::SIToFP)) - { - switch (conversionType) - { - case CONVERT_NORMALIZED: - val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0))); - break; - case CONVERT_SSCALED: - val = SI_TO_FP(val, mSimdFP32Ty); - break; - case CONVERT_USCALED: - SWR_INVALID("Type should not be sign 
extended!"); - break; - default: - SWR_ASSERT(conversionType == CONVERT_NONE); - val = S_EXT(val, mSimdInt32Ty); - break; - } - } - else if ((extendType == Instruction::CastOps::ZExt) || - (extendType == Instruction::CastOps::UIToFP)) - { - switch (conversionType) - { - case CONVERT_NORMALIZED: - val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0))); - break; - case CONVERT_SSCALED: - SWR_INVALID("Type should not be zero extended!"); - break; - case CONVERT_USCALED: - val = UI_TO_FP(val, mSimdFP32Ty); - break; - default: - SWR_ASSERT(conversionType == CONVERT_NONE); - val = Z_EXT(val, mSimdInt32Ty); - break; - } - } - else - { - SWR_INVALID("Unsupported conversion type"); - } - - vVertexElements[currentVertexElement++] = val; - } - else - { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 0; - } - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, -/// denormalizes if needed, converts to F32 if needed, and positions in -// the proper SIMD rows to be output to the simdvertex structure -/// @param args: (tuple of args, listed below) -/// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index -/// @param pVtxOut - base pointer to output simdvertex struct -/// @param extendType - sign extend or zero extend -/// @param bNormalized - do we need to denormalize? 
-/// @param currentVertexElement - reference to the current vVertexElement -/// @param outputElt - reference to the current offset from simdvertex we're o -/// @param compMask - component packing mask -/// @param compCtrl - component control val -/// @param vVertexElements[4] - vertex components to output -void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args) -{ - // Unpack tuple args - Value*(&vGatherResult)[2] = std::get<0>(args); - Value* pVtxOut = std::get<1>(args); - const Instruction::CastOps extendType = std::get<2>(args); - const ConversionType conversionType = std::get<3>(args); - uint32_t& currentVertexElement = std::get<4>(args); - uint32_t& outputElt = std::get<5>(args); - const ComponentEnable compMask = std::get<6>(args); - const ComponentControl(&compCtrl)[4] = std::get<7>(args); - Value*(&vVertexElements)[4] = std::get<8>(args); - - // cast types - Type* vGatherTy = getVectorType(mInt32Ty, 8); - Type* v32x8Ty = getVectorType(mInt8Ty, 32); - - // have to do extra work for sign extending - if ((extendType == Instruction::CastOps::SExt) || - (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt)) - { - // is this PP float? - bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false; - - Type* v8x16Ty = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane - Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2); - - // shuffle mask - Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); - Value* vi128XY_lo = nullptr; - Value* vi128XY_hi = nullptr; - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) - { - // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for - // now.. 
- - Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty); - Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty); - - Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy); - Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy); - - // after pshufb: group components together in each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy - - vi128XY_lo = BITCAST( - VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), - v128bitTy); - vi128XY_hi = BITCAST( - VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), - v128bitTy); - - // after PERMD: move and pack xy components into each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy - } - - // do the same for zw components - Value* vi128ZW_lo = nullptr; - Value* vi128ZW_hi = nullptr; - if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) - { - Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty); - Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty); - - Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy); - Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy); - - vi128ZW_lo = BITCAST( - VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), - v128bitTy); - vi128ZW_hi = BITCAST( - VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), - v128bitTy); - } - - // init denormalize variables if needed - Instruction::CastOps IntToFpCast; - Value* conversionFactor; - - switch (conversionType) - { - case CONVERT_NORMALIZED: - IntToFpCast = Instruction::CastOps::SIToFP; - conversionFactor = VIMMED1((float)(1.0 / 32767.0)); - break; - case CONVERT_SSCALED: - IntToFpCast = Instruction::CastOps::SIToFP; - conversionFactor = VIMMED1((float)(1.0)); 
- break; - case CONVERT_USCALED: - assert(false && "Type should not be sign extended!"); - conversionFactor = nullptr; - break; - default: - assert(conversionType == CONVERT_NONE); - conversionFactor = nullptr; - break; - } - - // sign extend all enabled components. If we have a fill vVertexElements, output to current - // simdvertex - for (uint32_t i = 0; i < 4; i++) - { - if (isComponentEnabled(compMask, i)) - { - if (compCtrl[i] == ComponentControl::StoreSrc) - { - // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 - uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; - Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi; - - if (bFP) - { - // extract 128 bit lanes to sign extend each component - Value* temp_lo = - CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); - Value* temp_hi = - CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); - - vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); - } - else - { - // extract 128 bit lanes to sign extend each component - Value* temp_lo = - PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); - Value* temp_hi = - PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); - - Value* temp = JOIN_16(temp_lo, temp_hi); - - // denormalize if needed - if (conversionType != CONVERT_NONE) - { - temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor); - } - - vVertexElements[currentVertexElement] = temp; - } - - currentVertexElement += 1; - } - else - { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 0; - } - } - } - } - // else zero extend - else if ((extendType == 
Instruction::CastOps::ZExt) || - (extendType == Instruction::CastOps::UIToFP)) - { - // pshufb masks for each component - Value* vConstMask[2]; - - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) - { - // x/z shuffle mask - vConstMask[0] = C<char>({ - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - }); - } - - if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) - { - // y/w shuffle mask - vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, - 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); - } - - // init denormalize variables if needed - Instruction::CastOps fpCast; - Value* conversionFactor; - - switch (conversionType) - { - case CONVERT_NORMALIZED: - fpCast = Instruction::CastOps::UIToFP; - conversionFactor = VIMMED1((float)(1.0 / 65535.0)); - break; - case CONVERT_USCALED: - fpCast = Instruction::CastOps::UIToFP; - conversionFactor = VIMMED1((float)(1.0f)); - break; - case CONVERT_SSCALED: - SWR_INVALID("Type should not be zero extended!"); - conversionFactor = nullptr; - break; - default: - SWR_ASSERT(conversionType == CONVERT_NONE); - conversionFactor = nullptr; - break; - } - - // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits - for (uint32_t i = 0; i < 4; i++) - { - if (isComponentEnabled(compMask, i)) - { - if (compCtrl[i] == ComponentControl::StoreSrc) - { - // select correct constMask for x/z or y/w pshufb - uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - uint32_t selectedGather = (i < 2) ? 0 : 1; - - // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, - // for now.. 
- - Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0); - Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1); - - Value* temp_lo = BITCAST( - PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), - vGatherTy); - Value* temp_hi = BITCAST( - PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), - vGatherTy); - - // after pshufb mask for x channel; z uses the same shuffle from the second - // gather 256i - 0 1 2 3 4 5 6 7 - // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 - - Value* temp = JOIN_16(temp_lo, temp_hi); - - // denormalize if needed - if (conversionType != CONVERT_NONE) - { - temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor); - } - - vVertexElements[currentVertexElement] = temp; - - currentVertexElement += 1; - } - else - { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 0; - } - } - } - } - else - { - SWR_INVALID("Unsupported conversion type"); - } -} - -void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args) -{ - // Unpack tuple args - Value*(&vGatherResult)[2] = std::get<0>(args); - Value* pVtxOut = std::get<1>(args); - const Instruction::CastOps extendType = std::get<2>(args); - const ConversionType conversionType = std::get<3>(args); - uint32_t& currentVertexElement = std::get<4>(args); - uint32_t& outputElt = std::get<5>(args); - const ComponentEnable compMask = std::get<6>(args); - const ComponentControl(&compCtrl)[4] = std::get<7>(args); - Value*(&vVertexElements)[4] = std::get<8>(args); - - // cast types - Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type* v32x8Ty = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits - - // have to do extra work for sign extending - if ((extendType == 
Instruction::CastOps::SExt) || - (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt)) - { - // is this PP float? - bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false; - - Type* v8x16Ty = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane - Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), - mVWidth / 4); // vwidth is units of 32 bits - - // shuffle mask - Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); - Value* vi128XY = nullptr; - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) - { - Value* vShufResult = - BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy); - // after pshufb: group components together in each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy - - vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); - // after PERMD: move and pack xy components into each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy - } - - // do the same for zw components - Value* vi128ZW = nullptr; - if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) - { - Value* vShufResult = - BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy); - vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); - } - - // init denormalize variables if needed - Instruction::CastOps IntToFpCast; - Value* conversionFactor; - - switch (conversionType) - { - case CONVERT_NORMALIZED: - IntToFpCast = Instruction::CastOps::SIToFP; - conversionFactor = VIMMED1((float)(1.0 / 32767.0)); - break; - case CONVERT_SSCALED: - IntToFpCast = Instruction::CastOps::SIToFP; - conversionFactor = VIMMED1((float)(1.0)); - break; - case CONVERT_USCALED: - SWR_INVALID("Type should not be sign extended!"); - 
conversionFactor = nullptr; - break; - default: - SWR_ASSERT(conversionType == CONVERT_NONE); - conversionFactor = nullptr; - break; - } - - // sign extend all enabled components. If we have a fill vVertexElements, output to current - // simdvertex - for (uint32_t i = 0; i < 4; i++) - { - if (isComponentEnabled(compMask, i)) - { - if (compCtrl[i] == ComponentControl::StoreSrc) - { - // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 - uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; - - if (bFP) - { - // extract 128 bit lanes to sign extend each component - vVertexElements[currentVertexElement] = - CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); - } - else - { - // extract 128 bit lanes to sign extend each component - vVertexElements[currentVertexElement] = - PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); - - // denormalize if needed - if (conversionType != CONVERT_NONE) - { - vVertexElements[currentVertexElement] = - FMUL(CAST(IntToFpCast, - vVertexElements[currentVertexElement], - mSimdFP32Ty), - conversionFactor); - } - } - currentVertexElement++; - } - else - { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 0; - } - } - } - } - // else zero extend - else if ((extendType == Instruction::CastOps::ZExt) || - (extendType == Instruction::CastOps::UIToFP)) - { - // pshufb masks for each component - Value* vConstMask[2]; - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) - { - // x/z shuffle mask - vConstMask[0] = C<char>({ - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - }); - } - - if 
(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) - { - // y/w shuffle mask - vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, - 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); - } - - // init denormalize variables if needed - Instruction::CastOps fpCast; - Value* conversionFactor; - - switch (conversionType) - { - case CONVERT_NORMALIZED: - fpCast = Instruction::CastOps::UIToFP; - conversionFactor = VIMMED1((float)(1.0 / 65535.0)); - break; - case CONVERT_USCALED: - fpCast = Instruction::CastOps::UIToFP; - conversionFactor = VIMMED1((float)(1.0f)); - break; - case CONVERT_SSCALED: - SWR_INVALID("Type should not be zero extended!"); - conversionFactor = nullptr; - break; - default: - SWR_ASSERT(conversionType == CONVERT_NONE); - conversionFactor = nullptr; - break; - } - - // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits - for (uint32_t i = 0; i < 4; i++) - { - if (isComponentEnabled(compMask, i)) - { - if (compCtrl[i] == ComponentControl::StoreSrc) - { - // select correct constMask for x/z or y/w pshufb - uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - uint32_t selectedGather = (i < 2) ? 
0 : 1; - - vVertexElements[currentVertexElement] = - BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), - vConstMask[selectedMask]), - vGatherTy); - // after pshufb mask for x channel; z uses the same shuffle from the second - // gather 256i - 0 1 2 3 4 5 6 7 - // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 - - // denormalize if needed - if (conversionType != CONVERT_NONE) - { - vVertexElements[currentVertexElement] = - FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), - conversionFactor); - } - currentVertexElement++; - } - else - { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 0; - } - } - } - } - else - { - SWR_INVALID("Unsupported conversion type"); - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Output a simdvertex worth of elements to the current outputElt -/// @param pVtxOut - base address of VIN output struct -/// @param outputElt - simdvertex offset in VIN to write to -/// @param numEltsToStore - number of simdvertex rows to write out -/// @param vVertexElements - LLVM Value*[] simdvertex to write out -void FetchJit::StoreVertexElements(Value* pVtxOut, - const uint32_t outputElt, - const uint32_t numEltsToStore, - Value* (&vVertexElements)[4]) -{ - SWR_ASSERT(numEltsToStore <= 4, "Invalid element count."); - - for (uint32_t c = 0; c < numEltsToStore; ++c) - { - // STORE expects FP32 x vWidth type, just bitcast if needed - if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy()) - { -#if FETCH_DUMP_VERTEX - PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]}); -#endif - vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty); - } -#if FETCH_DUMP_VERTEX - else - { - PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]}); - } -#endif - 
// outputElt * 4 = offsetting by the size of a simdvertex - // + c offsets to a 32bit x vWidth row within the current vertex - Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP"); - STORE(vVertexElements[c], dest); - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Generates a constant vector of values based on the -/// ComponentControl value -/// @param ctrl - ComponentControl value -Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) -{ - switch (ctrl) - { - case NoStore: - return VUNDEF_I(); - case Store0: - return VIMMED1(0); - case Store1Fp: - return VIMMED1(1.0f); - case Store1Int: - return VIMMED1(1); - case StoreVertexId: - { - if (mVWidth == 16) - { - Type* pSimd8FPTy = getVectorType(mFP32Ty, 8); - Value* pIdLo = - BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy); - Value* pIdHi = - BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy); - return JOIN_16(pIdLo, pIdHi); - } - else - { - return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty); - } - } - case StoreInstanceId: - { - Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty); - return VBROADCAST(pId); - } - - - case StoreSrc: - default: - SWR_INVALID("Invalid component control"); - return VUNDEF_I(); - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Returns the enable mask for the specified component. -/// @param enableMask - enable bits -/// @param component - component to check if enabled. 
-bool isComponentEnabled(ComponentEnable enableMask, uint8_t component) -{ - switch (component) - { - // X - case 0: - return (enableMask & ComponentEnable::X); - // Y - case 1: - return (enableMask & ComponentEnable::Y); - // Z - case 2: - return (enableMask & ComponentEnable::Z); - // W - case 3: - return (enableMask & ComponentEnable::W); - - default: - return false; - } -} - -// Don't want two threads compiling the same fetch shader simultaneously -// Has problems in the JIT cache implementation -// This is only a problem for fetch right now. -static std::mutex gFetchCodegenMutex; - -////////////////////////////////////////////////////////////////////////// -/// @brief JITs from fetch shader IR -/// @param hJitMgr - JitManager handle -/// @param func - LLVM function IR -/// @return PFN_FETCH_FUNC - pointer to fetch code -PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc) -{ - const llvm::Function* func = (const llvm::Function*)hFunc; - JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); - PFN_FETCH_FUNC pfnFetch; - - gFetchCodegenMutex.lock(); - pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); - // MCJIT finalizes modules the first time you JIT code from them. 
After finalized, you cannot - // add new IR to the module - pJitMgr->mIsModuleFinalized = true; - -#if defined(KNOB_SWRC_TRACING) - char fName[1024]; - const char* funcName = func->getName().data(); - sprintf(fName, "%s.bin", funcName); - FILE* fd = fopen(fName, "wb"); - fwrite((void*)pfnFetch, 1, 2048, fd); - fclose(fd); -#endif - - pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final"); - gFetchCodegenMutex.unlock(); - - - return pfnFetch; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief JIT compiles fetch shader -/// @param hJitMgr - JitManager handle -/// @param state - fetch state to build function from -extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state) -{ - JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); - - pJitMgr->SetupNewModule(); - - FetchJit theJit(pJitMgr); - HANDLE hFunc = theJit.Create(state); - - return JitFetchFunc(hJitMgr, hFunc); -} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h deleted file mode 100644 index 9c4c6672184..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h +++ /dev/null @@ -1,150 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file fetch_jit.h - * - * @brief Definition of the fetch jitter - * - * Notes: - * - ******************************************************************************/ -#pragma once - -#include "common/formats.h" -#include "core/state.h" - -////////////////////////////////////////////////////////////////////////// -/// INPUT_ELEMENT_DESC -////////////////////////////////////////////////////////////////////////// -struct INPUT_ELEMENT_DESC -{ - union - { - struct - { - uint32_t AlignedByteOffset : 12; - uint32_t Format : 10; - uint32_t StreamIndex : 6; - uint32_t InstanceEnable : 1; - uint32_t InstanceStrideEnable : 1; - uint32_t ComponentControl0 : 4; - uint32_t ComponentControl1 : 4; - uint32_t ComponentControl2 : 4; - uint32_t ComponentControl3 : 4; - uint32_t ComponentPacking : 4; - uint32_t _reserved : 14; - }; - uint64_t bits; - }; - uint32_t InstanceAdvancementState; -}; - -// used to set ComponentPacking -enum ComponentEnable -{ - NONE = 0x0, - X = 0x1, - Y = 0x2, - XY = 0x3, - Z = 0x4, - XZ = 0x5, - YZ = 0x6, - XYZ = 0x7, - W = 0x8, - XW = 0x9, - YW = 0xA, - XYW = 0xB, - ZW = 0xC, - XZW = 0xD, - YZW = 0xE, - XYZW = 0xF, -}; - -enum ComponentControl -{ - NoStore = 0, - StoreSrc = 1, - Store0 = 2, - Store1Fp = 3, - Store1Int = 4, - StoreVertexId = 5, - StoreInstanceId = 6, -}; - -////////////////////////////////////////////////////////////////////////// -/// State required for fetch shader jit compile. 
-////////////////////////////////////////////////////////////////////////// -struct FETCH_COMPILE_STATE -{ - uint32_t numAttribs{0}; - INPUT_ELEMENT_DESC layout[SWR_VTX_NUM_SLOTS]; - SWR_FORMAT indexType; - uint32_t cutIndex{0xffffffff}; - - // Options that effect the JIT'd code - bool bDisableIndexOOBCheck; // If enabled, FetchJit will exclude index OOB check - bool bEnableCutIndex{false}; // Compares indices with the cut index and returns a cut mask - bool bVertexIDOffsetEnable{false}; // Offset vertexID by StartVertex for non-indexed draws or - // BaseVertex for indexed draws - bool bPartialVertexBuffer{ - false}; // for indexed draws, map illegal indices to a known resident vertex - - bool bForceSequentialAccessEnable{false}; - bool bInstanceIDOffsetEnable{false}; - - FETCH_COMPILE_STATE(bool disableIndexOOBCheck = false) : - bDisableIndexOOBCheck(disableIndexOOBCheck){}; - - bool operator==(const FETCH_COMPILE_STATE& other) const - { - if (numAttribs != other.numAttribs) - return false; - if (indexType != other.indexType) - return false; - if (bDisableIndexOOBCheck != other.bDisableIndexOOBCheck) - return false; - if (bEnableCutIndex != other.bEnableCutIndex) - return false; - if (cutIndex != other.cutIndex) - return false; - if (bVertexIDOffsetEnable != other.bVertexIDOffsetEnable) - return false; - if (bPartialVertexBuffer != other.bPartialVertexBuffer) - return false; - if (bForceSequentialAccessEnable != other.bForceSequentialAccessEnable) - return false; - if (bInstanceIDOffsetEnable != other.bInstanceIDOffsetEnable) - return false; - - for (uint32_t i = 0; i < numAttribs; ++i) - { - if ((layout[i].bits != other.layout[i].bits) || - (((layout[i].InstanceEnable == 1) || (layout[i].InstanceStrideEnable == 1)) && - (layout[i].InstanceAdvancementState != other.layout[i].InstanceAdvancementState))) - { - return false; - } - } - - return true; - } -}; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp deleted file mode 100644 index 61c6b57b38b..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp +++ /dev/null @@ -1,962 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file lower_x86.cpp - * - * @brief llvm pass to lower meta code to x86 - * - * Notes: - * - ******************************************************************************/ - -#include "jit_pch.hpp" -#include "passes.h" -#include "JitManager.h" - -#include "common/simdlib.hpp" - -#include <unordered_map> - -extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t); - -namespace llvm -{ - // forward declare the initializer - void initializeLowerX86Pass(PassRegistry&); -} // namespace llvm - -namespace SwrJit -{ - using namespace llvm; - - enum TargetArch - { - AVX = 0, - AVX2 = 1, - AVX512 = 2 - }; - - enum TargetWidth - { - W256 = 0, - W512 = 1, - NUM_WIDTHS = 2 - }; - - struct LowerX86; - - typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc; - - struct X86Intrinsic - { - IntrinsicID intrin[NUM_WIDTHS]; - EmuFunc emuFunc; - }; - - // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the - // previous behavior of mapping directly to avx/avx2 intrinsics. 
- using intrinsicMap_t = std::map<std::string, IntrinsicID>; - static intrinsicMap_t& getIntrinsicMap() { - static std::map<std::string, IntrinsicID> intrinsicMap = { - {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32}, - {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b}, - {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256}, - {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256}, - {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256}, - {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d}, - {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32}, - {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc} - }; - return intrinsicMap; - } - - // Forward decls - Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); - Instruction* - VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); - Instruction* - VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); - Instruction* - VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); - Instruction* - VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); - Instruction* - VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); - Instruction* - VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); - - Instruction* DOUBLE_EMU(LowerX86* pThis, - TargetArch arch, - TargetWidth width, - CallInst* pCallInst, - Intrinsic::ID intrin); - - static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1; - - using intrinsicMapAdvanced_t = std::vector<std::map<std::string, X86Intrinsic>>; - - static intrinsicMapAdvanced_t& getIntrinsicMapAdvanced() - { - // clang-format off - static intrinsicMapAdvanced_t intrinsicMapAdvanced = { - // 256 wide 512 wide - { - // AVX - {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, 
Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}}, - {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, - {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}}, - }, - { - // AVX2 - {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}}, - {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}}, - }, - { - // AVX512 - {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}}, - #if LLVM_VERSION_MAJOR < 7 - {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, 
Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}}, - {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}}, - #else - {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, - #endif - {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}}, - #if LLVM_VERSION_MAJOR < 7 - {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}}, - #else - {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}}, - #endif - {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}}, - {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}} - }}; - // clang-format on - return intrinsicMapAdvanced; - } - - static uint32_t getBitWidth(VectorType *pVTy) - { -#if LLVM_VERSION_MAJOR >= 12 - return cast<FixedVectorType>(pVTy)->getNumElements() * pVTy->getElementType()->getPrimitiveSizeInBits(); -#elif LLVM_VERSION_MAJOR >= 11 - return pVTy->getNumElements() * pVTy->getElementType()->getPrimitiveSizeInBits(); -#else - return pVTy->getBitWidth(); -#endif - } - - struct LowerX86 : public FunctionPass - { - LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b) - { - initializeLowerX86Pass(*PassRegistry::getPassRegistry()); - - // Determine target arch - if (JM()->mArch.AVX512F()) - { - mTarget = AVX512; - } - else if (JM()->mArch.AVX2()) - { - mTarget = AVX2; - } - else 
if (JM()->mArch.AVX()) - { - mTarget = AVX; - } - else - { - SWR_ASSERT(false, "Unsupported AVX architecture."); - mTarget = AVX; - } - - // Setup scatter function for 256 wide - uint32_t curWidth = B->mVWidth; - B->SetTargetWidth(8); - std::vector<Type*> args = { - B->mInt8PtrTy, // pBase - B->mSimdInt32Ty, // vIndices - B->mSimdFP32Ty, // vSrc - B->mInt8Ty, // mask - B->mInt32Ty // scale - }; - - FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false); - mPfnScatter256 = cast<Function>( -#if LLVM_VERSION_MAJOR >= 9 - B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy).getCallee()); -#else - B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy)); -#endif - if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr) - { - sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256); - } - - B->SetTargetWidth(curWidth); - } - - // Try to decipher the vector type of the instruction. This does not work properly - // across all intrinsics, and will have to be rethought. Probably need something - // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed - // intrinsic. 
- void GetRequestedWidthAndType(CallInst* pCallInst, - const StringRef intrinName, - TargetWidth* pWidth, - Type** pTy) - { - assert(pCallInst); - Type* pVecTy = pCallInst->getType(); - - // Check for intrinsic specific types - // VCVTPD2PS type comes from src, not dst - if (intrinName.equals("meta.intrinsic.VCVTPD2PS")) - { - Value* pOp = pCallInst->getOperand(0); - assert(pOp); - pVecTy = pOp->getType(); - } - - if (!pVecTy->isVectorTy()) - { - for (auto& op : pCallInst->arg_operands()) - { - if (op.get()->getType()->isVectorTy()) - { - pVecTy = op.get()->getType(); - break; - } - } - } - SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size"); - - uint32_t width = getBitWidth(cast<VectorType>(pVecTy)); - switch (width) - { - case 256: - *pWidth = W256; - break; - case 512: - *pWidth = W512; - break; - default: - SWR_ASSERT(false, "Unhandled vector width %d", width); - *pWidth = W256; - } - - *pTy = pVecTy->getScalarType(); - } - - Value* GetZeroVec(TargetWidth width, Type* pTy) - { - uint32_t numElem = 0; - switch (width) - { - case W256: - numElem = 8; - break; - case W512: - numElem = 16; - break; - default: - SWR_ASSERT(false, "Unhandled vector width type %d\n", width); - } - - return ConstantVector::getNullValue(getVectorType(pTy, numElem)); - } - - Value* GetMask(TargetWidth width) - { - Value* mask; - switch (width) - { - case W256: - mask = B->C((uint8_t)-1); - break; - case W512: - mask = B->C((uint16_t)-1); - break; - default: - SWR_ASSERT(false, "Unhandled vector width type %d\n", width); - } - return mask; - } - - // Convert <N x i1> mask to <N x i32> x86 mask - Value* VectorMask(Value* vi1Mask) - { -#if LLVM_VERSION_MAJOR >= 12 - uint32_t numElem = cast<FixedVectorType>(vi1Mask->getType())->getNumElements(); -#elif LLVM_VERSION_MAJOR >= 11 - uint32_t numElem = cast<VectorType>(vi1Mask->getType())->getNumElements(); -#else - uint32_t numElem = vi1Mask->getType()->getVectorNumElements(); -#endif - return B->S_EXT(vi1Mask, 
getVectorType(B->mInt32Ty, numElem)); - } - - Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst) - { - Function* pFunc = pCallInst->getCalledFunction(); - assert(pFunc); - - auto& intrinsic = getIntrinsicMapAdvanced()[mTarget][pFunc->getName().str()]; - TargetWidth vecWidth; - Type* pElemTy; - GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy); - - // Check if there is a native intrinsic for this instruction - IntrinsicID id = intrinsic.intrin[vecWidth]; - if (id == DOUBLE) - { - // Double pump the next smaller SIMD intrinsic - SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width."); - Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1]; - SWR_ASSERT(id2 != Intrinsic::not_intrinsic, - "Cannot find intrinsic to double pump."); - return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2); - } - else if (id != Intrinsic::not_intrinsic) - { - Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id); - SmallVector<Value*, 8> args; - for (auto& arg : pCallInst->arg_operands()) - { - args.push_back(arg.get()); - } - - // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and - // full mask for now Assuming the intrinsics are consistent and place the src - // operand and mask last in the argument list. 
- if (mTarget == AVX512) - { - if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS")) - { - args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType())); - args.push_back(GetMask(W256)); - // for AVX512 VCVTPD2PS, we also have to add rounding mode - args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); - } - else - { - args.push_back(GetZeroVec(vecWidth, pElemTy)); - args.push_back(GetMask(vecWidth)); - } - } - - return B->CALLA(pIntrin, args); - } - else - { - // No native intrinsic, call emulation function - return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst); - } - - SWR_ASSERT(false); - return nullptr; - } - - Instruction* ProcessIntrinsic(CallInst* pCallInst) - { - Function* pFunc = pCallInst->getCalledFunction(); - assert(pFunc); - - // Forward to the advanced support if found - if (getIntrinsicMapAdvanced()[mTarget].find(pFunc->getName().str()) != getIntrinsicMapAdvanced()[mTarget].end()) - { - return ProcessIntrinsicAdvanced(pCallInst); - } - - SWR_ASSERT(getIntrinsicMap().find(pFunc->getName().str()) != getIntrinsicMap().end(), - "Unimplemented intrinsic %s.", - pFunc->getName().str().c_str()); - - Intrinsic::ID x86Intrinsic = getIntrinsicMap()[pFunc->getName().str()]; - Function* pX86IntrinFunc = - Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic); - - SmallVector<Value*, 8> args; - for (auto& arg : pCallInst->arg_operands()) - { - args.push_back(arg.get()); - } - return B->CALLA(pX86IntrinFunc, args); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief LLVM function pass run method. - /// @param f- The function we're working on with this pass. 
- virtual bool runOnFunction(Function& F) - { - std::vector<Instruction*> toRemove; - std::vector<BasicBlock*> bbs; - - // Make temp copy of the basic blocks and instructions, as the intrinsic - // replacement code might invalidate the iterators - for (auto& b : F.getBasicBlockList()) - { - bbs.push_back(&b); - } - - for (auto* BB : bbs) - { - std::vector<Instruction*> insts; - for (auto& i : BB->getInstList()) - { - insts.push_back(&i); - } - - for (auto* I : insts) - { - if (CallInst* pCallInst = dyn_cast<CallInst>(I)) - { - Function* pFunc = pCallInst->getCalledFunction(); - if (pFunc) - { - if (pFunc->getName().startswith("meta.intrinsic")) - { - B->IRB()->SetInsertPoint(I); - Instruction* pReplace = ProcessIntrinsic(pCallInst); - toRemove.push_back(pCallInst); - if (pReplace) - { - pCallInst->replaceAllUsesWith(pReplace); - } - } - } - } - } - } - - for (auto* pInst : toRemove) - { - pInst->eraseFromParent(); - } - - JitManager::DumpToFile(&F, "lowerx86"); - - return true; - } - - virtual void getAnalysisUsage(AnalysisUsage& AU) const {} - - JitManager* JM() { return B->JM(); } - Builder* B; - TargetArch mTarget; - Function* mPfnScatter256; - - static char ID; ///< Needed by LLVM to generate ID for FunctionPass. - }; - - char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID. 
- - FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); } - - Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) - { - SWR_ASSERT(false, "Unimplemented intrinsic emulation."); - return nullptr; - } - - Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) - { - // Only need vperm emulation for AVX - SWR_ASSERT(arch == AVX); - - Builder* B = pThis->B; - auto v32A = pCallInst->getArgOperand(0); - auto vi32Index = pCallInst->getArgOperand(1); - - Value* v32Result; - if (isa<Constant>(vi32Index)) - { - // Can use llvm shuffle vector directly with constant shuffle indices - v32Result = B->VSHUFFLE(v32A, v32A, vi32Index); - } - else - { - v32Result = UndefValue::get(v32A->getType()); -#if LLVM_VERSION_MAJOR >= 12 - uint32_t numElem = cast<FixedVectorType>(v32A->getType())->getNumElements(); -#elif LLVM_VERSION_MAJOR >= 11 - uint32_t numElem = cast<VectorType>(v32A->getType())->getNumElements(); -#else - uint32_t numElem = v32A->getType()->getVectorNumElements(); -#endif - for (uint32_t l = 0; l < numElem; ++l) - { - auto i32Index = B->VEXTRACT(vi32Index, B->C(l)); - auto val = B->VEXTRACT(v32A, i32Index); - v32Result = B->VINSERT(v32Result, val, B->C(l)); - } - } - return cast<Instruction>(v32Result); - } - - Instruction* - VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) - { - Builder* B = pThis->B; - auto vSrc = pCallInst->getArgOperand(0); - auto pBase = pCallInst->getArgOperand(1); - auto vi32Indices = pCallInst->getArgOperand(2); - auto vi1Mask = pCallInst->getArgOperand(3); - auto i8Scale = pCallInst->getArgOperand(4); - - pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0)); -#if LLVM_VERSION_MAJOR >= 11 -#if LLVM_VERSION_MAJOR >= 12 - FixedVectorType* pVectorType = cast<FixedVectorType>(vSrc->getType()); -#else - VectorType* pVectorType = cast<VectorType>(vSrc->getType()); -#endif - uint32_t numElem = 
pVectorType->getNumElements(); - auto srcTy = pVectorType->getElementType(); -#else - uint32_t numElem = vSrc->getType()->getVectorNumElements(); - auto srcTy = vSrc->getType()->getVectorElementType(); -#endif - auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty); - - Value* v32Gather = nullptr; - if (arch == AVX) - { - // Full emulation for AVX - // Store source on stack to provide a valid address to load from inactive lanes - auto pStack = B->STACKSAVE(); - auto pTmp = B->ALLOCA(vSrc->getType()); - B->STORE(vSrc, pTmp); - - v32Gather = UndefValue::get(vSrc->getType()); -#if LLVM_VERSION_MAJOR <= 10 - auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale)); -#elif LLVM_VERSION_MAJOR == 11 - auto vi32Scale = ConstantVector::getSplat(ElementCount(numElem, false), cast<ConstantInt>(i32Scale)); -#else - auto vi32Scale = ConstantVector::getSplat(ElementCount::get(numElem, false), cast<ConstantInt>(i32Scale)); -#endif - auto vi32Offsets = B->MUL(vi32Indices, vi32Scale); - - for (uint32_t i = 0; i < numElem; ++i) - { - auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i)); - auto pLoadAddress = B->GEP(pBase, i32Offset); - pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0)); - auto pMaskedLoadAddress = B->GEP(pTmp, {0, i}); - auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i)); - auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress); - auto val = B->LOAD(pValidAddress); - v32Gather = B->VINSERT(v32Gather, val, B->C(i)); - } - - B->STACKRESTORE(pStack); - } - else if (arch == AVX2 || (arch == AVX512 && width == W256)) - { - Function* pX86IntrinFunc = nullptr; - if (srcTy == B->mFP32Ty) - { - pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, - Intrinsic::x86_avx2_gather_d_ps_256); - } - else if (srcTy == B->mInt32Ty) - { - pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, - Intrinsic::x86_avx2_gather_d_d_256); - } - else if (srcTy == B->mDoubleTy) - { - pX86IntrinFunc = 
Intrinsic::getDeclaration(B->JM()->mpCurrentModule, - Intrinsic::x86_avx2_gather_d_q_256); - } - else - { - SWR_ASSERT(false, "Unsupported vector element type for gather."); - } - - if (width == W256) - { - auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType()); - v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale}); - } - else if (width == W512) - { - // Double pump 4-wide for 64bit elements -#if LLVM_VERSION_MAJOR >= 12 - if (cast<FixedVectorType>(vSrc->getType())->getElementType() == B->mDoubleTy) -#elif LLVM_VERSION_MAJOR >= 11 - if (cast<VectorType>(vSrc->getType())->getElementType() == B->mDoubleTy) -#else - if (vSrc->getType()->getVectorElementType() == B->mDoubleTy) -#endif - { - auto v64Mask = pThis->VectorMask(vi1Mask); -#if LLVM_VERSION_MAJOR >= 12 - uint32_t numElem = cast<FixedVectorType>(v64Mask->getType())->getNumElements(); -#elif LLVM_VERSION_MAJOR >= 11 - uint32_t numElem = cast<VectorType>(v64Mask->getType())->getNumElements(); -#else - uint32_t numElem = v64Mask->getType()->getVectorNumElements(); -#endif - v64Mask = B->S_EXT(v64Mask, getVectorType(B->mInt64Ty, numElem)); - v64Mask = B->BITCAST(v64Mask, vSrc->getType()); - - Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3})); - Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7})); - - Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3})); - Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7})); - - Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3})); - Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7})); - -#if LLVM_VERSION_MAJOR >= 12 - uint32_t numElemSrc0 = cast<FixedVectorType>(src0->getType())->getNumElements(); - uint32_t numElemMask0 = cast<FixedVectorType>(mask0->getType())->getNumElements(); - uint32_t numElemSrc1 = cast<FixedVectorType>(src1->getType())->getNumElements(); - uint32_t numElemMask1 = cast<FixedVectorType>(mask1->getType())->getNumElements(); 
-#elif LLVM_VERSION_MAJOR >= 11 - uint32_t numElemSrc0 = cast<VectorType>(src0->getType())->getNumElements(); - uint32_t numElemMask0 = cast<VectorType>(mask0->getType())->getNumElements(); - uint32_t numElemSrc1 = cast<VectorType>(src1->getType())->getNumElements(); - uint32_t numElemMask1 = cast<VectorType>(mask1->getType())->getNumElements(); -#else - uint32_t numElemSrc0 = src0->getType()->getVectorNumElements(); - uint32_t numElemMask0 = mask0->getType()->getVectorNumElements(); - uint32_t numElemSrc1 = src1->getType()->getVectorNumElements(); - uint32_t numElemMask1 = mask1->getType()->getVectorNumElements(); -#endif - src0 = B->BITCAST(src0, getVectorType(B->mInt64Ty, numElemSrc0)); - mask0 = B->BITCAST(mask0, getVectorType(B->mInt64Ty, numElemMask0)); - Value* gather0 = - B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale}); - src1 = B->BITCAST(src1, getVectorType(B->mInt64Ty, numElemSrc1)); - mask1 = B->BITCAST(mask1, getVectorType(B->mInt64Ty, numElemMask1)); - Value* gather1 = - B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale}); - v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7})); - v32Gather = B->BITCAST(v32Gather, vSrc->getType()); - } - else - { - // Double pump 8-wide for 32bit elements - auto v32Mask = pThis->VectorMask(vi1Mask); - v32Mask = B->BITCAST(v32Mask, vSrc->getType()); - Value* src0 = B->EXTRACT_16(vSrc, 0); - Value* src1 = B->EXTRACT_16(vSrc, 1); - - Value* indices0 = B->EXTRACT_16(vi32Indices, 0); - Value* indices1 = B->EXTRACT_16(vi32Indices, 1); - - Value* mask0 = B->EXTRACT_16(v32Mask, 0); - Value* mask1 = B->EXTRACT_16(v32Mask, 1); - - Value* gather0 = - B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale}); - Value* gather1 = - B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale}); - - v32Gather = B->JOIN_16(gather0, gather1); - } - } - } - else if (arch == AVX512) - { - Value* iMask = nullptr; - Function* pX86IntrinFunc = nullptr; - if (srcTy == B->mFP32Ty) - 
{ - pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, - Intrinsic::x86_avx512_gather_dps_512); - iMask = B->BITCAST(vi1Mask, B->mInt16Ty); - } - else if (srcTy == B->mInt32Ty) - { - pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, - Intrinsic::x86_avx512_gather_dpi_512); - iMask = B->BITCAST(vi1Mask, B->mInt16Ty); - } - else if (srcTy == B->mDoubleTy) - { - pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, - Intrinsic::x86_avx512_gather_dpd_512); - iMask = B->BITCAST(vi1Mask, B->mInt8Ty); - } - else - { - SWR_ASSERT(false, "Unsupported vector element type for gather."); - } - - auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty); - v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale}); - } - - return cast<Instruction>(v32Gather); - } - Instruction* - VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) - { - Builder* B = pThis->B; - auto pBase = pCallInst->getArgOperand(0); - auto vi1Mask = pCallInst->getArgOperand(1); - auto vi32Indices = pCallInst->getArgOperand(2); - auto v32Src = pCallInst->getArgOperand(3); - auto i32Scale = pCallInst->getArgOperand(4); - - if (arch != AVX512) - { - // Call into C function to do the scatter. 
This has significantly better compile perf - // compared to jitting scatter loops for every scatter - if (width == W256) - { - auto mask = B->BITCAST(vi1Mask, B->mInt8Ty); - B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale}); - } - else - { - // Need to break up 512 wide scatter to two 256 wide - auto maskLo = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7})); - auto indicesLo = - B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7})); - auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7})); - - auto mask = B->BITCAST(maskLo, B->mInt8Ty); - B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale}); - - auto maskHi = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15})); - auto indicesHi = - B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15})); - auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15})); - - mask = B->BITCAST(maskHi, B->mInt8Ty); - B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale}); - } - return nullptr; - } - - Value* iMask; - Function* pX86IntrinFunc; - if (width == W256) - { - // No direct intrinsic supported in llvm to scatter 8 elem with 32bit indices, but we - // can use the scatter of 8 elements with 64bit indices - pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, - Intrinsic::x86_avx512_scatter_qps_512); - - auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty); - iMask = B->BITCAST(vi1Mask, B->mInt8Ty); - B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale}); - } - else if (width == W512) - { - pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, - Intrinsic::x86_avx512_scatter_dps_512); - iMask = B->BITCAST(vi1Mask, B->mInt16Ty); - B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale}); - } - return nullptr; - } - - // No support for vroundps in avx512 (it is available in kncni), so emulate with 
avx - // instructions - Instruction* - VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) - { - SWR_ASSERT(arch == AVX512); - - auto B = pThis->B; - auto vf32Src = pCallInst->getOperand(0); - assert(vf32Src); - auto i8Round = pCallInst->getOperand(1); - assert(i8Round); - auto pfnFunc = - Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256); - - if (width == W256) - { - return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round)); - } - else if (width == W512) - { - auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0); - auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1); - - auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round); - auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round); - - return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi)); - } - else - { - SWR_ASSERT(false, "Unimplemented vector width."); - } - - return nullptr; - } - - Instruction* - VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) - { - SWR_ASSERT(arch == AVX512); - - auto B = pThis->B; - auto vf32Src = pCallInst->getOperand(0); - - if (width == W256) - { - auto vf32SrcRound = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, - Intrinsic::x86_avx_round_ps_256); - return cast<Instruction>(B->FP_TRUNC(vf32SrcRound, B->mFP32Ty)); - } - else if (width == W512) - { - // 512 can use intrinsic - auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, - Intrinsic::x86_avx512_mask_cvtpd2ps_512); - return cast<Instruction>(B->CALL(pfnFunc, vf32Src)); - } - else - { - SWR_ASSERT(false, "Unimplemented vector width."); - } - - return nullptr; - } - - // No support for hsub in AVX512 - Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) - { - SWR_ASSERT(arch == AVX512); - - auto B = pThis->B; - auto src0 = pCallInst->getOperand(0); - auto src1 = pCallInst->getOperand(1); - - // 256b hsub can just use avx intrinsic - if (width == W256) - { - auto 
pX86IntrinFunc = - Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256); - return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1)); - } - else if (width == W512) - { - // 512b hsub can be accomplished with shuf/sub combo - auto minuend = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14})); - auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15})); - return cast<Instruction>(B->SUB(minuend, subtrahend)); - } - else - { - SWR_ASSERT(false, "Unimplemented vector width."); - return nullptr; - } - } - - // Double pump input using Intrin template arg. This blindly extracts lower and upper 256 from - // each vector argument and calls the 256 wide intrinsic, then merges the results to 512 wide - Instruction* DOUBLE_EMU(LowerX86* pThis, - TargetArch arch, - TargetWidth width, - CallInst* pCallInst, - Intrinsic::ID intrin) - { - auto B = pThis->B; - SWR_ASSERT(width == W512); - Value* result[2]; - Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin); - for (uint32_t i = 0; i < 2; ++i) - { - SmallVector<Value*, 8> args; - for (auto& arg : pCallInst->arg_operands()) - { - auto argType = arg.get()->getType(); - if (argType->isVectorTy()) - { -#if LLVM_VERSION_MAJOR >= 12 - uint32_t vecWidth = cast<FixedVectorType>(argType)->getNumElements(); - auto elemTy = cast<FixedVectorType>(argType)->getElementType(); -#elif LLVM_VERSION_MAJOR >= 11 - uint32_t vecWidth = cast<VectorType>(argType)->getNumElements(); - auto elemTy = cast<VectorType>(argType)->getElementType(); -#else - uint32_t vecWidth = argType->getVectorNumElements(); - auto elemTy = argType->getVectorElementType(); -#endif - Value* lanes = B->CInc<int>(i * vecWidth / 2, vecWidth / 2); - Value* argToPush = B->VSHUFFLE(arg.get(), B->VUNDEF(elemTy, vecWidth), lanes); - args.push_back(argToPush); - } - else - { - args.push_back(arg.get()); - } - } - result[i] = B->CALLA(pX86IntrinFunc, args); - } - uint32_t vecWidth; - if 
(result[0]->getType()->isVectorTy()) - { - assert(result[1]->getType()->isVectorTy()); -#if LLVM_VERSION_MAJOR >= 12 - vecWidth = cast<FixedVectorType>(result[0]->getType())->getNumElements() + - cast<FixedVectorType>(result[1]->getType())->getNumElements(); -#elif LLVM_VERSION_MAJOR >= 11 - vecWidth = cast<VectorType>(result[0]->getType())->getNumElements() + - cast<VectorType>(result[1]->getType())->getNumElements(); -#else - vecWidth = result[0]->getType()->getVectorNumElements() + - result[1]->getType()->getVectorNumElements(); -#endif - } - else - { - vecWidth = 2; - } - Value* lanes = B->CInc<int>(0, vecWidth); - return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes)); - } - -} // namespace SwrJit - -using namespace SwrJit; - -INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false) -INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h deleted file mode 100644 index e0bb75cdec9..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h +++ /dev/null @@ -1,38 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file passes.h - * - * @brief Include file for llvm passes - * - ******************************************************************************/ -#pragma once - -#include "JitManager.h" -#include "builder.h" - -namespace SwrJit -{ - using namespace llvm; - - FunctionPass* createLowerX86Pass(Builder* b); -} // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h deleted file mode 100644 index dcb051c3b53..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h +++ /dev/null @@ -1,113 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file jit_api.h - * - * @brief Platform independent JIT interface - * - * Notes: - * - ******************************************************************************/ -#pragma once -#include "common/os.h" -#include "core/utils.h" - -#include "fetch_jit.h" -#include "streamout_jit.h" -#include "blend_jit.h" - -#include <stdlib.h> - -#if defined(_WIN32) -#define EXCEPTION_PRINT_STACK(ret) ret -#endif // _WIN32 - -#if defined(_WIN32) -#define JITCALL __stdcall -#else -#define JITCALL -#endif - - -struct ShaderInfo; - -////////////////////////////////////////////////////////////////////////// -/// Jit Compile Info Input -////////////////////////////////////////////////////////////////////////// -struct JIT_COMPILE_INPUT -{ - SWR_SHADER_TYPE type; - uint32_t crc; - - const void* pIR; ///< Pointer to LLVM IR text. - size_t irLength; - - bool enableJitSampler; - -}; - - -extern "C" { - -////////////////////////////////////////////////////////////////////////// -/// @brief Create JIT context. -HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch, const char* core); - -////////////////////////////////////////////////////////////////////////// -/// @brief Destroy JIT context. -void JITCALL JitDestroyContext(HANDLE hJitContext); - -////////////////////////////////////////////////////////////////////////// -/// @brief JIT compile shader. 
-/// @param hJitContext - Jit Context -/// @param input - Input containing LLVM IR and other information -/// @param output - Output containing information about JIT shader -ShaderInfo* JITCALL JitCompileShader(HANDLE hJitContext, const JIT_COMPILE_INPUT& input); - -ShaderInfo* JITCALL JitGetShader(HANDLE hJitContext, const char* name); - -////////////////////////////////////////////////////////////////////////// -/// @brief JIT destroy shader. -/// @param hJitContext - Jit Context -/// @param pShaderInfo - pointer to shader object. -void JITCALL JitDestroyShader(HANDLE hJitContext, ShaderInfo*& pShaderInfo); - -////////////////////////////////////////////////////////////////////////// -/// @brief JIT compiles fetch shader -/// @param hJitContext - Jit Context -/// @param state - Fetch state to build function from -PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitContext, const FETCH_COMPILE_STATE& state); - -////////////////////////////////////////////////////////////////////////// -/// @brief JIT compiles streamout shader -/// @param hJitContext - Jit Context -/// @param state - SO state to build function from -PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitContext, const STREAMOUT_COMPILE_STATE& state); - -////////////////////////////////////////////////////////////////////////// -/// @brief JIT compiles blend shader -/// @param hJitContext - Jit Context -/// @param state - blend state to build function from -PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitContext, const BLEND_COMPILE_STATE& state); - -} - diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp b/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp deleted file mode 100644 index e54e23fc904..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp +++ /dev/null @@ -1,183 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2017-2020 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file jit_pch.hpp - * - * @brief Pre-compiled header for jitter - * - * Notes: - * - ******************************************************************************/ - -#pragma once - -#if defined(_MSC_VER) -#pragma warning(disable : 4146 4244 4267 4800 4996) -#endif - -#include <llvm/Config/llvm-config.h> - -#if LLVM_VERSION_MAJOR < 7 -// llvm 3.7+ reuses "DEBUG" as an enum value -#pragma push_macro("DEBUG") -#undef DEBUG -#endif - -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/IntrinsicInst.h" -#if LLVM_VERSION_MAJOR >= 10 -#include "llvm/IR/IntrinsicsX86.h" -#endif -#include "llvm/ExecutionEngine/ObjectCache.h" - -#include "llvm/IR/Verifier.h" -#include "llvm/ExecutionEngine/MCJIT.h" -#include "llvm/Support/FileSystem.h" -#define LLVM_F_NONE sys::fs::F_None - -#include "llvm/Analysis/Passes.h" - -#include "llvm/IR/LegacyPassManager.h" -using FunctionPassManager = llvm::legacy::FunctionPassManager; -using PassManager = llvm::legacy::PassManager; - -#include "llvm/CodeGen/Passes.h" -#include "llvm/ExecutionEngine/ExecutionEngine.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Support/DynamicLibrary.h" -#include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/Scalar.h" -#if LLVM_VERSION_MAJOR >= 7 -#include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/InstCombine/InstCombine.h" -#endif -#include "llvm/Support/Host.h" -#include "llvm/Support/DynamicLibrary.h" - -#include "llvm/IR/DIBuilder.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Value.h" -#include "llvm/IR/Instructions.h" -#include "llvm/Pass.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/ADT/PostOrderIterator.h" 
-#include "llvm/ADT/SCCIterator.h" -#include "llvm/IR/Dominators.h" -#include "llvm/Analysis/PostDominators.h" -#include "llvm/Analysis/LoopInfo.h" - -#include "llvm/Transforms/Utils/Cloning.h" - -#if defined(_WIN32) -#include "llvm/ADT/Triple.h" -#endif -#include "llvm/IR/Function.h" - -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/SourceMgr.h" - -#include "llvm/Analysis/CFGPrinter.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Support/FormattedStream.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Config/llvm-config.h" - -#include "llvm/Bitcode/BitcodeWriter.h" -#include "llvm/Bitcode/BitcodeReader.h" - -#if LLVM_USE_INTEL_JITEVENTS -#include "llvm/ExecutionEngine/JITEventListener.h" -#endif - -#if LLVM_VERSION_MAJOR >= 5 -static const auto Sync_CrossThread = llvm::SyncScope::System; -static const auto Attrib_FunctionIndex = llvm::AttributeList::FunctionIndex; -static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext& ctx, - const llvm::AttrBuilder& b) -{ - return llvm::AttributeSet::get(ctx, b); -} -#else -static const auto Sync_CrossThread = llvm::SynchronizationScope::CrossThread; -static const auto Attrib_FunctionIndex = llvm::AttributeSet::FunctionIndex; -static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext& ctx, - const llvm::AttrBuilder& b) -{ - return llvm::AttributeSet::get(ctx, Attrib_FunctionIndex, b); -} -#endif - -#if LLVM_VERSION_MAJOR >= 11 -static inline llvm::VectorType* getVectorType(llvm::Type *ElementType, unsigned NumElements) -{ - return llvm::VectorType::get(ElementType, NumElements, false); -} -#else -static inline llvm::VectorType* getVectorType(llvm::Type *ElementType, unsigned NumElements) -{ - return llvm::VectorType::get(ElementType, NumElements); -} -#endif - -#if LLVM_VERSION_MAJOR < 7 -#pragma pop_macro("DEBUG") -#endif - -#if LLVM_VERSION_MAJOR > 10 - typedef unsigned IntrinsicID; - typedef 
llvm::Align AlignType; -#else - typedef llvm::Intrinsic::ID IntrinsicID; - typedef unsigned AlignType; -#endif - -#include <deque> -#include <list> -#include <unordered_map> -#include <unordered_set> -#include <iostream> -#include <sstream> -#include <type_traits> -#include <cstdint> -#include <vector> -#include <tuple> -#include <mutex> - -#include "common/os.h" - -#if defined(_WIN32) -#define JIT_OBJ_EXT ".obj" -#else -#define JIT_OBJ_EXT ".o" -#endif // _WIN32 diff --git a/src/gallium/drivers/swr/rasterizer/jitter/meson.build b/src/gallium/drivers/swr/rasterizer/jitter/meson.build deleted file mode 100644 index 295dc2fccb5..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/meson.build +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright © 2017-2018 Intel Corporation - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
- -if dep_llvm.type_name() == 'internal' - _irbuilder_h = subproject('llvm').get_variable('irbuilder_h') -else - _llvm_includedir = dep_llvm.get_variable(configtool : 'includedir', cmake : 'LLVM_INCLUDE_DIR') - _irbuilder_h = join_paths(_llvm_includedir, 'llvm', 'IR', 'IRBuilder.h') -endif - -gen_builder_hpp = custom_target( - 'gen_builder.hpp', - input : [ - swr_gen_llvm_ir_macros_py, _irbuilder_h, - ], - output : 'gen_builder.hpp', - command : [ - prog_python, '@INPUT0@', '--input', '@INPUT1@', '--output', '@OUTPUT@', - '--gen_h', '--output-dir', '@OUTDIR@' - ], - depend_files : swr_gen_builder_depends, - build_by_default : true, -) - -gen_builder_meta_hpp = custom_target( - 'gen_builder_meta.hpp', - input : '../codegen/gen_llvm_ir_macros.py', - output : 'gen_builder_meta.hpp', - command : [ - prog_python, '@INPUT0@', '--gen_meta_h', '--output', '@OUTPUT@', - '--output-dir', '@OUTDIR@' - ], - depend_files : swr_gen_builder_depends, -) - -gen_builder_intrin_hpp = custom_target( - 'gen_builder_intrin.hpp', - input : '../codegen/gen_llvm_ir_macros.py', - output : 'gen_builder_intrin.hpp', - command : [ - prog_python, '@INPUT0@', '--gen_intrin_h', '--output', '@OUTPUT@', - '--output-dir', '@OUTDIR@' - ], - depend_files : swr_gen_builder_depends, -) - diff --git a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp deleted file mode 100644 index 1c9db0c2d2a..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file DebugOutput.cpp - * - * @brief Shader support library implementation for printed Debug output - * - * Notes: - * - ******************************************************************************/ -#include <stdarg.h> -#include "common/os.h" - -////////////////////////////////////////////////////////////////////////// -/// @brief called in JIT code, inserted by PRINT -/// output to both stdout and visual studio debug console -extern "C" void CallPrint(const char* fmt, ...) 
-{ - va_list args; - va_start(args, fmt); - vprintf(fmt, args); - -#if defined(_WIN32) - char strBuf[1024]; - vsnprintf_s(strBuf, _TRUNCATE, fmt, args); - OutputDebugStringA(strBuf); -#endif - - va_end(args); -} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp deleted file mode 100644 index 925d57f5d47..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * @file Scatter.cpp - * - * @brief Shader support library implementation for scatter emulation - * - * Notes: - * - ******************************************************************************/ -#include <stdarg.h> -#include "common/os.h" -#include "common/simdlib.hpp" - -extern "C" void ScatterPS_256(uint8_t* pBase, SIMD256::Integer vIndices, SIMD256::Float vSrc, uint8_t mask, uint32_t scale) -{ - OSALIGN(float, 32) src[8]; - OSALIGN(uint32_t, 32) indices[8]; - - SIMD256::store_ps(src, vSrc); - SIMD256::store_si((SIMD256::Integer*)indices, vIndices); - - unsigned long index; - while (_BitScanForward(&index, mask)) - { - mask &= ~(1 << index); - - *(float*)(pBase + indices[index] * scale) = src[index]; - } -} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp deleted file mode 100644 index 72e1261a4b3..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp +++ /dev/null @@ -1,379 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file streamout_jit.cpp - * - * @brief Implementation of the streamout jitter - * - * Notes: - * - ******************************************************************************/ -#include "jit_pch.hpp" -#include "builder_gfx_mem.h" -#include "jit_api.h" -#include "streamout_jit.h" -#include "gen_state_llvm.h" -#include "functionpasses/passes.h" - -using namespace llvm; -using namespace SwrJit; - -////////////////////////////////////////////////////////////////////////// -/// Interface to Jitting a fetch shader -////////////////////////////////////////////////////////////////////////// -struct StreamOutJit : public BuilderGfxMem -{ - StreamOutJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr){}; - - // returns pointer to SWR_STREAMOUT_BUFFER - Value* getSOBuffer(Value* pSoCtx, uint32_t buffer) - { - return LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer}); - } - - ////////////////////////////////////////////////////////////////////////// - // @brief checks if streamout buffer is oob - // @return <i1> true/false - Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer) - { - Value* returnMask = C(false); - - Value* pBuf = getSOBuffer(pSoCtx, buffer); - - // load enable - // @todo bool data types should generate <i1> llvm type - Value* enabled = TRUNC(LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_enable}), IRB()->getInt1Ty()); - - // load buffer size - Value* bufferSize = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_bufferSize}); - - // load current streamOffset - 
Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset}); - - // load buffer pitch - Value* pitch = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch}); - - // buffer is considered oob if in use in a decl but not enabled - returnMask = OR(returnMask, NOT(enabled)); - - // buffer is oob if cannot fit a prims worth of verts - Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim))); - returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize)); - - return returnMask; - } - - ////////////////////////////////////////////////////////////////////////// - // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector, - // packing the active mask bits - // ex. bitmask 0011 -> (0, 1, 0, 0) - // bitmask 1000 -> (3, 0, 0, 0) - // bitmask 1100 -> (2, 3, 0, 0) - Value* PackMask(uint32_t bitmask) - { - std::vector<Constant*> indices(4, C(0)); - unsigned long index; - uint32_t elem = 0; - while (_BitScanForward(&index, bitmask)) - { - indices[elem++] = C((int)index); - bitmask &= ~(1 << index); - } - - return ConstantVector::get(indices); - } - - ////////////////////////////////////////////////////////////////////////// - // @brief convert scalar bitmask to <4xfloat> bitmask - Value* ToMask(uint32_t bitmask) - { - std::vector<Constant*> indices; - for (uint32_t i = 0; i < 4; ++i) - { - if (bitmask & (1 << i)) - { - indices.push_back(C(true)); - } - else - { - indices.push_back(C(false)); - } - } - return ConstantVector::get(indices); - } - - ////////////////////////////////////////////////////////////////////////// - // @brief processes a single decl from the streamout stream. 
Reads 4 components from the input - // stream and writes N components to the output buffer given the componentMask or if - // a hole, just increments the buffer pointer - // @param pStream - pointer to current attribute - // @param pOutBuffers - pointers to the current location of each output buffer - // @param decl - input decl - void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl) - { - uint32_t numComponents = _mm_popcnt_u32(decl.componentMask); - uint32_t packedMask = (1 << numComponents) - 1; - if (!decl.hole) - { - // increment stream pointer to correct slot - Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot)); - - // load 4 components from stream - Type* simd4Ty = getVectorType(IRB()->getFloatTy(), 4); - Type* simd4PtrTy = PointerType::get(simd4Ty, 0); - pAttrib = BITCAST(pAttrib, simd4PtrTy); - Value* vattrib = LOAD(pAttrib); - - // shuffle/pack enabled components - Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask)); - - // store to output buffer - // cast SO buffer to i8*, needed by maskstore - Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(simd4Ty, 0)); - - // cast input to <4xfloat> - Value* src = BITCAST(vpackedAttrib, simd4Ty); - - // cast mask to <4xi1> - Value* mask = ToMask(packedMask); - MASKED_STORE(src, pOut, 4, mask, PointerType::get(simd4Ty, 0), MEM_CLIENT::GFX_MEM_CLIENT_STREAMOUT); - } - - // increment SO buffer - pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents)); - } - - ////////////////////////////////////////////////////////////////////////// - // @brief builds a single vertex worth of data for the given stream - // @param streamState - state for this stream - // @param pCurVertex - pointer to src stream vertex data - // @param pOutBuffer - pointers to up to 4 SO buffers - void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4]) - { - for (uint32_t d = 0; d < streamState.numDecls; ++d) 
- { - const STREAMOUT_DECL& decl = streamState.decl[d]; - buildDecl(pCurVertex, pOutBuffer, decl); - } - } - - void buildStream(const STREAMOUT_COMPILE_STATE& state, - const STREAMOUT_STREAM& streamState, - Value* pSoCtx, - BasicBlock* returnBB, - Function* soFunc) - { - // get list of active SO buffers - std::unordered_set<uint32_t> activeSOBuffers; - for (uint32_t d = 0; d < streamState.numDecls; ++d) - { - const STREAMOUT_DECL& decl = streamState.decl[d]; - activeSOBuffers.insert(decl.bufferIndex); - } - - // always increment numPrimStorageNeeded - Value* numPrimStorageNeeded = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded}); - numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1)); - STORE(numPrimStorageNeeded, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded}); - - // check OOB on active SO buffers. If any buffer is out of bound, don't write - // the primitive to any buffer - Value* oobMask = C(false); - for (uint32_t buffer : activeSOBuffers) - { - oobMask = OR(oobMask, oob(state, pSoCtx, buffer)); - } - - BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc); - - // early out if OOB - COND_BR(oobMask, returnBB, validBB); - - IRB()->SetInsertPoint(validBB); - - Value* numPrimsWritten = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten}); - numPrimsWritten = ADD(numPrimsWritten, C(1)); - STORE(numPrimsWritten, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten}); - - // compute start pointer for each output buffer - Value* pOutBuffer[4]; - Value* pOutBufferStartVertex[4]; - Value* outBufferPitch[4]; - for (uint32_t b : activeSOBuffers) - { - Value* pBuf = getSOBuffer(pSoCtx, b); - Value* pData = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pBuffer}); - Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset}); - pOutBuffer[b] = GEP(pData, streamOffset, PointerType::get(IRB()->getInt32Ty(), 0)); - pOutBufferStartVertex[b] = pOutBuffer[b]; - - outBufferPitch[b] = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch}); - } - 
- // loop over the vertices of the prim - Value* pStreamData = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pPrimData}); - for (uint32_t v = 0; v < state.numVertsPerPrim; ++v) - { - buildVertex(streamState, pStreamData, pOutBuffer); - - // increment stream and output buffer pointers - // stream verts are always 32*4 dwords apart - pStreamData = GEP(pStreamData, C(SWR_VTX_NUM_SLOTS * 4)); - - // output buffers offset using pitch in buffer state - for (uint32_t b : activeSOBuffers) - { - pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]); - pOutBuffer[b] = pOutBufferStartVertex[b]; - } - } - - // update each active buffer's streamOffset - for (uint32_t b : activeSOBuffers) - { - Value* pBuf = getSOBuffer(pSoCtx, b); - Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset}); - streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b])); - STORE(streamOffset, pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset}); - } - } - - Function* Create(const STREAMOUT_COMPILE_STATE& state) - { - std::stringstream fnName("SO_", - std::ios_base::in | std::ios_base::out | std::ios_base::ate); - fnName << ComputeCRC(0, &state, sizeof(state)); - - std::vector<Type*> args{ - mInt8PtrTy, - mInt8PtrTy, - PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT* - }; - - FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); - Function* soFunc = Function::Create( - fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); - - soFunc->getParent()->setModuleIdentifier(soFunc->getName()); - - // create return basic block - BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc); - BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc); - - IRB()->SetInsertPoint(entry); - - // arguments - auto argitr = soFunc->arg_begin(); - - Value* privateContext = &*argitr++; - privateContext->setName("privateContext"); - SetPrivateContext(privateContext); - - 
mpWorkerData = &*argitr; - ++argitr; - mpWorkerData->setName("pWorkerData"); - - Value* pSoCtx = &*argitr++; - pSoCtx->setName("pSoCtx"); - - const STREAMOUT_STREAM& streamState = state.stream; - buildStream(state, streamState, pSoCtx, returnBB, soFunc); - - BR(returnBB); - - IRB()->SetInsertPoint(returnBB); - RET_VOID(); - - JitManager::DumpToFile(soFunc, "SoFunc"); - - ::FunctionPassManager passes(JM()->mpCurrentModule); - - passes.add(createBreakCriticalEdgesPass()); - passes.add(createCFGSimplificationPass()); - passes.add(createEarlyCSEPass()); - passes.add(createPromoteMemoryToRegisterPass()); - passes.add(createCFGSimplificationPass()); - passes.add(createEarlyCSEPass()); - passes.add(createInstructionCombiningPass()); -#if LLVM_VERSION_MAJOR <= 11 - passes.add(createConstantPropagationPass()); -#endif - passes.add(createSCCPPass()); - passes.add(createAggressiveDCEPass()); - - passes.add(createLowerX86Pass(this)); - - passes.run(*soFunc); - - JitManager::DumpToFile(soFunc, "SoFunc_optimized"); - - - return soFunc; - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief JITs from streamout shader IR -/// @param hJitMgr - JitManager handle -/// @param func - LLVM function IR -/// @return PFN_SO_FUNC - pointer to SOS function -PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc) -{ - llvm::Function* func = (llvm::Function*)hFunc; - JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); - PFN_SO_FUNC pfnStreamOut; - pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); - // MCJIT finalizes modules the first time you JIT code from them. 
After finalized, you cannot - // add new IR to the module - pJitMgr->mIsModuleFinalized = true; - - pJitMgr->DumpAsm(func, "SoFunc_optimized"); - - - return pfnStreamOut; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief JIT compiles streamout shader -/// @param hJitMgr - JitManager handle -/// @param state - SO state to build function from -extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, - const STREAMOUT_COMPILE_STATE& state) -{ - JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); - - STREAMOUT_COMPILE_STATE soState = state; - if (soState.offsetAttribs) - { - for (uint32_t i = 0; i < soState.stream.numDecls; ++i) - { - soState.stream.decl[i].attribSlot -= soState.offsetAttribs; - } - } - - pJitMgr->SetupNewModule(); - - StreamOutJit theJit(pJitMgr); - HANDLE hFunc = theJit.Create(soState); - - return JitStreamoutFunc(hJitMgr, hFunc); -} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h deleted file mode 100644 index d76fcdd5742..00000000000 --- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h +++ /dev/null @@ -1,101 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file streamout_jit.h - * - * @brief Definition of the streamout jitter - * - * Notes: - * - ******************************************************************************/ -#pragma once - -#include "common/formats.h" -#include "core/state.h" - -////////////////////////////////////////////////////////////////////////// -/// STREAMOUT_DECL - Stream decl -////////////////////////////////////////////////////////////////////////// -struct STREAMOUT_DECL -{ - // Buffer that stream maps to. - DWORD bufferIndex; - - // attribute to stream - uint32_t attribSlot; - - // attribute component mask - uint32_t componentMask; - - // indicates this decl is a hole - bool hole; -}; - -////////////////////////////////////////////////////////////////////////// -/// STREAMOUT_STREAM - Stream decls -////////////////////////////////////////////////////////////////////////// -struct STREAMOUT_STREAM -{ - // number of decls for this stream - uint32_t numDecls; - - // array of numDecls decls - STREAMOUT_DECL decl[128]; -}; - -////////////////////////////////////////////////////////////////////////// -/// State required for streamout jit -////////////////////////////////////////////////////////////////////////// -struct STREAMOUT_COMPILE_STATE -{ - // number of verts per primitive - uint32_t numVertsPerPrim; - uint32_t - offsetAttribs; ///< attrib offset to subtract from all STREAMOUT_DECL::attribSlot values. 
- - uint64_t streamMask; - - // stream decls - STREAMOUT_STREAM stream; - - bool operator==(const STREAMOUT_COMPILE_STATE& other) const - { - if (numVertsPerPrim != other.numVertsPerPrim) - return false; - if (stream.numDecls != other.stream.numDecls) - return false; - - for (uint32_t i = 0; i < stream.numDecls; ++i) - { - if (stream.decl[i].bufferIndex != other.stream.decl[i].bufferIndex) - return false; - if (stream.decl[i].attribSlot != other.stream.decl[i].attribSlot) - return false; - if (stream.decl[i].componentMask != other.stream.decl[i].componentMask) - return false; - if (stream.decl[i].hole != other.stream.decl[i].hole) - return false; - } - - return true; - } -}; diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp deleted file mode 100644 index 6a528b6a0f2..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp +++ /dev/null @@ -1,305 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file ClearTile.cpp -* -* @brief Functionality for ClearTile. StoreHotTileClear clears a single macro -* tile in the destination. -* -******************************************************************************/ -#include "common/os.h" -#include "core/context.h" -#include "common/formats.h" -#include "memory/TilingFunctions.h" -#include "memory/tilingtraits.h" -#include "memory/Convert.h" - -typedef void(*PFN_STORE_TILES_CLEAR)(const float*, SWR_SURFACE_STATE*, UINT, UINT, uint32_t); - -////////////////////////////////////////////////////////////////////////// -/// Clear Raster Tile Function Tables. -////////////////////////////////////////////////////////////////////////// -static PFN_STORE_TILES_CLEAR sStoreTilesClearColorTable[NUM_SWR_FORMATS]; - -static PFN_STORE_TILES_CLEAR sStoreTilesClearDepthTable[NUM_SWR_FORMATS]; - -////////////////////////////////////////////////////////////////////////// -/// StoreRasterTileClear -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct StoreRasterTileClear -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pColor - Pointer to clear color. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void StoreClear( - const uint8_t* dstFormattedColor, - UINT dstBytesPerPixel, - SWR_SURFACE_STATE* pDstSurface, - UINT x, UINT y, // (x, y) pixel coordinate to start of raster tile. - uint32_t renderTargetArrayIndex) - { - // If we're outside of the surface, stop. 
- uint32_t lodWidth = std::max<uint32_t>(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max<uint32_t>(pDstSurface->height >> pDstSurface->lod, 1U); - if (x >= lodWidth || y >= lodHeight) - return; - - // Compute destination address for raster tile. - uint8_t* pDstTile = (uint8_t*)ComputeSurfaceAddress<false, false>( - x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, - 0, // sampleNum - pDstSurface->lod, - pDstSurface); - - // start of first row - uint8_t* pDst = pDstTile; - UINT dstBytesPerRow = 0; - - // For each raster tile pixel in row 0 (rx, 0) - for (UINT rx = 0; (rx < KNOB_TILE_X_DIM) && ((x + rx) < lodWidth); ++rx) - { - memcpy(pDst, dstFormattedColor, dstBytesPerPixel); - - // Increment pointer to next pixel in row. - pDst += dstBytesPerPixel; - dstBytesPerRow += dstBytesPerPixel; - } - - // start of second row - pDst = pDstTile + pDstSurface->pitch; - - // For each remaining row in the rest of the raster tile - for (UINT ry = 1; (ry < KNOB_TILE_Y_DIM) && ((y + ry) < lodHeight); ++ry) - { - // copy row - memcpy(pDst, pDstTile, dstBytesPerRow); - - // Increment pointer to first pixel in next row. - pDst += pDstSurface->pitch; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// StoreMacroTileClear - Stores a macro tile clear to its raster tiles. -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct StoreMacroTileClear -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores a macrotile to the destination surface. - /// @param pColor - Pointer to color to write to pixels. 
- /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to macro tile - static void StoreClear( - const float *pColor, - SWR_SURFACE_STATE* pDstSurface, - UINT x, UINT y, uint32_t renderTargetArrayIndex) - { - UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8); - - uint8_t dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel - - float srcColor[4]; - - for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp) - { - srcColor[comp] = pColor[FormatTraits<DstFormat>::swizzle(comp)]; - } - - // using this helper function, but the Tiling Traits is unused inside it so just using a dummy value - ConvertPixelFromFloat<DstFormat>(dstFormattedColor, srcColor); - - // Store each raster tile from the hot tile to the destination surface. - // TODO: Put in check for partial coverage on x/y -- SWR_ASSERT if it happens. - // Intent is for this function to only handle full tiles. - for (UINT row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (UINT col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - StoreRasterTileClear<SrcFormat, DstFormat>::StoreClear(dstFormattedColor, dstBytesPerPixel, pDstSurface, (x + col), (y + row), renderTargetArrayIndex); - } - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Writes clear color to every pixel of a render surface -/// @param hPrivateContext - Handle to private DC -/// @param renderTargetIndex - Index to destination render target -/// @param x, y - Coordinates to raster tile. 
-/// @param pClearColor - Pointer to clear color -void SwrStoreHotTileClear( - HANDLE hWorkerPrivateData, - SWR_SURFACE_STATE *pDstSurface, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - UINT x, - UINT y, - uint32_t renderTargetArrayIndex, - const float* pClearColor) -{ - PFN_STORE_TILES_CLEAR pfnStoreTilesClear = NULL; - - if (renderTargetIndex == SWR_ATTACHMENT_STENCIL) - { - SWR_ASSERT(pDstSurface->format == R8_UINT); - pfnStoreTilesClear = StoreMacroTileClear<R8_UINT, R8_UINT>::StoreClear; - } - else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH) - { - pfnStoreTilesClear = sStoreTilesClearDepthTable[pDstSurface->format]; - } - else - { - pfnStoreTilesClear = sStoreTilesClearColorTable[pDstSurface->format]; - } - - SWR_ASSERT(pfnStoreTilesClear != NULL); - - // Store a macro tile. - /// @todo Once all formats are supported then if check can go away. This is to help us near term to make progress. - if (pfnStoreTilesClear != NULL) - { - pfnStoreTilesClear(pClearColor, pDstSurface, x, y, renderTargetArrayIndex); - } -} - -////////////////////////////////////////////////////////////////////////// -/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. 
-#define INIT_STORE_TILES_CLEAR_COLOR_TABLE() \ - memset(sStoreTilesClearColorTable, 0, sizeof(sStoreTilesClearColorTable)); \ - \ - sStoreTilesClearColorTable[R32G32B32A32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::StoreClear; \ - sStoreTilesClearColorTable[R32G32B32A32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_SINT>::StoreClear; \ - sStoreTilesClearColorTable[R32G32B32A32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_UINT>::StoreClear; \ - sStoreTilesClearColorTable[R32G32B32X32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::StoreClear; \ - sStoreTilesClearColorTable[R32G32B32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_FLOAT>::StoreClear; \ - sStoreTilesClearColorTable[R32G32B32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_SINT>::StoreClear; \ - sStoreTilesClearColorTable[R32G32B32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_UINT>::StoreClear; \ - sStoreTilesClearColorTable[R16G16B16A16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[R16G16B16A16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::StoreClear; \ - sStoreTilesClearColorTable[R16G16B16A16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_SINT>::StoreClear; \ - sStoreTilesClearColorTable[R16G16B16A16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_UINT>::StoreClear; \ - sStoreTilesClearColorTable[R16G16B16A16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::StoreClear; \ - sStoreTilesClearColorTable[R32G32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_FLOAT>::StoreClear; \ - sStoreTilesClearColorTable[R32G32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_SINT>::StoreClear; \ - sStoreTilesClearColorTable[R32G32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_UINT>::StoreClear; \ - sStoreTilesClearColorTable[R16G16B16X16_UNORM] 
= StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[R16G16B16X16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::StoreClear; \ - sStoreTilesClearColorTable[B8G8R8A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[B8G8R8A8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[R10G10B10A2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[R10G10B10A2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[R10G10B10A2_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreClear; \ - sStoreTilesClearColorTable[R8G8B8A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[R8G8B8A8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[R8G8B8A8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::StoreClear; \ - sStoreTilesClearColorTable[R8G8B8A8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_SINT>::StoreClear; \ - sStoreTilesClearColorTable[R8G8B8A8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UINT>::StoreClear; \ - sStoreTilesClearColorTable[R16G16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[R16G16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_SNORM>::StoreClear; \ - sStoreTilesClearColorTable[R16G16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_SINT>::StoreClear; \ - sStoreTilesClearColorTable[R16G16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_UINT>::StoreClear; \ - sStoreTilesClearColorTable[R16G16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_FLOAT>::StoreClear; \ - 
sStoreTilesClearColorTable[B10G10R10A2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[B10G10R10A2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[R11G11B10_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreClear; \ - sStoreTilesClearColorTable[R32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_SINT>::StoreClear; \ - sStoreTilesClearColorTable[R32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_UINT>::StoreClear; \ - sStoreTilesClearColorTable[R32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_FLOAT>::StoreClear; \ - sStoreTilesClearColorTable[A32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, A32_FLOAT>::StoreClear; \ - sStoreTilesClearColorTable[B8G8R8X8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[B8G8R8X8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[R8G8B8X8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[R8G8B8X8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[B10G10R10X2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[B5G6R5_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G6R5_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[B5G6R5_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[B5G5R5A1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[B5G5R5A1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[B4G4R4A4_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, 
B4G4R4A4_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[B4G4R4A4_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[R8G8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[R8G8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_SNORM>::StoreClear; \ - sStoreTilesClearColorTable[R8G8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_SINT>::StoreClear; \ - sStoreTilesClearColorTable[R8G8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_UINT>::StoreClear; \ - sStoreTilesClearColorTable[R16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[R16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_SNORM>::StoreClear; \ - sStoreTilesClearColorTable[R16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_SINT>::StoreClear; \ - sStoreTilesClearColorTable[R16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_UINT>::StoreClear; \ - sStoreTilesClearColorTable[R16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_FLOAT>::StoreClear; \ - sStoreTilesClearColorTable[A16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, A16_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[A16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, A16_FLOAT>::StoreClear; \ - sStoreTilesClearColorTable[B5G5R5X1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[B5G5R5X1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[R8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[R8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_SNORM>::StoreClear; \ - sStoreTilesClearColorTable[R8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_SINT>::StoreClear; \ - sStoreTilesClearColorTable[R8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, 
R8_UINT>::StoreClear; \ - sStoreTilesClearColorTable[A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, A8_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[BC1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC1_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[BC2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC2_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[BC3_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC3_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[BC4_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC4_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[BC5_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC5_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[BC1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[BC2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[BC3_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[R8G8B8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[R8G8B8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SNORM>::StoreClear; \ - sStoreTilesClearColorTable[BC4_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC4_SNORM>::StoreClear; \ - sStoreTilesClearColorTable[BC5_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC5_SNORM>::StoreClear; \ - sStoreTilesClearColorTable[R16G16B16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_FLOAT>::StoreClear; \ - sStoreTilesClearColorTable[R16G16B16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_UNORM>::StoreClear; \ - sStoreTilesClearColorTable[R16G16B16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_SNORM>::StoreClear; \ - sStoreTilesClearColorTable[R8G8B8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::StoreClear; \ - sStoreTilesClearColorTable[R16G16B16_UINT] = 
StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_UINT>::StoreClear; \ - sStoreTilesClearColorTable[R16G16B16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_SINT>::StoreClear; \ - sStoreTilesClearColorTable[R10G10B10A2_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreClear; \ - sStoreTilesClearColorTable[R10G10B10A2_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreClear; \ - sStoreTilesClearColorTable[B10G10R10A2_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreClear; \ - sStoreTilesClearColorTable[B10G10R10A2_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreClear; \ - sStoreTilesClearColorTable[B10G10R10A2_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreClear; \ - sStoreTilesClearColorTable[R8G8B8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UINT>::StoreClear; \ - sStoreTilesClearColorTable[R8G8B8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SINT>::StoreClear; - -////////////////////////////////////////////////////////////////////////// -/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. 
-#define INIT_STORE_TILES_CLEAR_DEPTH_TABLE() \ - memset(sStoreTilesClearDepthTable, 0, sizeof(sStoreTilesClearDepthTable)); \ - \ - sStoreTilesClearDepthTable[R32_FLOAT] = StoreMacroTileClear<R32_FLOAT, R32_FLOAT>::StoreClear; \ - sStoreTilesClearDepthTable[R32_FLOAT_X8X24_TYPELESS] = StoreMacroTileClear<R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::StoreClear; \ - sStoreTilesClearDepthTable[R24_UNORM_X8_TYPELESS] = StoreMacroTileClear<R32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreClear; \ - sStoreTilesClearDepthTable[R16_UNORM] = StoreMacroTileClear<R32_FLOAT, R16_UNORM>::StoreClear; - -////////////////////////////////////////////////////////////////////////// -/// @brief Sets up tables for ClearTile -void InitSimClearTilesTable() -{ - INIT_STORE_TILES_CLEAR_COLOR_TABLE(); - INIT_STORE_TILES_CLEAR_DEPTH_TABLE(); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h deleted file mode 100644 index c8c6b30daff..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/Convert.h +++ /dev/null @@ -1,730 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file Convert.h -* -* @brief Conversion utility functions -* -******************************************************************************/ -#pragma once - -#if defined(_MSC_VER) -// disable "potential divide by 0" -#pragma warning(disable: 4723) -#endif - -#include <cmath> - -////////////////////////////////////////////////////////////////////////// -/// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision -/// float -/// @param val - 16-bit float -/// @todo Maybe move this outside of this file into a header? -static INLINE float ConvertSmallFloatTo32(UINT val) -{ - UINT result; - if ((val & 0x7fff) == 0) - { - result = ((uint32_t)(val & 0x8000)) << 16; - } - else if ((val & 0x7c00) == 0x7c00) - { - result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000; - result |= ((uint32_t)val & 0x8000) << 16; - } - else - { - uint32_t sign = (val & 0x8000) << 16; - uint32_t mant = (val & 0x3ff) << 13; - uint32_t exp = (val >> 10) & 0x1f; - if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals - { - mant <<= 1; - while (mant < (0x400 << 13)) - { - exp--; - mant <<= 1; - } - mant &= (0x3ff << 13); - } - exp = ((exp - 15 + 127) & 0xff) << 23; - result = sign | exp | mant; - } - - return *(float*)&result; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Convert an IEEE 754 32-bit single precision float to an -/// unsigned small float with 5 exponent bits and a variable -/// number of mantissa bits. 
-/// @param val - 32-bit float -/// @todo Maybe move this outside of this file into a header? -template<UINT numMantissaBits> -static UINT Convert32ToSmallFloat(float val) -{ - uint32_t sign, exp, mant; - uint32_t roundBits; - - // Extract the sign, exponent, and mantissa - UINT uf = *(UINT*)&val; - - sign = (uf & 0x80000000) >> 31; - exp = (uf & 0x7F800000) >> 23; - mant = uf & 0x007FFFFF; - - // 10/11 bit floats are unsigned. Negative values are clamped to 0. - if (sign != 0) - { - exp = mant = 0; - } - // Check for out of range - else if ((exp == 0xFF) && (mant != 0)) // NaN - { - exp = 0x1F; - mant = 1 << numMantissaBits; - } - else if ((exp == 0xFF) && (mant == 0)) // INF - { - exp = 0x1F; - mant = 0; - } - else if (exp > (0x70 + 0x1E)) // Too big to represent - { - exp = 0x1Eu; - mant = (1 << numMantissaBits) - 1; // 0x3F for 6 bit mantissa. - } - else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm - { - mant |= 0x00800000; - for (; exp <= 0x70; mant >>= 1, exp++) - ; - exp = 0; - mant = mant >> (23 - numMantissaBits); - } - else if (exp < 0x66) // Too small to represent -> Zero - { - exp = 0; - mant = 0; - } - else - { - // Saves bits that will be shifted off for rounding - roundBits = mant & 0x1FFFu; - // convert exponent and mantissa to 16 bit format - exp = exp - 0x70u; - mant = mant >> (23 - numMantissaBits); - - // Essentially RTZ, but round up if off by only 1 lsb - if (roundBits == 0x1FFFu) - { - mant++; - // check for overflow - if ((mant & (0x3 << numMantissaBits)) != 0) // 0x60 = 0x3 << (num Mantissa Bits) - exp++; - // make sure only the needed bits are used - mant &= (1 << numMantissaBits) - 1; - } - } - - UINT tmpVal = (exp << numMantissaBits) | mant; - return tmpVal; -} - -#if KNOB_ARCH == KNOB_ARCH_AVX -////////////////////////////////////////////////////////////////////////// -/// @brief Convert an IEEE 754 32-bit single precision float to an -/// 16 bit float with 5 exponent bits and a variable -/// number of mantissa bits. 
-/// @param val - 32-bit float -/// @todo Maybe move this outside of this file into a header? -static uint16_t Convert32To16Float(float val) -{ - uint32_t sign, exp, mant; - uint32_t roundBits; - - // Extract the sign, exponent, and mantissa - uint32_t uf = *(uint32_t*)&val; - sign = (uf & 0x80000000) >> 31; - exp = (uf & 0x7F800000) >> 23; - mant = uf & 0x007FFFFF; - - // Check for out of range - if (std::isnan(val)) - { - exp = 0x1F; - mant = 0x200; - sign = 1; // set the sign bit for NANs - } - else if (std::isinf(val)) - { - exp = 0x1f; - mant = 0x0; - } - else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value - { - exp = 0x1E; - mant = 0x3FF; - } - else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm - { - mant |= 0x00800000; - for (; exp <= 0x70; mant >>= 1, exp++) - ; - exp = 0; - mant = mant >> 13; - } - else if (exp < 0x66) // Too small to represent -> Zero - { - exp = 0; - mant = 0; - } - else - { - // Saves bits that will be shifted off for rounding - roundBits = mant & 0x1FFFu; - // convert exponent and mantissa to 16 bit format - exp = exp - 0x70; - mant = mant >> 13; - - // Essentially RTZ, but round up if off by only 1 lsb - if (roundBits == 0x1FFFu) - { - mant++; - // check for overflow - if ((mant & 0xC00u) != 0) - exp++; - // make sure only the needed bits are used - mant &= 0x3FF; - } - } - - uint32_t tmpVal = (sign << 15) | (exp << 10) | mant; - return (uint16_t)tmpVal; -} -#endif - -////////////////////////////////////////////////////////////////////////// -/// @brief Retrieve color from hot tile source which is always float. -/// @param pDstPixel - Pointer to destination pixel. -/// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest). 
-template<SWR_FORMAT DstFormat> -static void ConvertPixelFromFloat( - uint8_t* pDstPixel, - const float srcPixel[4]) -{ - uint32_t outColor[4] = { 0 }; // typeless bits - - // Store component - for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp) - { - SWR_TYPE type = FormatTraits<DstFormat>::GetType(comp); - - float src = srcPixel[comp]; - - switch (type) - { - case SWR_TYPE_UNORM: - { - // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false. - src = (src != src) ? 0.0f : src; - - // Clamp [0, 1] - src = std::max(src, 0.0f); - src = std::min(src, 1.0f); - - // SRGB - if (FormatTraits<DstFormat>::isSRGB && comp != 3) - { - src = (src <= 0.0031308f) ? (12.92f * src) : (1.055f * powf(src, (1.0f / 2.4f)) - 0.055f); - } - - // Float scale to integer scale. - UINT scale = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1; - src = (float)scale * src; - src = roundf(src); - outColor[comp] = (UINT)src; // Drop fractional part. - break; - } - case SWR_TYPE_SNORM: - { - SWR_ASSERT(!FormatTraits<DstFormat>::isSRGB); - - // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false. - src = (src != src) ? 0.0f : src; - - // Clamp [-1, 1] - src = std::max(src, -1.0f); - src = std::min(src, 1.0f); - - // Float scale to integer scale. - UINT scale = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1; - src = (float)scale * src; - - // Round - src += (src >= 0) ? 0.5f : -0.5f; - - INT out = (INT)src; - - outColor[comp] = *(UINT*)&out; - - break; - } - case SWR_TYPE_UINT: - { - ///@note The *(UINT*)& is currently necessary as the hot tile appears to always be float. - // However, the number in the hot tile should be unsigned integer. So doing this - // to preserve bits intead of doing a float -> integer conversion. 
- if (FormatTraits<DstFormat>::GetBPC(comp) == 32) - { - outColor[comp] = *(UINT*)&src; - } - else - { - outColor[comp] = *(UINT*)&src; - UINT max = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1; // 2^numBits - 1 - - outColor[comp] = std::min(max, outColor[comp]); - } - break; - } - case SWR_TYPE_SINT: - { - if (FormatTraits<DstFormat>::GetBPC(comp) == 32) - { - outColor[comp] = *(UINT*)&src; - } - else - { - INT out = *(INT*)&src; // Hot tile format is SINT? - INT max = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1; - INT min = -1 - max; - - ///@note The output is unsigned integer (bag of bits) and so performing - // the clamping here based on range of output component. Also, manually adding - // the sign bit in the appropriate spot. Maybe a better way? - out = std::max(out, min); - out = std::min(out, max); - - outColor[comp] = *(UINT*)&out; - } - break; - } - case SWR_TYPE_FLOAT: - { - if (FormatTraits<DstFormat>::GetBPC(comp) == 16) - { - // Convert from 32-bit float to 16-bit float using _mm_cvtps_ph - // @todo 16bit float instruction support is orthogonal to avx support. need to - // add check for F16C support instead. 
-#if KNOB_ARCH >= KNOB_ARCH_AVX2 - __m128 src128 = _mm_set1_ps(src); - __m128i srci128 = _mm_cvtps_ph(src128, _MM_FROUND_TRUNC); - UINT value = _mm_extract_epi16(srci128, 0); -#else - UINT value = Convert32To16Float(src); -#endif - - outColor[comp] = value; - } - else if (FormatTraits<DstFormat>::GetBPC(comp) == 11) - { - outColor[comp] = Convert32ToSmallFloat<6>(src); - } - else if (FormatTraits<DstFormat>::GetBPC(comp) == 10) - { - outColor[comp] = Convert32ToSmallFloat<5>(src); - } - else - { - outColor[comp] = *(UINT*)&src; - } - - break; - } - default: - SWR_INVALID("Invalid type: %d", type); - break; - } - } - - typename FormatTraits<DstFormat>::FormatT* pPixel = (typename FormatTraits<DstFormat>::FormatT*)pDstPixel; - - switch (FormatTraits<DstFormat>::numComps) - { - case 4: - pPixel->a = outColor[3]; - case 3: - pPixel->b = outColor[2]; - case 2: - pPixel->g = outColor[1]; - case 1: - pPixel->r = outColor[0]; - break; - default: - SWR_INVALID("Invalid # of comps: %d", FormatTraits<DstFormat>::numComps); - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Convert pixel in any format to float32 -/// @param pDstPixel - Pointer to destination pixel. 
-/// @param srcPixel - Pointer to source pixel -template<SWR_FORMAT SrcFormat> -INLINE static void ConvertPixelToFloat( - float dstPixel[4], - const uint8_t* pSrc) -{ - uint32_t srcColor[4]; // typeless bits - - // unpack src pixel - typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc; - - // apply format defaults - for (uint32_t comp = 0; comp < 4; ++comp) - { - uint32_t def = FormatTraits<SrcFormat>::GetDefault(comp); - dstPixel[comp] = *(float*)&def; - } - - // load format data - switch (FormatTraits<SrcFormat>::numComps) - { - case 4: - srcColor[3] = pPixel->a; - case 3: - srcColor[2] = pPixel->b; - case 2: - srcColor[1] = pPixel->g; - case 1: - srcColor[0] = pPixel->r; - break; - default: - SWR_INVALID("Invalid # of comps: %d", FormatTraits<SrcFormat>::numComps); - } - - // Convert components - for (uint32_t comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp) - { - SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp); - - uint32_t src = srcColor[comp]; - - switch (type) - { - case SWR_TYPE_UNORM: - { - float dst; - if (FormatTraits<SrcFormat>::isSRGB && comp != 3) - { - dst = *(float*)&srgb8Table[src]; - } - else - { - // component sizes > 16 must use fp divide to maintain ulp requirements - if (FormatTraits<SrcFormat>::GetBPC(comp) > 16) - { - dst = (float)src / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1); - } - else - { - const float scale = (1.0f / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1)); - dst = (float)src * scale; - } - } - dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst; - break; - } - case SWR_TYPE_SNORM: - { - SWR_ASSERT(!FormatTraits<SrcFormat>::isSRGB); - - float dst; - if (src == 0x10) - { - dst = -1.0f; - } - else - { - switch (FormatTraits<SrcFormat>::GetBPC(comp)) - { - case 8: - dst = (float)((int8_t)src); - break; - case 16: - dst = (float)((int16_t)src); - break; - case 32: - dst = (float)((int32_t)src); - break; - default: - assert(0 && 
"attempted to load from SNORM with unsupported bpc"); - dst = 0.0f; - break; - } - dst = dst * (1.0f / ((1 << (FormatTraits<SrcFormat>::GetBPC(comp) - 1)) - 1)); - } - dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst; - break; - } - case SWR_TYPE_UINT: - { - uint32_t dst = (uint32_t)src; - dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst; - break; - } - case SWR_TYPE_SINT: - { - int dst; - switch (FormatTraits<SrcFormat>::GetBPC(comp)) - { - case 8: - dst = (int8_t)src; - break; - case 16: - dst = (int16_t)src; - break; - case 32: - dst = (int32_t)src; - break; - default: - assert(0 && "attempted to load from SINT with unsupported bpc"); - dst = 0; - break; - } - dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst; - break; - } - case SWR_TYPE_FLOAT: - { - float dst; - if (FormatTraits<SrcFormat>::GetBPC(comp) == 16) - { -#if KNOB_ARCH >= KNOB_ARCH_AVX2 - // Convert from 16-bit float to 32-bit float using _mm_cvtph_ps - // @todo 16bit float instruction support is orthogonal to avx support. need to - // add check for F16C support instead. 
- __m128i src128 = _mm_set1_epi32(src); - __m128 res = _mm_cvtph_ps(src128); - _mm_store_ss(&dst, res); -#else - dst = ConvertSmallFloatTo32(src); -#endif - } - else if (FormatTraits<SrcFormat>::GetBPC(comp) == 11) - { - dst = ConvertSmallFloatTo32(src << 4); - } - else if (FormatTraits<SrcFormat>::GetBPC(comp) == 10) - { - dst = ConvertSmallFloatTo32(src << 5); - } - else - { - dst = *(float*)&src; - } - - dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst; - break; - } - default: - SWR_INVALID("Invalid type: %d", type); - break; - } - } -} - -// non-templated version of conversion functions -INLINE static void ConvertPixelFromFloat( - SWR_FORMAT format, - uint8_t* pDst, - const float srcPixel[4]) -{ - switch (format) - { - case R32G32B32A32_FLOAT: ConvertPixelFromFloat<R32G32B32A32_FLOAT>(pDst, srcPixel); break; - case R32G32B32A32_SINT: ConvertPixelFromFloat<R32G32B32A32_SINT>(pDst, srcPixel); break; - case R32G32B32A32_UINT: ConvertPixelFromFloat<R32G32B32A32_UINT>(pDst, srcPixel); break; - case R32G32B32X32_FLOAT: ConvertPixelFromFloat<R32G32B32X32_FLOAT>(pDst, srcPixel); break; - case R32G32B32A32_SSCALED: ConvertPixelFromFloat<R32G32B32A32_SSCALED>(pDst, srcPixel); break; - case R32G32B32A32_USCALED: ConvertPixelFromFloat<R32G32B32A32_USCALED>(pDst, srcPixel); break; - case R32G32B32_FLOAT: ConvertPixelFromFloat<R32G32B32_FLOAT>(pDst, srcPixel); break; - case R32G32B32_SINT: ConvertPixelFromFloat<R32G32B32_SINT>(pDst, srcPixel); break; - case R32G32B32_UINT: ConvertPixelFromFloat<R32G32B32_UINT>(pDst, srcPixel); break; - case R32G32B32_SSCALED: ConvertPixelFromFloat<R32G32B32_SSCALED>(pDst, srcPixel); break; - case R32G32B32_USCALED: ConvertPixelFromFloat<R32G32B32_USCALED>(pDst, srcPixel); break; - case R16G16B16A16_UNORM: ConvertPixelFromFloat<R16G16B16A16_UNORM>(pDst, srcPixel); break; - case R16G16B16A16_SNORM: ConvertPixelFromFloat<R16G16B16A16_SNORM>(pDst, srcPixel); break; - case R16G16B16A16_SINT: 
ConvertPixelFromFloat<R16G16B16A16_SINT>(pDst, srcPixel); break; - case R16G16B16A16_UINT: ConvertPixelFromFloat<R16G16B16A16_UINT>(pDst, srcPixel); break; - case R16G16B16A16_FLOAT: ConvertPixelFromFloat<R16G16B16A16_FLOAT>(pDst, srcPixel); break; - case R32G32_FLOAT: ConvertPixelFromFloat<R32G32_FLOAT>(pDst, srcPixel); break; - case R32G32_SINT: ConvertPixelFromFloat<R32G32_SINT>(pDst, srcPixel); break; - case R32G32_UINT: ConvertPixelFromFloat<R32G32_UINT>(pDst, srcPixel); break; - case R32_FLOAT_X8X24_TYPELESS: ConvertPixelFromFloat<R32_FLOAT_X8X24_TYPELESS>(pDst, srcPixel); break; - case X32_TYPELESS_G8X24_UINT: ConvertPixelFromFloat<X32_TYPELESS_G8X24_UINT>(pDst, srcPixel); break; - case L32A32_FLOAT: ConvertPixelFromFloat<L32A32_FLOAT>(pDst, srcPixel); break; - case R16G16B16X16_UNORM: ConvertPixelFromFloat<R16G16B16X16_UNORM>(pDst, srcPixel); break; - case R16G16B16X16_FLOAT: ConvertPixelFromFloat<R16G16B16X16_FLOAT>(pDst, srcPixel); break; - case L32X32_FLOAT: ConvertPixelFromFloat<L32X32_FLOAT>(pDst, srcPixel); break; - case I32X32_FLOAT: ConvertPixelFromFloat<I32X32_FLOAT>(pDst, srcPixel); break; - case R16G16B16A16_SSCALED: ConvertPixelFromFloat<R16G16B16A16_SSCALED>(pDst, srcPixel); break; - case R16G16B16A16_USCALED: ConvertPixelFromFloat<R16G16B16A16_USCALED>(pDst, srcPixel); break; - case R32G32_SSCALED: ConvertPixelFromFloat<R32G32_SSCALED>(pDst, srcPixel); break; - case R32G32_USCALED: ConvertPixelFromFloat<R32G32_USCALED>(pDst, srcPixel); break; - case B8G8R8A8_UNORM: ConvertPixelFromFloat<B8G8R8A8_UNORM>(pDst, srcPixel); break; - case B8G8R8A8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8A8_UNORM_SRGB>(pDst, srcPixel); break; - case R10G10B10A2_UNORM: ConvertPixelFromFloat<R10G10B10A2_UNORM>(pDst, srcPixel); break; - case R10G10B10A2_UNORM_SRGB: ConvertPixelFromFloat<R10G10B10A2_UNORM_SRGB>(pDst, srcPixel); break; - case R10G10B10A2_UINT: ConvertPixelFromFloat<R10G10B10A2_UINT>(pDst, srcPixel); break; - case R8G8B8A8_UNORM: 
ConvertPixelFromFloat<R8G8B8A8_UNORM>(pDst, srcPixel); break; - case R8G8B8A8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8A8_UNORM_SRGB>(pDst, srcPixel); break; - case R8G8B8A8_SNORM: ConvertPixelFromFloat<R8G8B8A8_SNORM>(pDst, srcPixel); break; - case R8G8B8A8_SINT: ConvertPixelFromFloat<R8G8B8A8_SINT>(pDst, srcPixel); break; - case R8G8B8A8_UINT: ConvertPixelFromFloat<R8G8B8A8_UINT>(pDst, srcPixel); break; - case R16G16_UNORM: ConvertPixelFromFloat<R16G16_UNORM>(pDst, srcPixel); break; - case R16G16_SNORM: ConvertPixelFromFloat<R16G16_SNORM>(pDst, srcPixel); break; - case R16G16_SINT: ConvertPixelFromFloat<R16G16_SINT>(pDst, srcPixel); break; - case R16G16_UINT: ConvertPixelFromFloat<R16G16_UINT>(pDst, srcPixel); break; - case R16G16_FLOAT: ConvertPixelFromFloat<R16G16_FLOAT>(pDst, srcPixel); break; - case B10G10R10A2_UNORM: ConvertPixelFromFloat<B10G10R10A2_UNORM>(pDst, srcPixel); break; - case B10G10R10A2_UNORM_SRGB: ConvertPixelFromFloat<B10G10R10A2_UNORM_SRGB>(pDst, srcPixel); break; - case R11G11B10_FLOAT: ConvertPixelFromFloat<R11G11B10_FLOAT>(pDst, srcPixel); break; - case R10G10B10_FLOAT_A2_UNORM: ConvertPixelFromFloat<R10G10B10_FLOAT_A2_UNORM>(pDst, srcPixel); break; - case R32_SINT: ConvertPixelFromFloat<R32_SINT>(pDst, srcPixel); break; - case R32_UINT: ConvertPixelFromFloat<R32_UINT>(pDst, srcPixel); break; - case R32_FLOAT: ConvertPixelFromFloat<R32_FLOAT>(pDst, srcPixel); break; - case R24_UNORM_X8_TYPELESS: ConvertPixelFromFloat<R24_UNORM_X8_TYPELESS>(pDst, srcPixel); break; - case X24_TYPELESS_G8_UINT: ConvertPixelFromFloat<X24_TYPELESS_G8_UINT>(pDst, srcPixel); break; - case L32_UNORM: ConvertPixelFromFloat<L32_UNORM>(pDst, srcPixel); break; - case L16A16_UNORM: ConvertPixelFromFloat<L16A16_UNORM>(pDst, srcPixel); break; - case I24X8_UNORM: ConvertPixelFromFloat<I24X8_UNORM>(pDst, srcPixel); break; - case L24X8_UNORM: ConvertPixelFromFloat<L24X8_UNORM>(pDst, srcPixel); break; - case I32_FLOAT: ConvertPixelFromFloat<I32_FLOAT>(pDst, srcPixel); break; 
- case L32_FLOAT: ConvertPixelFromFloat<L32_FLOAT>(pDst, srcPixel); break; - case A32_FLOAT: ConvertPixelFromFloat<A32_FLOAT>(pDst, srcPixel); break; - case B8G8R8X8_UNORM: ConvertPixelFromFloat<B8G8R8X8_UNORM>(pDst, srcPixel); break; - case B8G8R8X8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8X8_UNORM_SRGB>(pDst, srcPixel); break; - case R8G8B8X8_UNORM: ConvertPixelFromFloat<R8G8B8X8_UNORM>(pDst, srcPixel); break; - case R8G8B8X8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8X8_UNORM_SRGB>(pDst, srcPixel); break; - case R9G9B9E5_SHAREDEXP: ConvertPixelFromFloat<R9G9B9E5_SHAREDEXP>(pDst, srcPixel); break; - case B10G10R10X2_UNORM: ConvertPixelFromFloat<B10G10R10X2_UNORM>(pDst, srcPixel); break; - case L16A16_FLOAT: ConvertPixelFromFloat<L16A16_FLOAT>(pDst, srcPixel); break; - case R10G10B10X2_USCALED: ConvertPixelFromFloat<R10G10B10X2_USCALED>(pDst, srcPixel); break; - case R8G8B8A8_SSCALED: ConvertPixelFromFloat<R8G8B8A8_SSCALED>(pDst, srcPixel); break; - case R8G8B8A8_USCALED: ConvertPixelFromFloat<R8G8B8A8_USCALED>(pDst, srcPixel); break; - case R16G16_SSCALED: ConvertPixelFromFloat<R16G16_SSCALED>(pDst, srcPixel); break; - case R16G16_USCALED: ConvertPixelFromFloat<R16G16_USCALED>(pDst, srcPixel); break; - case R32_SSCALED: ConvertPixelFromFloat<R32_SSCALED>(pDst, srcPixel); break; - case R32_USCALED: ConvertPixelFromFloat<R32_USCALED>(pDst, srcPixel); break; - case B5G6R5_UNORM: ConvertPixelFromFloat<B5G6R5_UNORM>(pDst, srcPixel); break; - case B5G6R5_UNORM_SRGB: ConvertPixelFromFloat<B5G6R5_UNORM_SRGB>(pDst, srcPixel); break; - case B5G5R5A1_UNORM: ConvertPixelFromFloat<B5G5R5A1_UNORM>(pDst, srcPixel); break; - case B5G5R5A1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5A1_UNORM_SRGB>(pDst, srcPixel); break; - case B4G4R4A4_UNORM: ConvertPixelFromFloat<B4G4R4A4_UNORM>(pDst, srcPixel); break; - case B4G4R4A4_UNORM_SRGB: ConvertPixelFromFloat<B4G4R4A4_UNORM_SRGB>(pDst, srcPixel); break; - case R8G8_UNORM: ConvertPixelFromFloat<R8G8_UNORM>(pDst, srcPixel); break; - case 
R8G8_SNORM: ConvertPixelFromFloat<R8G8_SNORM>(pDst, srcPixel); break; - case R8G8_SINT: ConvertPixelFromFloat<R8G8_SINT>(pDst, srcPixel); break; - case R8G8_UINT: ConvertPixelFromFloat<R8G8_UINT>(pDst, srcPixel); break; - case R16_UNORM: ConvertPixelFromFloat<R16_UNORM>(pDst, srcPixel); break; - case R16_SNORM: ConvertPixelFromFloat<R16_SNORM>(pDst, srcPixel); break; - case R16_SINT: ConvertPixelFromFloat<R16_SINT>(pDst, srcPixel); break; - case R16_UINT: ConvertPixelFromFloat<R16_UINT>(pDst, srcPixel); break; - case R16_FLOAT: ConvertPixelFromFloat<R16_FLOAT>(pDst, srcPixel); break; - case I16_UNORM: ConvertPixelFromFloat<I16_UNORM>(pDst, srcPixel); break; - case L16_UNORM: ConvertPixelFromFloat<L16_UNORM>(pDst, srcPixel); break; - case A16_UNORM: ConvertPixelFromFloat<A16_UNORM>(pDst, srcPixel); break; - case L8A8_UNORM: ConvertPixelFromFloat<L8A8_UNORM>(pDst, srcPixel); break; - case I16_FLOAT: ConvertPixelFromFloat<I16_FLOAT>(pDst, srcPixel); break; - case L16_FLOAT: ConvertPixelFromFloat<L16_FLOAT>(pDst, srcPixel); break; - case A16_FLOAT: ConvertPixelFromFloat<A16_FLOAT>(pDst, srcPixel); break; - case L8A8_UNORM_SRGB: ConvertPixelFromFloat<L8A8_UNORM_SRGB>(pDst, srcPixel); break; - case B5G5R5X1_UNORM: ConvertPixelFromFloat<B5G5R5X1_UNORM>(pDst, srcPixel); break; - case B5G5R5X1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5X1_UNORM_SRGB>(pDst, srcPixel); break; - case R8G8_SSCALED: ConvertPixelFromFloat<R8G8_SSCALED>(pDst, srcPixel); break; - case R8G8_USCALED: ConvertPixelFromFloat<R8G8_USCALED>(pDst, srcPixel); break; - case R16_SSCALED: ConvertPixelFromFloat<R16_SSCALED>(pDst, srcPixel); break; - case R16_USCALED: ConvertPixelFromFloat<R16_USCALED>(pDst, srcPixel); break; - case A1B5G5R5_UNORM: ConvertPixelFromFloat<A1B5G5R5_UNORM>(pDst, srcPixel); break; - case A4B4G4R4_UNORM: ConvertPixelFromFloat<A4B4G4R4_UNORM>(pDst, srcPixel); break; - case L8A8_UINT: ConvertPixelFromFloat<L8A8_UINT>(pDst, srcPixel); break; - case L8A8_SINT: 
ConvertPixelFromFloat<L8A8_SINT>(pDst, srcPixel); break; - case R8_UNORM: ConvertPixelFromFloat<R8_UNORM>(pDst, srcPixel); break; - case R8_SNORM: ConvertPixelFromFloat<R8_SNORM>(pDst, srcPixel); break; - case R8_SINT: ConvertPixelFromFloat<R8_SINT>(pDst, srcPixel); break; - case R8_UINT: ConvertPixelFromFloat<R8_UINT>(pDst, srcPixel); break; - case A8_UNORM: ConvertPixelFromFloat<A8_UNORM>(pDst, srcPixel); break; - case I8_UNORM: ConvertPixelFromFloat<I8_UNORM>(pDst, srcPixel); break; - case L8_UNORM: ConvertPixelFromFloat<L8_UNORM>(pDst, srcPixel); break; - case R8_SSCALED: ConvertPixelFromFloat<R8_SSCALED>(pDst, srcPixel); break; - case R8_USCALED: ConvertPixelFromFloat<R8_USCALED>(pDst, srcPixel); break; - case L8_UNORM_SRGB: ConvertPixelFromFloat<L8_UNORM_SRGB>(pDst, srcPixel); break; - case L8_UINT: ConvertPixelFromFloat<L8_UINT>(pDst, srcPixel); break; - case L8_SINT: ConvertPixelFromFloat<L8_SINT>(pDst, srcPixel); break; - case I8_UINT: ConvertPixelFromFloat<I8_UINT>(pDst, srcPixel); break; - case I8_SINT: ConvertPixelFromFloat<I8_SINT>(pDst, srcPixel); break; - case YCRCB_SWAPUVY: ConvertPixelFromFloat<YCRCB_SWAPUVY>(pDst, srcPixel); break; - case BC1_UNORM: ConvertPixelFromFloat<BC1_UNORM>(pDst, srcPixel); break; - case BC2_UNORM: ConvertPixelFromFloat<BC2_UNORM>(pDst, srcPixel); break; - case BC3_UNORM: ConvertPixelFromFloat<BC3_UNORM>(pDst, srcPixel); break; - case BC4_UNORM: ConvertPixelFromFloat<BC4_UNORM>(pDst, srcPixel); break; - case BC5_UNORM: ConvertPixelFromFloat<BC5_UNORM>(pDst, srcPixel); break; - case BC1_UNORM_SRGB: ConvertPixelFromFloat<BC1_UNORM_SRGB>(pDst, srcPixel); break; - case BC2_UNORM_SRGB: ConvertPixelFromFloat<BC2_UNORM_SRGB>(pDst, srcPixel); break; - case BC3_UNORM_SRGB: ConvertPixelFromFloat<BC3_UNORM_SRGB>(pDst, srcPixel); break; - case YCRCB_SWAPUV: ConvertPixelFromFloat<YCRCB_SWAPUV>(pDst, srcPixel); break; - case R8G8B8_UNORM: ConvertPixelFromFloat<R8G8B8_UNORM>(pDst, srcPixel); break; - case R8G8B8_SNORM: 
ConvertPixelFromFloat<R8G8B8_SNORM>(pDst, srcPixel); break; - case R8G8B8_SSCALED: ConvertPixelFromFloat<R8G8B8_SSCALED>(pDst, srcPixel); break; - case R8G8B8_USCALED: ConvertPixelFromFloat<R8G8B8_USCALED>(pDst, srcPixel); break; - case BC4_SNORM: ConvertPixelFromFloat<BC4_SNORM>(pDst, srcPixel); break; - case BC5_SNORM: ConvertPixelFromFloat<BC5_SNORM>(pDst, srcPixel); break; - case R16G16B16_FLOAT: ConvertPixelFromFloat<R16G16B16_FLOAT>(pDst, srcPixel); break; - case R16G16B16_UNORM: ConvertPixelFromFloat<R16G16B16_UNORM>(pDst, srcPixel); break; - case R16G16B16_SNORM: ConvertPixelFromFloat<R16G16B16_SNORM>(pDst, srcPixel); break; - case R16G16B16_SSCALED: ConvertPixelFromFloat<R16G16B16_SSCALED>(pDst, srcPixel); break; - case R16G16B16_USCALED: ConvertPixelFromFloat<R16G16B16_USCALED>(pDst, srcPixel); break; - case BC6H_SF16: ConvertPixelFromFloat<BC6H_SF16>(pDst, srcPixel); break; - case BC7_UNORM: ConvertPixelFromFloat<BC7_UNORM>(pDst, srcPixel); break; - case BC7_UNORM_SRGB: ConvertPixelFromFloat<BC7_UNORM_SRGB>(pDst, srcPixel); break; - case BC6H_UF16: ConvertPixelFromFloat<BC6H_UF16>(pDst, srcPixel); break; - case R8G8B8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8_UNORM_SRGB>(pDst, srcPixel); break; - case R16G16B16_UINT: ConvertPixelFromFloat<R16G16B16_UINT>(pDst, srcPixel); break; - case R16G16B16_SINT: ConvertPixelFromFloat<R16G16B16_SINT>(pDst, srcPixel); break; - case R10G10B10A2_SNORM: ConvertPixelFromFloat<R10G10B10A2_SNORM>(pDst, srcPixel); break; - case R10G10B10A2_USCALED: ConvertPixelFromFloat<R10G10B10A2_USCALED>(pDst, srcPixel); break; - case R10G10B10A2_SSCALED: ConvertPixelFromFloat<R10G10B10A2_SSCALED>(pDst, srcPixel); break; - case R10G10B10A2_SINT: ConvertPixelFromFloat<R10G10B10A2_SINT>(pDst, srcPixel); break; - case B10G10R10A2_SNORM: ConvertPixelFromFloat<B10G10R10A2_SNORM>(pDst, srcPixel); break; - case B10G10R10A2_USCALED: ConvertPixelFromFloat<B10G10R10A2_USCALED>(pDst, srcPixel); break; - case B10G10R10A2_SSCALED: 
ConvertPixelFromFloat<B10G10R10A2_SSCALED>(pDst, srcPixel); break; - case B10G10R10A2_UINT: ConvertPixelFromFloat<B10G10R10A2_UINT>(pDst, srcPixel); break; - case B10G10R10A2_SINT: ConvertPixelFromFloat<B10G10R10A2_SINT>(pDst, srcPixel); break; - case R8G8B8_UINT: ConvertPixelFromFloat<R8G8B8_UINT>(pDst, srcPixel); break; - case R8G8B8_SINT: ConvertPixelFromFloat<R8G8B8_SINT>(pDst, srcPixel); break; - case RAW: ConvertPixelFromFloat<RAW>(pDst, srcPixel); break; - default: - SWR_INVALID("Invalid format: %d", format); - break; - } -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/InitMemory.cpp b/src/gallium/drivers/swr/rasterizer/memory/InitMemory.cpp deleted file mode 100644 index 3a19bbac70e..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/InitMemory.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file InitMemory.cpp -* -* @brief Provide access to tiles table initialization functions -* -******************************************************************************/ - -#include "memory/InitMemory.h" -#include "memory/LoadTile.h" -#include "memory/StoreTile.h" -#include "InitMemory.h" - -void InitSimLoadTilesTable(); -void InitSimStoreTilesTable(); -void InitSimClearTilesTable(); - -void InitTilesTable() -{ - InitSimLoadTilesTable(); - InitSimStoreTilesTable(); - InitSimClearTilesTable(); -} - - -void SwrGetTileIterface(SWR_TILE_INTERFACE &out_funcs) -{ - out_funcs.pfnSwrLoadHotTile = SwrLoadHotTile; - out_funcs.pfnSwrStoreHotTileToSurface = SwrStoreHotTileToSurface; -}
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/memory/InitMemory.h b/src/gallium/drivers/swr/rasterizer/memory/InitMemory.h deleted file mode 100644 index a3ed7b3cbdb..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/InitMemory.h +++ /dev/null @@ -1,83 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2018 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. 
-* -* @file InitMemory.h -* -* @brief Provide access to tiles table initialization functions -* -******************************************************************************/ - -#pragma once - -#include "common/os.h" -#include "memory/SurfaceState.h" - -////////////////////////////////////////////////////////////////////////// -/// @brief Loads a full hottile from a render surface -/// @param hPrivateContext - Handle to private DC -/// @param dstFormat - Format for hot tile. -/// @param renderTargetIndex - Index to src render target -/// @param x, y - Coordinates to raster tile. -/// @param pDstHotTile - Pointer to Hot Tile -SWR_FUNC(void, - SwrLoadHotTile, - HANDLE hWorkerPrivateData, - const SWR_SURFACE_STATE* pSrcSurface, - BucketManager* pBucketManager, - SWR_FORMAT dstFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, - uint32_t y, - uint32_t renderTargetArrayIndex, - uint8_t* pDstHotTile); - -////////////////////////////////////////////////////////////////////////// -/// @brief Deswizzles and stores a full hottile to a render surface -/// @param hPrivateContext - Handle to private DC -/// @param srcFormat - Format for hot tile. -/// @param renderTargetIndex - Index to destination render target -/// @param x, y - Coordinates to raster tile. 
-/// @param pSrcHotTile - Pointer to Hot Tile -SWR_FUNC(void, - SwrStoreHotTileToSurface, - HANDLE hWorkerPrivateData, - SWR_SURFACE_STATE* pDstSurface, - BucketManager* pBucketManager, - SWR_FORMAT srcFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, - uint32_t y, - uint32_t renderTargetArrayIndex, - uint8_t* pSrcHotTile); - -struct SWR_TILE_INTERFACE { - PFNSwrLoadHotTile pfnSwrLoadHotTile; - PFNSwrStoreHotTileToSurface pfnSwrStoreHotTileToSurface; -}; - -extern "C" -{ - SWR_VISIBLE void SWR_API InitTilesTable(); - - typedef void(SWR_API* PFNSwrGetTileInterface)(SWR_TILE_INTERFACE& out_funcs); - SWR_VISIBLE void SWR_API SwrGetTileIterface(SWR_TILE_INTERFACE &out_funcs); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp deleted file mode 100644 index a26d45d130f..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp +++ /dev/null @@ -1,157 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file LoadTile.cpp -* -* @brief Functionality for Load -* -******************************************************************************/ -#include "LoadTile.h" - -// on demand buckets for load tiles -static std::vector<int> sBuckets(NUM_SWR_FORMATS, -1); -static std::mutex sBucketMutex; - -////////////////////////////////////////////////////////////////////////// -/// @brief Loads a full hottile from a render surface -/// @param hPrivateContext - Handle to private DC -/// @param dstFormat - Format for hot tile. -/// @param renderTargetIndex - Index to src render target -/// @param x, y - Coordinates to raster tile. -/// @param pDstHotTile - Pointer to Hot Tile -void SwrLoadHotTile( - HANDLE hWorkerPrivateData, - const SWR_SURFACE_STATE *pSrcSurface, - BucketManager* pBucketMgr, - SWR_FORMAT dstFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, - uint8_t *pDstHotTile) -{ - PFN_LOAD_TILES pfnLoadTiles = NULL; - - // don't need to load null surfaces - if (pSrcSurface->type == SURFACE_NULL) - { - return; - } - - // force 0 if requested renderTargetArrayIndex is OOB - if (renderTargetArrayIndex >= pSrcSurface->depth) - { - renderTargetArrayIndex = 0; - } - - if (renderTargetIndex < SWR_ATTACHMENT_DEPTH) - { - switch (pSrcSurface->tileMode) - { - case SWR_TILE_NONE: - pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_NONE[pSrcSurface->format]; - break; - case SWR_TILE_MODE_YMAJOR: - pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format]; - break; - case SWR_TILE_MODE_XMAJOR: - pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[pSrcSurface->format]; - break; - case SWR_TILE_MODE_WMAJOR: - 
SWR_ASSERT(pSrcSurface->format == R8_UINT); - pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R8_UINT, R8_UINT>::Load; - break; - default: - SWR_INVALID("Unsupported tiling mode"); - break; - } - } - else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH) - { - // Currently depth can map to linear and tile-y. - switch (pSrcSurface->tileMode) - { - case SWR_TILE_NONE: - pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_NONE[pSrcSurface->format]; - break; - case SWR_TILE_MODE_YMAJOR: - pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format]; - break; - default: - SWR_INVALID("Unsupported tiling mode"); - break; - } - } - else - { - SWR_ASSERT(renderTargetIndex == SWR_ATTACHMENT_STENCIL); - SWR_ASSERT(pSrcSurface->format == R8_UINT); - switch (pSrcSurface->tileMode) - { - case SWR_TILE_NONE: - pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_NONE, 8>, R8_UINT, R8_UINT>::Load; - break; - case SWR_TILE_MODE_WMAJOR: - pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R8_UINT, R8_UINT>::Load; - break; - default: - SWR_INVALID("Unsupported tiling mode"); - break; - } - } - - if (pfnLoadTiles == nullptr) - { - SWR_INVALID("Unsupported format for load tile"); - return; - } - - // Load a macro tile. 
-#ifdef KNOB_ENABLE_RDTSC - if (sBuckets[pSrcSurface->format] == -1) - { - // guard sBuckets update since storetiles is called by multiple threads - sBucketMutex.lock(); - if (sBuckets[pSrcSurface->format] == -1) - { - const SWR_FORMAT_INFO& info = GetFormatInfo(pSrcSurface->format); - BUCKET_DESC desc{ info.name, "", false, 0xffffffff }; - sBuckets[pSrcSurface->format] = pBucketMgr->RegisterBucket(desc); - } - sBucketMutex.unlock(); - } -#endif - -#ifdef KNOB_ENABLE_RDTSC - pBucketMgr->StartBucket(sBuckets[pSrcSurface->format]); -#endif - pfnLoadTiles(pSrcSurface, pDstHotTile, x, y, renderTargetArrayIndex); -#ifdef KNOB_ENABLE_RDTSC - pBucketMgr->StopBucket(sBuckets[pSrcSurface->format]); -#endif -} - - -void InitSimLoadTilesTable() -{ - InitLoadTilesTable_Linear(); - InitLoadTilesTable_XMajor(); - InitLoadTilesTable_YMajor(); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.h b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.h deleted file mode 100644 index f74c3fdf4b0..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.h +++ /dev/null @@ -1,354 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file LoadTile.h -* -* @brief Functionality for Load -* -******************************************************************************/ -#include "common/os.h" -#include "common/formats.h" -#include "core/context.h" -#include "core/rdtsc_core.h" -#include "memory/TilingFunctions.h" -#include "memory/tilingtraits.h" -#include "memory/Convert.h" - -typedef void(*PFN_LOAD_TILES)(const SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t); -typedef void(*PFN_LOAD_RASTER_TILES)(const SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t, uint32_t); - -////////////////////////////////////////////////////////////////////////// -/// Load Raster Tile Function Tables. 
-////////////////////////////////////////////////////////////////////////// -extern PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; -extern PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; - -extern PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; -extern PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS]; - -extern PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; - -void InitLoadTilesTable_Linear(); -void InitLoadTilesTable_XMajor(); -void InitLoadTilesTable_YMajor(); - -////////////////////////////////////////////////////////////////////////// -/// LoadRasterTile -////////////////////////////////////////////////////////////////////////// -template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct LoadRasterTile -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve color from hot tile source which is always float. - /// @param pSrc - Pointer to raster tile. - /// @param x, y - Coordinates to raster tile. - /// @param output - output color - INLINE static void SetSwizzledDstColor( - const float srcColor[4], - uint32_t x, uint32_t y, - uint8_t* pDst) - { - typedef SimdTile_16<DstFormat, SrcFormat> SimdT; - - SimdT* pDstSimdTiles = (SimdT*)pDst; - - // Compute which simd tile we're accessing within 8x8 tile. - // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. - uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM); - - SimdT* pSimdTile = &pDstSimdTiles[simdIndex]; - - uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM); - - pSimdTile->SetSwizzledColor(simdOffset, srcColor); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Loads an 8x8 raster tile from the src surface. 
- /// @param pSrcSurface - Src surface state - /// @param pDst - Destination hot tile pointer - /// @param x, y - Coordinates to raster tile. - INLINE static void Load( - const SWR_SURFACE_STATE* pSrcSurface, - uint8_t* pDst, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. - { - uint32_t lodWidth = (pSrcSurface->width == 1) ? 1 : pSrcSurface->width >> pSrcSurface->lod; - uint32_t lodHeight = (pSrcSurface->height == 1) ? 1 : pSrcSurface->height >> pSrcSurface->lod; - - // For each raster tile pixel (rx, ry) - for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) - { - for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) - { - if (((x + rx) < lodWidth) && - ((y + ry) < lodHeight)) - { - uint8_t* pSrc = (uint8_t*)ComputeSurfaceAddress<false, true>(x + rx, y + ry, pSrcSurface->arrayIndex + renderTargetArrayIndex, - pSrcSurface->arrayIndex + renderTargetArrayIndex, sampleNum, - pSrcSurface->lod, pSrcSurface); - - float srcColor[4]; - ConvertPixelToFloat<SrcFormat>(srcColor, pSrc); - - // store pixel to hottile - SetSwizzledDstColor(srcColor, rx, ry, pDst); - } - } - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// LoadMacroTile - Loads a macro tile which consists of raster tiles. -////////////////////////////////////////////////////////////////////////// -template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct LoadMacroTile -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Load a macrotile to the destination surface. - /// @param pSrc - Pointer to macro tile. 
- /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to macro tile - static void Load( - const SWR_SURFACE_STATE* pSrcSurface, - uint8_t *pDstHotTile, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) - { - PFN_LOAD_RASTER_TILES loadRasterTileFn; - loadRasterTileFn = LoadRasterTile<TTraits, SrcFormat, DstFormat>::Load; - - // Load each raster tile from the hot tile to the destination surface. - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - for (uint32_t sampleNum = 0; sampleNum < pSrcSurface->numSamples; sampleNum++) - { - loadRasterTileFn(pSrcSurface, pDstHotTile, (x + col), (y + row), sampleNum, renderTargetArrayIndex); - pDstHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<DstFormat>::bpp / 8); - } - } - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// InitLoadTileColorTable - Helper function for setting up the tables. 
-template<SWR_TILE_MODE TTileMode> -static INLINE void InitLoadTileColorTable(PFN_LOAD_TILES (&table)[NUM_SWR_FORMATS]) -{ - memset(table, 0, sizeof(table)); - - table[R32G32B32A32_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Load; - table[R32G32B32A32_SINT] = LoadMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_SINT, R32G32B32A32_FLOAT>::Load; - table[R32G32B32A32_UINT] = LoadMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_UINT, R32G32B32A32_FLOAT>::Load; - table[R32G32B32X32_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 128>, R32G32B32X32_FLOAT, R32G32B32A32_FLOAT>::Load; - table[R32G32B32A32_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_SSCALED, R32G32B32A32_FLOAT>::Load; - table[R32G32B32A32_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_USCALED, R32G32B32A32_FLOAT>::Load; - table[R32G32B32_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 96>, R32G32B32_FLOAT, R32G32B32A32_FLOAT>::Load; - table[R32G32B32_SINT] = LoadMacroTile<TilingTraits<TTileMode, 96>, R32G32B32_SINT, R32G32B32A32_FLOAT>::Load; - table[R32G32B32_UINT] = LoadMacroTile<TilingTraits<TTileMode, 96>, R32G32B32_UINT, R32G32B32A32_FLOAT>::Load; - table[R32G32B32_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 96>, R32G32B32_SSCALED, R32G32B32A32_FLOAT>::Load; - table[R32G32B32_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 96>, R32G32B32_USCALED, R32G32B32A32_FLOAT>::Load; - table[R16G16B16A16_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_UNORM, R32G32B32A32_FLOAT>::Load; - table[R16G16B16A16_SNORM] = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_SNORM, R32G32B32A32_FLOAT>::Load; - table[R16G16B16A16_SINT] = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_SINT, R32G32B32A32_FLOAT>::Load; - table[R16G16B16A16_UINT] = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_UINT, R32G32B32A32_FLOAT>::Load; - table[R16G16B16A16_FLOAT] = 
LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_FLOAT, R32G32B32A32_FLOAT>::Load; - table[R32G32_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 64>, R32G32_FLOAT, R32G32B32A32_FLOAT>::Load; - table[R32G32_SINT] = LoadMacroTile<TilingTraits<TTileMode, 64>, R32G32_SINT, R32G32B32A32_FLOAT>::Load; - table[R32G32_UINT] = LoadMacroTile<TilingTraits<TTileMode, 64>, R32G32_UINT, R32G32B32A32_FLOAT>::Load; - table[R32_FLOAT_X8X24_TYPELESS] = LoadMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT_X8X24_TYPELESS, R32G32B32A32_FLOAT>::Load; - table[X32_TYPELESS_G8X24_UINT] = LoadMacroTile<TilingTraits<TTileMode, 64>, X32_TYPELESS_G8X24_UINT, R32G32B32A32_FLOAT>::Load; - table[L32A32_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 64>, L32A32_FLOAT, R32G32B32A32_FLOAT>::Load; - table[R16G16B16X16_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16X16_UNORM, R32G32B32A32_FLOAT>::Load; - table[R16G16B16X16_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16X16_FLOAT, R32G32B32A32_FLOAT>::Load; - table[L32X32_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 64>, L32X32_FLOAT, R32G32B32A32_FLOAT>::Load; - table[I32X32_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 64>, I32X32_FLOAT, R32G32B32A32_FLOAT>::Load; - table[R16G16B16A16_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_SSCALED, R32G32B32A32_FLOAT>::Load; - table[R16G16B16A16_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_USCALED, R32G32B32A32_FLOAT>::Load; - table[R32G32_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 64>, R32G32_SSCALED, R32G32B32A32_FLOAT>::Load; - table[R32G32_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 64>, R32G32_USCALED, R32G32B32A32_FLOAT>::Load; - table[B8G8R8A8_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, B8G8R8A8_UNORM, R32G32B32A32_FLOAT>::Load; - table[B8G8R8A8_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 32>, B8G8R8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[R10G10B10A2_UNORM] = 
LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_UNORM, R32G32B32A32_FLOAT>::Load; - table[R10G10B10A2_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[R10G10B10A2_UINT] = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_UINT, R32G32B32A32_FLOAT>::Load; - table[R8G8B8A8_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_UNORM, R32G32B32A32_FLOAT>::Load; - table[R8G8B8A8_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[R8G8B8A8_SNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_SNORM, R32G32B32A32_FLOAT>::Load; - table[R8G8B8A8_SINT] = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_SINT, R32G32B32A32_FLOAT>::Load; - table[R8G8B8A8_UINT] = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_UINT, R32G32B32A32_FLOAT>::Load; - table[R16G16_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_UNORM, R32G32B32A32_FLOAT>::Load; - table[R16G16_SNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_SNORM, R32G32B32A32_FLOAT>::Load; - table[R16G16_SINT] = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_SINT, R32G32B32A32_FLOAT>::Load; - table[R16G16_UINT] = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_UINT, R32G32B32A32_FLOAT>::Load; - table[R16G16_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_FLOAT, R32G32B32A32_FLOAT>::Load; - table[B10G10R10A2_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_UNORM, R32G32B32A32_FLOAT>::Load; - table[B10G10R10A2_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[R11G11B10_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 32>, R11G11B10_FLOAT, R32G32B32A32_FLOAT>::Load; - table[R10G10B10_FLOAT_A2_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10_FLOAT_A2_UNORM, R32G32B32A32_FLOAT>::Load; - table[R32_SINT] = LoadMacroTile<TilingTraits<TTileMode, 32>, R32_SINT, 
R32G32B32A32_FLOAT>::Load; - table[R32_UINT] = LoadMacroTile<TilingTraits<TTileMode, 32>, R32_UINT, R32G32B32A32_FLOAT>::Load; - table[R32_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32G32B32A32_FLOAT>::Load; - table[R24_UNORM_X8_TYPELESS] = LoadMacroTile<TilingTraits<TTileMode, 32>, R24_UNORM_X8_TYPELESS, R32G32B32A32_FLOAT>::Load; - table[X24_TYPELESS_G8_UINT] = LoadMacroTile<TilingTraits<TTileMode, 32>, X24_TYPELESS_G8_UINT, R32G32B32A32_FLOAT>::Load; - table[L32_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, L32_UNORM, R32G32B32A32_FLOAT>::Load; - table[L16A16_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, L16A16_UNORM, R32G32B32A32_FLOAT>::Load; - table[I24X8_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, I24X8_UNORM, R32G32B32A32_FLOAT>::Load; - table[L24X8_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, L24X8_UNORM, R32G32B32A32_FLOAT>::Load; - table[I32_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 32>, I32_FLOAT, R32G32B32A32_FLOAT>::Load; - table[L32_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 32>, L32_FLOAT, R32G32B32A32_FLOAT>::Load; - table[A32_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 32>, A32_FLOAT, R32G32B32A32_FLOAT>::Load; - table[B8G8R8X8_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, B8G8R8X8_UNORM, R32G32B32A32_FLOAT>::Load; - table[B8G8R8X8_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 32>, B8G8R8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[R8G8B8X8_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8X8_UNORM, R32G32B32A32_FLOAT>::Load; - table[R8G8B8X8_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[R9G9B9E5_SHAREDEXP] = LoadMacroTile<TilingTraits<TTileMode, 32>, R9G9B9E5_SHAREDEXP, R32G32B32A32_FLOAT>::Load; - table[B10G10R10X2_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10X2_UNORM, R32G32B32A32_FLOAT>::Load; - table[L16A16_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 32>, L16A16_FLOAT, 
R32G32B32A32_FLOAT>::Load; - table[R10G10B10X2_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10X2_USCALED, R32G32B32A32_FLOAT>::Load; - table[R8G8B8A8_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_SSCALED, R32G32B32A32_FLOAT>::Load; - table[R8G8B8A8_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_USCALED, R32G32B32A32_FLOAT>::Load; - table[R16G16_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_SSCALED, R32G32B32A32_FLOAT>::Load; - table[R16G16_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_USCALED, R32G32B32A32_FLOAT>::Load; - table[R32_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 32>, R32_SSCALED, R32G32B32A32_FLOAT>::Load; - table[R32_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 32>, R32_USCALED, R32G32B32A32_FLOAT>::Load; - table[B5G6R5_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, B5G6R5_UNORM, R32G32B32A32_FLOAT>::Load; - table[B5G6R5_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 16>, B5G6R5_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[B5G5R5A1_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, B5G5R5A1_UNORM, R32G32B32A32_FLOAT>::Load; - table[B5G5R5A1_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 16>, B5G5R5A1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[B4G4R4A4_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, B4G4R4A4_UNORM, R32G32B32A32_FLOAT>::Load; - table[B4G4R4A4_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 16>, B4G4R4A4_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[R8G8_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, R8G8_UNORM, R32G32B32A32_FLOAT>::Load; - table[R8G8_SNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, R8G8_SNORM, R32G32B32A32_FLOAT>::Load; - table[R8G8_SINT] = LoadMacroTile<TilingTraits<TTileMode, 16>, R8G8_SINT, R32G32B32A32_FLOAT>::Load; - table[R8G8_UINT] = LoadMacroTile<TilingTraits<TTileMode, 16>, R8G8_UINT, R32G32B32A32_FLOAT>::Load; - table[R16_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_UNORM, 
R32G32B32A32_FLOAT>::Load; - table[R16_SNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_SNORM, R32G32B32A32_FLOAT>::Load; - table[R16_SINT] = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_SINT, R32G32B32A32_FLOAT>::Load; - table[R16_UINT] = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_UINT, R32G32B32A32_FLOAT>::Load; - table[R16_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_FLOAT, R32G32B32A32_FLOAT>::Load; - table[I16_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, I16_UNORM, R32G32B32A32_FLOAT>::Load; - table[L16_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, L16_UNORM, R32G32B32A32_FLOAT>::Load; - table[A16_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, A16_UNORM, R32G32B32A32_FLOAT>::Load; - table[L8A8_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, L8A8_UNORM, R32G32B32A32_FLOAT>::Load; - table[I16_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 16>, I16_FLOAT, R32G32B32A32_FLOAT>::Load; - table[L16_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 16>, L16_FLOAT, R32G32B32A32_FLOAT>::Load; - table[A16_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 16>, A16_FLOAT, R32G32B32A32_FLOAT>::Load; - table[L8A8_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 16>, L8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[B5G5R5X1_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, B5G5R5X1_UNORM, R32G32B32A32_FLOAT>::Load; - table[B5G5R5X1_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 16>, B5G5R5X1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[R8G8_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 16>, R8G8_SSCALED, R32G32B32A32_FLOAT>::Load; - table[R8G8_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 16>, R8G8_USCALED, R32G32B32A32_FLOAT>::Load; - table[R16_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_SSCALED, R32G32B32A32_FLOAT>::Load; - table[R16_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_USCALED, R32G32B32A32_FLOAT>::Load; - table[A1B5G5R5_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, 
A1B5G5R5_UNORM, R32G32B32A32_FLOAT>::Load; - table[A4B4G4R4_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, A4B4G4R4_UNORM, R32G32B32A32_FLOAT>::Load; - table[L8A8_UINT] = LoadMacroTile<TilingTraits<TTileMode, 16>, L8A8_UINT, R32G32B32A32_FLOAT>::Load; - table[L8A8_SINT] = LoadMacroTile<TilingTraits<TTileMode, 16>, L8A8_SINT, R32G32B32A32_FLOAT>::Load; - table[R8_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 8>, R8_UNORM, R32G32B32A32_FLOAT>::Load; - table[R8_SNORM] = LoadMacroTile<TilingTraits<TTileMode, 8>, R8_SNORM, R32G32B32A32_FLOAT>::Load; - table[R8_SINT] = LoadMacroTile<TilingTraits<TTileMode, 8>, R8_SINT, R32G32B32A32_FLOAT>::Load; - table[R8_UINT] = LoadMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R32G32B32A32_FLOAT>::Load; - table[A8_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 8>, A8_UNORM, R32G32B32A32_FLOAT>::Load; - table[I8_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 8>, I8_UNORM, R32G32B32A32_FLOAT>::Load; - table[L8_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 8>, L8_UNORM, R32G32B32A32_FLOAT>::Load; - table[R8_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 8>, R8_SSCALED, R32G32B32A32_FLOAT>::Load; - table[R8_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 8>, R8_USCALED, R32G32B32A32_FLOAT>::Load; - table[L8_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 8>, L8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[L8_UINT] = LoadMacroTile<TilingTraits<TTileMode, 8>, L8_UINT, R32G32B32A32_FLOAT>::Load; - table[L8_SINT] = LoadMacroTile<TilingTraits<TTileMode, 8>, L8_SINT, R32G32B32A32_FLOAT>::Load; - table[I8_UINT] = LoadMacroTile<TilingTraits<TTileMode, 8>, I8_UINT, R32G32B32A32_FLOAT>::Load; - table[I8_SINT] = LoadMacroTile<TilingTraits<TTileMode, 8>, I8_SINT, R32G32B32A32_FLOAT>::Load; - table[YCRCB_SWAPUVY] = LoadMacroTile<TilingTraits<TTileMode, 32>, YCRCB_SWAPUVY, R32G32B32A32_FLOAT>::Load; - table[BC1_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 64>, BC1_UNORM, R32G32B32A32_FLOAT>::Load; - table[BC2_UNORM] = 
LoadMacroTile<TilingTraits<TTileMode, 128>, BC2_UNORM, R32G32B32A32_FLOAT>::Load; - table[BC3_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 128>, BC3_UNORM, R32G32B32A32_FLOAT>::Load; - table[BC4_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 64>, BC4_UNORM, R32G32B32A32_FLOAT>::Load; - table[BC5_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 128>, BC5_UNORM, R32G32B32A32_FLOAT>::Load; - table[BC1_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 64>, BC1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[BC2_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 128>, BC2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[BC3_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 128>, BC3_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[YCRCB_SWAPUV] = LoadMacroTile<TilingTraits<TTileMode, 32>, YCRCB_SWAPUV, R32G32B32A32_FLOAT>::Load; - table[R8G8B8_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 24>, R8G8B8_UNORM, R32G32B32A32_FLOAT>::Load; - table[R8G8B8_SNORM] = LoadMacroTile<TilingTraits<TTileMode, 24>, R8G8B8_SNORM, R32G32B32A32_FLOAT>::Load; - table[R8G8B8_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 24>, R8G8B8_SSCALED, R32G32B32A32_FLOAT>::Load; - table[R8G8B8_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 24>, R8G8B8_USCALED, R32G32B32A32_FLOAT>::Load; - table[BC4_SNORM] = LoadMacroTile<TilingTraits<TTileMode, 64>, BC4_SNORM, R32G32B32A32_FLOAT>::Load; - table[BC5_SNORM] = LoadMacroTile<TilingTraits<TTileMode, 128>, BC5_SNORM, R32G32B32A32_FLOAT>::Load; - table[R16G16B16_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_FLOAT, R32G32B32A32_FLOAT>::Load; - table[R16G16B16_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_UNORM, R32G32B32A32_FLOAT>::Load; - table[R16G16B16_SNORM] = LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_SNORM, R32G32B32A32_FLOAT>::Load; - table[R16G16B16_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_SSCALED, R32G32B32A32_FLOAT>::Load; - table[R16G16B16_USCALED] = 
LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_USCALED, R32G32B32A32_FLOAT>::Load; - table[BC6H_SF16] = LoadMacroTile<TilingTraits<TTileMode, 128>, BC6H_SF16, R32G32B32A32_FLOAT>::Load; - table[BC7_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 128>, BC7_UNORM, R32G32B32A32_FLOAT>::Load; - table[BC7_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 128>, BC7_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[BC6H_UF16] = LoadMacroTile<TilingTraits<TTileMode, 128>, BC6H_UF16, R32G32B32A32_FLOAT>::Load; - table[R8G8B8_UNORM_SRGB] = LoadMacroTile<TilingTraits<TTileMode, 24>, R8G8B8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; - table[R16G16B16_UINT] = LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_UINT, R32G32B32A32_FLOAT>::Load; - table[R16G16B16_SINT] = LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_SINT, R32G32B32A32_FLOAT>::Load; - table[R10G10B10A2_SNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_SNORM, R32G32B32A32_FLOAT>::Load; - table[R10G10B10A2_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_USCALED, R32G32B32A32_FLOAT>::Load; - table[R10G10B10A2_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_SSCALED, R32G32B32A32_FLOAT>::Load; - table[R10G10B10A2_SINT] = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_SINT, R32G32B32A32_FLOAT>::Load; - table[B10G10R10A2_SNORM] = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_SNORM, R32G32B32A32_FLOAT>::Load; - table[B10G10R10A2_USCALED] = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_USCALED, R32G32B32A32_FLOAT>::Load; - table[B10G10R10A2_SSCALED] = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_SSCALED, R32G32B32A32_FLOAT>::Load; - table[B10G10R10A2_UINT] = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_UINT, R32G32B32A32_FLOAT>::Load; - table[B10G10R10A2_SINT] = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_SINT, R32G32B32A32_FLOAT>::Load; - table[R8G8B8_UINT] = LoadMacroTile<TilingTraits<TTileMode, 24>, 
R8G8B8_UINT, R32G32B32A32_FLOAT>::Load; - table[R8G8B8_SINT] = LoadMacroTile<TilingTraits<TTileMode, 24>, R8G8B8_SINT, R32G32B32A32_FLOAT>::Load; - table[RAW] = LoadMacroTile<TilingTraits<TTileMode, 8>, RAW, R32G32B32A32_FLOAT>::Load; -} - -////////////////////////////////////////////////////////////////////////// -/// InitLoadTileColorTable - Helper function for setting up the tables. -template<SWR_TILE_MODE TTileMode> -static INLINE void InitLoadTileDepthTable(PFN_LOAD_TILES(&table)[NUM_SWR_FORMATS]) -{ - memset(table, 0, sizeof(table)); - - table[R32_FLOAT] = LoadMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Load; - table[R32_FLOAT_X8X24_TYPELESS] = LoadMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT_X8X24_TYPELESS, R32_FLOAT>::Load; - table[R24_UNORM_X8_TYPELESS] = LoadMacroTile<TilingTraits<TTileMode, 32>, R24_UNORM_X8_TYPELESS, R32_FLOAT>::Load; - table[R16_UNORM] = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_UNORM, R32_FLOAT>::Load; -} - - -////////////////////////////////////////////////////////////////////////// -/// @brief Loads a full hottile from a render surface -/// @param hPrivateContext - Handle to private DC -/// @param dstFormat - Format for hot tile. -/// @param renderTargetIndex - Index to src render target -/// @param x, y - Coordinates to raster tile. 
-/// @param pDstHotTile - Pointer to Hot Tile -void SwrLoadHotTile( - HANDLE hWorkerPrivateData, - const SWR_SURFACE_STATE *pSrcSurface, - BucketManager* pBucketMgr, - SWR_FORMAT dstFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, - uint8_t *pDstHotTile); diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_Linear.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile_Linear.cpp deleted file mode 100644 index 5f53b5b6b56..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_Linear.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. 
-* -* @file LoadTile.cpp -* -* @brief Functionality for Load -* -******************************************************************************/ -#include "LoadTile.h" - -PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; -PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; - -////////////////////////////////////////////////////////////////////////// -/// @brief Sets up tables for LoadTile -void InitLoadTilesTable_Linear() -{ - InitLoadTileColorTable<SWR_TILE_NONE>(sLoadTilesColorTable_SWR_TILE_NONE); - InitLoadTileDepthTable<SWR_TILE_NONE>(sLoadTilesDepthTable_SWR_TILE_NONE); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileX.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileX.cpp deleted file mode 100644 index 8e76655ff11..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileX.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file LoadTile.cpp -* -* @brief Functionality for Load -* -******************************************************************************/ -#include "LoadTile.h" - -PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS]; - -////////////////////////////////////////////////////////////////////////// -/// @brief Sets up tables for LoadTile -void InitLoadTilesTable_XMajor() -{ - InitLoadTileColorTable<SWR_TILE_MODE_XMAJOR>(sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileY.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileY.cpp deleted file mode 100644 index c136392eb78..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileY.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file LoadTile.cpp -* -* @brief Functionality for Load -* -******************************************************************************/ -#include "LoadTile.h" - -PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; -PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; - -////////////////////////////////////////////////////////////////////////// -/// @brief Sets up tables for LoadTile -void InitLoadTilesTable_YMajor() -{ - InitLoadTileColorTable<SWR_TILE_MODE_YMAJOR>(sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR); - InitLoadTileDepthTable<SWR_TILE_MODE_YMAJOR>(sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp deleted file mode 100644 index 9fee13a045a..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file StoreTile.cpp -* -* @brief Functionality for Store. -* -******************************************************************************/ -#include "StoreTile.h" -////////////////////////////////////////////////////////////////////////// -/// Store Raster Tile Function Tables. 
-////////////////////////////////////////////////////////////////////////// -PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; -PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; -PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; - -// on demand buckets for store tiles -static std::mutex sBucketMutex; -static std::vector<int32_t> sBuckets(NUM_SWR_FORMATS, -1); - -////////////////////////////////////////////////////////////////////////// -/// @brief Deswizzles and stores a full hottile to a render surface -/// @param hPrivateContext - Handle to private DC -/// @param srcFormat - Format for hot tile. -/// @param renderTargetIndex - Index to destination render target -/// @param x, y - Coordinates to raster tile. -/// @param pSrcHotTile - Pointer to Hot Tile -void SwrStoreHotTileToSurface( - HANDLE hWorkerPrivateData, - SWR_SURFACE_STATE *pDstSurface, - BucketManager* pBucketMgr, - SWR_FORMAT srcFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, - uint8_t *pSrcHotTile) -{ - if (pDstSurface->type == SURFACE_NULL) - { - return; - } - - // force 0 if requested renderTargetArrayIndex is OOB - if (renderTargetArrayIndex >= pDstSurface->depth) - { - renderTargetArrayIndex = 0; - } - - PFN_STORE_TILES pfnStoreTiles = nullptr; - - if (renderTargetIndex <= SWR_ATTACHMENT_COLOR7) - { - pfnStoreTiles = sStoreTilesTableColor[pDstSurface->tileMode][pDstSurface->format]; - } - else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH) - { - pfnStoreTiles = sStoreTilesTableDepth[pDstSurface->tileMode][pDstSurface->format]; - } - else - { - pfnStoreTiles = sStoreTilesTableStencil[pDstSurface->tileMode][pDstSurface->format]; - } - - if(nullptr == pfnStoreTiles) - { - SWR_INVALID("Invalid pixel format / tile mode for store tiles"); - return; - } - - // Store a macro tile -#ifdef KNOB_ENABLE_RDTSC - if (sBuckets[pDstSurface->format] 
== -1) - { - // guard sBuckets update since storetiles is called by multiple threads - sBucketMutex.lock(); - if (sBuckets[pDstSurface->format] == -1) - { - const SWR_FORMAT_INFO& info = GetFormatInfo(pDstSurface->format); - BUCKET_DESC desc{info.name, "", false, 0xffffffff}; - sBuckets[pDstSurface->format] = pBucketMgr->RegisterBucket(desc); - } - sBucketMutex.unlock(); - } -#endif - -#ifdef KNOB_ENABLE_RDTSC - pBucketMgr->StartBucket(sBuckets[pDstSurface->format]); -#endif - pfnStoreTiles(pSrcHotTile, pDstSurface, x, y, renderTargetArrayIndex); -#ifdef KNOB_ENABLE_RDTSC - pBucketMgr->StopBucket(sBuckets[pDstSurface->format]); -#endif - -} - - -////////////////////////////////////////////////////////////////////////// -/// @brief Sets up tables for StoreTile -void InitSimStoreTilesTable() -{ - memset(sStoreTilesTableColor, 0, sizeof(sStoreTilesTableColor)); - memset(sStoreTilesTableDepth, 0, sizeof(sStoreTilesTableDepth)); - - InitStoreTilesTable_Linear_1(); - InitStoreTilesTable_Linear_2(); - InitStoreTilesTable_TileX_1(); - InitStoreTilesTable_TileX_2(); - InitStoreTilesTable_TileY_1(); - InitStoreTilesTable_TileY_2(); - InitStoreTilesTable_TileW(); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h deleted file mode 100644 index 1b7698cc5b8..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h +++ /dev/null @@ -1,2051 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file StoreTile.h -* -* @brief Functionality for Store. -* -******************************************************************************/ -#pragma once - -#include "common/os.h" -#include "common/formats.h" -#include "core/context.h" -#include "core/rdtsc_core.h" -#include "core/format_conversion.h" - -#include "memory/TilingFunctions.h" -#include "memory/Convert.h" -#include "memory/SurfaceState.h" -#include "core/multisample.h" - -#include <array> -#include <sstream> - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -// Function pointer to different storing functions for color, depth, and stencil based on incoming formats. -typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t); - -////////////////////////////////////////////////////////////////////////// -/// Store Raster Tile Function Tables. 
-////////////////////////////////////////////////////////////////////////// -extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; -extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; -extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; - -void InitStoreTilesTable_Linear_1(); -void InitStoreTilesTable_Linear_2(); -void InitStoreTilesTable_TileX_1(); -void InitStoreTilesTable_TileX_2(); -void InitStoreTilesTable_TileY_1(); -void InitStoreTilesTable_TileY_2(); -void InitStoreTilesTable_TileW(); -void InitStoreTilesTable(); - -////////////////////////////////////////////////////////////////////////// -/// StorePixels -/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -/// @param ppDsts - Array of destination pointers. Each pointer is -/// to a single row of at most 16B. -/// @tparam NumDests - Number of destination pointers. Each pair of -/// pointers is for a 16-byte column of two rows. -////////////////////////////////////////////////////////////////////////// -template <size_t PixelSize, size_t NumDests> -struct StorePixels -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// StorePixels (32-bit pixel specialization) -/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -/// @param ppDsts - Array of destination pointers. Each pointer is -/// to a single row of at most 16B. -/// @tparam NumDests - Number of destination pointers. Each pair of -/// pointers is for a 16-byte column of two rows. -////////////////////////////////////////////////////////////////////////// -template <> -struct StorePixels<8, 2> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) - { - // Each 4-pixel row is 4 bytes. 
- const uint16_t* pPixSrc = (const uint16_t*)pSrc; - - // Unswizzle from SWR-Z order - uint16_t* pRow = (uint16_t*)ppDsts[0]; - pRow[0] = pPixSrc[0]; - pRow[1] = pPixSrc[2]; - - pRow = (uint16_t*)ppDsts[1]; - pRow[0] = pPixSrc[1]; - pRow[1] = pPixSrc[3]; - } -}; - -template <> -struct StorePixels<8, 4> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) - { - // 8 x 2 bytes = 16 bytes, 16 pixels - const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc); - - uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts); - - // Unswizzle from SWR-Z order - ppDsts16[0][0] = pSrc16[0]; // 0 1 - ppDsts16[0][1] = pSrc16[2]; // 4 5 - - ppDsts16[1][0] = pSrc16[1]; // 2 3 - ppDsts16[1][1] = pSrc16[3]; // 6 7 - - ppDsts16[2][0] = pSrc16[4]; // 8 9 - ppDsts16[2][1] = pSrc16[6]; // C D - - ppDsts16[3][0] = pSrc16[5]; // A B - ppDsts16[3][1] = pSrc16[7]; // E F - } -}; - -////////////////////////////////////////////////////////////////////////// -/// StorePixels (32-bit pixel specialization) -/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -/// @param ppDsts - Array of destination pointers. Each pointer is -/// to a single row of at most 16B. -/// @tparam NumDests - Number of destination pointers. Each pair of -/// pointers is for a 16-byte column of two rows. -////////////////////////////////////////////////////////////////////////// -template <> -struct StorePixels<16, 2> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) - { - // Each 4-pixel row is 8 bytes. 
- const uint32_t* pPixSrc = (const uint32_t*)pSrc; - - // Unswizzle from SWR-Z order - uint32_t* pRow = (uint32_t*)ppDsts[0]; - pRow[0] = pPixSrc[0]; - pRow[1] = pPixSrc[2]; - - pRow = (uint32_t*)ppDsts[1]; - pRow[0] = pPixSrc[1]; - pRow[1] = pPixSrc[3]; - } -}; - -template <> -struct StorePixels<16, 4> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) - { - // 8 x 4 bytes = 32 bytes, 16 pixels - const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc); - - uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts); - - // Unswizzle from SWR-Z order - ppDsts32[0][0] = pSrc32[0]; // 0 1 - ppDsts32[0][1] = pSrc32[2]; // 4 5 - - ppDsts32[1][0] = pSrc32[1]; // 2 3 - ppDsts32[1][1] = pSrc32[3]; // 6 7 - - ppDsts32[2][0] = pSrc32[4]; // 8 9 - ppDsts32[2][1] = pSrc32[6]; // C D - - ppDsts32[3][0] = pSrc32[5]; // A B - ppDsts32[3][1] = pSrc32[7]; // E F - } -}; - -////////////////////////////////////////////////////////////////////////// -/// StorePixels (32-bit pixel specialization) -/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -/// @param ppDsts - Array of destination pointers. Each pointer is -/// to a single row of at most 16B. -/// @tparam NumDests - Number of destination pointers. Each pair of -/// pointers is for a 16-byte column of two rows. 
-////////////////////////////////////////////////////////////////////////// -template <> -struct StorePixels<32, 2> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) - { - // Each 4-pixel row is 16-bytes - simd4scalari *pZRow01 = (simd4scalari*)pSrc; - simd4scalari vQuad00 = SIMD128::load_si(pZRow01); - simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1); - - simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01); - simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01); - - SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00); - SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10); - } -}; - -template <> -struct StorePixels<32, 4> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) - { - // 4 x 16 bytes = 64 bytes, 16 pixels - const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc); - - simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts); - - // Unswizzle from SWR-Z order - simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]); // 0 1 2 3 - simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]); // 4 5 6 7 - simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]); // 8 9 A B - simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]); // C D E F - - SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1)); // 0 1 4 5 - SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1)); // 2 3 6 7 - SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3)); // 8 9 C D - SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3)); // A B E F - } -}; - -////////////////////////////////////////////////////////////////////////// -/// StorePixels (32-bit pixel specialization) -/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -/// @param ppDsts - Array of destination pointers. Each pointer is -/// to a single row of at most 16B. -/// @tparam NumDests - Number of destination pointers. 
Each pair of -/// pointers is for a 16-byte column of two rows. -////////////////////////////////////////////////////////////////////////// -template <> -struct StorePixels<64, 4> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) - { - // Each 4-pixel row is 32 bytes. - const simd4scalari* pPixSrc = (const simd4scalari*)pSrc; - - // order of pointers match SWR-Z layout - simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0]; - *pvDsts[0] = pPixSrc[0]; - *pvDsts[1] = pPixSrc[1]; - *pvDsts[2] = pPixSrc[2]; - *pvDsts[3] = pPixSrc[3]; - } -}; - -template <> -struct StorePixels<64, 8> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) - { - // 8 x 16 bytes = 128 bytes, 16 pixels - const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc); - - simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts); - - // order of pointers match SWR-Z layout - *ppDsts128[0] = pSrc128[0]; // 0 1 - *ppDsts128[1] = pSrc128[1]; // 2 3 - *ppDsts128[2] = pSrc128[2]; // 4 5 - *ppDsts128[3] = pSrc128[3]; // 6 7 - *ppDsts128[4] = pSrc128[4]; // 8 9 - *ppDsts128[5] = pSrc128[5]; // A B - *ppDsts128[6] = pSrc128[6]; // C D - *ppDsts128[7] = pSrc128[7]; // E F - } -}; - -////////////////////////////////////////////////////////////////////////// -/// StorePixels (32-bit pixel specialization) -/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -/// @param ppDsts - Array of destination pointers. Each pointer is -/// to a single row of at most 16B. -/// @tparam NumDests - Number of destination pointers. Each pair of -/// pointers is for a 16-byte column of two rows. -////////////////////////////////////////////////////////////////////////// -template <> -struct StorePixels<128, 8> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) - { - // Each 4-pixel row is 64 bytes. 
- const simd4scalari* pPixSrc = (const simd4scalari*)pSrc; - - // Unswizzle from SWR-Z order - simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0]; - *pvDsts[0] = pPixSrc[0]; - *pvDsts[1] = pPixSrc[2]; - *pvDsts[2] = pPixSrc[1]; - *pvDsts[3] = pPixSrc[3]; - *pvDsts[4] = pPixSrc[4]; - *pvDsts[5] = pPixSrc[6]; - *pvDsts[6] = pPixSrc[5]; - *pvDsts[7] = pPixSrc[7]; - } -}; - -template <> -struct StorePixels<128, 16> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16]) - { - // 16 x 16 bytes = 256 bytes, 16 pixels - const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc); - - simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts); - - for (uint32_t i = 0; i < 16; i += 4) - { - *ppDsts128[i + 0] = pSrc128[i + 0]; - *ppDsts128[i + 1] = pSrc128[i + 2]; - *ppDsts128[i + 2] = pSrc128[i + 1]; - *ppDsts128[i + 3] = pSrc128[i + 3]; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct ConvertPixelsSOAtoAOS -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Converts a SIMD from the Hot Tile to the destination format - /// and converts from SOA to AOS. - /// @param pSrc - Pointer to raster tile. - /// @param pDst - Pointer to destination surface or deswizzling buffer. 
- template <size_t NumDests> - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel - - OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES] = {0}; - OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES] = {0}; - - // Convert from SrcFormat --> DstFormat - simd16vector src; - LoadSOA<SrcFormat>(pSrc, src); - StoreSOA<DstFormat>(src, soaTile); - - // Convert from SOA --> AOS - FormatTraits<DstFormat>::TransposeT::Transpose_simd16(soaTile, aosTile); - - // Store data into destination - StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) -/// Specialization for no format conversion -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT Format> -struct ConvertPixelsSOAtoAOS<Format, Format> -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Converts a SIMD from the Hot Tile to the destination format - /// and converts from SOA to AOS. - /// @param pSrc - Pointer to raster tile. - /// @param pDst - Pointer to destination surface or deswizzling buffer. 
- template <size_t NumDests> - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel - - OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; - - // Convert from SOA --> AOS - FormatTraits<Format>::TransposeT::Transpose_simd16(pSrc, aosTile); - - // Store data into destination - StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM -////////////////////////////////////////////////////////////////////////// -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM > -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Converts a SIMD from the Hot Tile to the destination format - /// and converts from SOA to AOS. - /// @param pSrc - Pointer to raster tile. - /// @param pDst - Pointer to destination surface or deswizzling buffer. 
- template <size_t NumDests> - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT; - static const SWR_FORMAT DstFormat = B5G6R5_UNORM; - - static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel - - OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; - - // Load hot-tile - simd16vector src, dst; - LoadSOA<SrcFormat>(pSrc, src); - - // deswizzle - dst.x = src[FormatTraits<DstFormat>::swizzle(0)]; - dst.y = src[FormatTraits<DstFormat>::swizzle(1)]; - dst.z = src[FormatTraits<DstFormat>::swizzle(2)]; - - // clamp - dst.x = Clamp<DstFormat>(dst.x, 0); - dst.y = Clamp<DstFormat>(dst.y, 1); - dst.z = Clamp<DstFormat>(dst.z, 2); - - // normalize - dst.x = Normalize<DstFormat>(dst.x, 0); - dst.y = Normalize<DstFormat>(dst.y, 1); - dst.z = Normalize<DstFormat>(dst.z, 2); - - // pack - simd16scalari packed = _simd16_castps_si(dst.x); - - SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5); - SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6); - - packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5)); - packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6)); - - // pack low 16 bits of each 32 bit lane to low 128 bits of dst - uint32_t *pPacked = (uint32_t*)&packed; - uint16_t *pAosTile = (uint16_t*)&aosTile[0]; - for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t) - { - *pAosTile++ = *pPacked++; - } - - // Store data into destination - StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) -////////////////////////////////////////////////////////////////////////// -template<> -struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS> -{ - static const SWR_FORMAT SrcFormat = R32_FLOAT; - static const SWR_FORMAT DstFormat = 
R24_UNORM_X8_TYPELESS; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Converts a SIMD from the Hot Tile to the destination format - /// and converts from SOA to AOS. - /// @param pSrc - Pointer to raster tile. - /// @param pDst - Pointer to destination surface or deswizzling buffer. - template <size_t NumDests> - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); - - // clamp - const simd16scalar zero = _simd16_setzero_ps(); - const simd16scalar ones = _simd16_set1_ps(1.0f); - - comp = _simd16_max_ps(comp, zero); - comp = _simd16_min_ps(comp, ones); - - // normalize - comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); - - simd16scalari temp = _simd16_cvtps_epi32(comp); - - // swizzle - temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); - - // merge/store data into destination but don't overwrite the X8 bits - simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0])); - simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2])); - - simd16scalari dest = _simd16_setzero_si(); - - dest = _simd16_insert_si(dest, destlo, 0); - dest = _simd16_insert_si(dest, desthi, 1); - - simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF); - - dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp)); - - _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0)); - _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1)); - } -}; - -template<SWR_FORMAT DstFormat> -INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, 
uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3) -{ - // swizzle rgba -> bgra while we load - simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr - simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg - simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb - simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa - - // clamp - const simd16scalar zero = _simd16_setzero_ps(); - const simd16scalar ones = _simd16_set1_ps(1.0f); - - comp0 = _simd16_max_ps(comp0, zero); - comp0 = _simd16_min_ps(comp0, ones); - - comp1 = _simd16_max_ps(comp1, zero); - comp1 = _simd16_min_ps(comp1, ones); - - comp2 = _simd16_max_ps(comp2, zero); - comp2 = _simd16_min_ps(comp2, ones); - - comp3 = _simd16_max_ps(comp3, zero); - comp3 = _simd16_min_ps(comp3, ones); - - // gamma-correct only rgb - if (FormatTraits<DstFormat>::isSRGB) - { - comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0); - comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1); - comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2); - } - - // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format - comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); - comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); - comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); - comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3))); - - // moving to 16 wide integer vector types - simd16scalari src0 = 
_simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr - simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg - simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb - simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa - - // SOA to AOS conversion - src1 = _simd16_slli_epi32(src1, 8); - src2 = _simd16_slli_epi32(src2, 16); - src3 = _simd16_slli_epi32(src3, 24); - - simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F - - // de-swizzle conversion -#if 1 - simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B - simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F - - final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F - -#else - final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); - -#endif - // store 8x2 memory order: - // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D } - // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F } - _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0)); - _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1)); -} - -template<SWR_FORMAT DstFormat> -INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) -{ - static const uint32_t offset = sizeof(simdscalar); - - // swizzle rgba -> bgra while we load - simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr - simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg - simdscalar 
vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb - simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa - - // clamp - vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); - vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); - - vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); - vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); - - vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); - vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); - - vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps()); - vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f)); - - if (FormatTraits<DstFormat>::isSRGB) - { - // Gamma-correct only rgb - vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0); - vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1); - vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2); - } - - // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 
255 dest format - vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); - vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); - vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); - vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3))); - - // moving to 8 wide integer vector types - simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr - simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg - simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb - simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa - -#if KNOB_ARCH <= KNOB_ARCH_AVX - - // splitting into two sets of 4 wide integer vector types - // because AVX doesn't have instructions to support this operation at 8 wide - simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r - simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g - simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b - simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a - - simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r - simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g - simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b - simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a - - srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 - srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 - srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 - srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 - srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000 - srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000 - - srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr - srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00 - - srcHi0 = SIMD128::or_si(srcHi0, 
srcHi1); // 00gr00gr00gr00gr - srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00 - - srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr - srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr - - // unpack into rows that get the tiling order correct - simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr - simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0); - - simdscalari final = _mm256_castsi128_si256(vRow00); - final = _mm256_insertf128_si256(final, vRow10, 1); - -#else - - // logic is as above, only wider - src1 = _mm256_slli_si256(src1, 1); - src2 = _mm256_slli_si256(src2, 2); - src3 = _mm256_slli_si256(src3, 3); - - src0 = _mm256_or_si256(src0, src1); - src2 = _mm256_or_si256(src2, src3); - - simdscalari final = _mm256_or_si256(src0, src2); - - // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 - final = _mm256_permute4x64_epi64(final, 0xD8); -#endif - - _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final); -} - -template<SWR_FORMAT DstFormat> -INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3) -{ - // swizzle rgba -> bgra while we load - simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr - simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg - simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb - - // clamp - const simd16scalar zero = _simd16_setzero_ps(); - const simd16scalar ones = _simd16_set1_ps(1.0f); - - comp0 = _simd16_max_ps(comp0, zero); - comp0 = _simd16_min_ps(comp0, ones); - - comp1 = _simd16_max_ps(comp1, zero); - comp1 = 
_simd16_min_ps(comp1, ones); - - comp2 = _simd16_max_ps(comp2, zero); - comp2 = _simd16_min_ps(comp2, ones); - - // gamma-correct only rgb - if (FormatTraits<DstFormat>::isSRGB) - { - comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0); - comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1); - comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2); - } - - // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format - comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); - comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); - comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); - - // moving to 16 wide integer vector types - simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr - simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg - simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb - - // SOA to AOS conversion - src1 = _simd16_slli_epi32(src1, 8); - src2 = _simd16_slli_epi32(src2, 16); - - simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F - - // de-swizzle conversion -#if 1 - simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B - simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F - - final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F - -#else - final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); - -#endif - // store 8x2 memory order: - // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D } - // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F } - _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), 
reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0)); - _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1)); -} - -template<SWR_FORMAT DstFormat> -INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) -{ - static const uint32_t offset = sizeof(simdscalar); - - // swizzle rgba -> bgra while we load - simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr - simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg - simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb - // clamp - vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); - vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); - - vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); - vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); - - vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); - vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); - - if (FormatTraits<DstFormat>::isSRGB) - { - // Gamma-correct only rgb - vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0); - vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1); - vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2); - } - - // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 
255 dest format - vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); - vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); - vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); - - // moving to 8 wide integer vector types - simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr - simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg - simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb - -#if KNOB_ARCH <= KNOB_ARCH_AVX - - // splitting into two sets of 4 wide integer vector types - // because AVX doesn't have instructions to support this operation at 8 wide - simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r - simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g - simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b - - simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r - simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g - simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b - - srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 - srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 - srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 - srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 - - srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr - - srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr - - srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr - srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr - - // unpack into rows that get the tiling order correct - simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr - simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0); - - simdscalari final = _mm256_castsi128_si256(vRow00); - final = _mm256_insertf128_si256(final, vRow10, 1); 
- -#else - - // logic is as above, only wider - src1 = _mm256_slli_si256(src1, 1); - src2 = _mm256_slli_si256(src2, 2); - - src0 = _mm256_or_si256(src0, src1); - - simdscalari final = _mm256_or_si256(src0, src2); - - // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 - final = _mm256_permute4x64_epi64(final, 0xD8); - -#endif - - _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final); -} - -template<> -struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM> -{ - template <size_t NumDests> - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM> -{ - template <size_t NumDests> - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > -{ - template <size_t NumDests> - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > -{ - template <size_t NumDests> - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM > -{ - template <size_t NumDests> - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > -{ - template <size_t 
NumDests> - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB > -{ - template <size_t NumDests> - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > -{ - template <size_t NumDests> - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// StoreRasterTile -////////////////////////////////////////////////////////////////////////// -template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct StoreRasterTile -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve color from hot tile source which is always float. - /// @param pSrc - Pointer to raster tile. - /// @param x, y - Coordinates to raster tile. - /// @param output - output color - INLINE static void GetSwizzledSrcColor( - uint8_t* pSrc, - uint32_t x, uint32_t y, - float outputColor[4]) - { - typedef SimdTile_16<SrcFormat, DstFormat> SimdT; - - SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc); - - // Compute which simd tile we're accessing within 8x8 tile. - // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. 
- uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM); - - SimdT *pSimdTile = &pSrcSimdTiles[simdIndex]; - - uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM); - - pSimdTile->GetSwizzledColor(simdOffset, outputColor); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. - { - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - // For each raster tile pixel (rx, ry) - for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) - { - for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) - { - // Perform bounds checking. - if (((x + rx) < lodWidth) && - ((y + ry) < lodHeight)) - { - float srcColor[4]; - GetSwizzledSrcColor(pSrc, rx, ry, srcColor); - - uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry), - pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, - sampleNum, pDstSurface->lod, pDstSurface); - { - ConvertPixelFromFloat<DstFormat>(pDst, srcColor); - } - } - } - } - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Resolves an 8x8 raster tile to the resolve destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. 
- /// @param sampleOffset - Offset between adjacent multisamples - INLINE static void Resolve( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleOffset, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. - { - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - float oneOverNumSamples = 1.0f / pDstSurface->numSamples; - - // For each raster tile pixel (rx, ry) - for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) - { - for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) - { - // Perform bounds checking. - if (((x + rx) < lodWidth) && - ((y + ry) < lodHeight)) - { - // Sum across samples - float resolveColor[4] = {0}; - for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) - { - float sampleColor[4] = {0}; - uint8_t *pSampleSrc = pSrc + sampleOffset * sampleNum; - GetSwizzledSrcColor(pSampleSrc, rx, ry, sampleColor); - resolveColor[0] += sampleColor[0]; - resolveColor[1] += sampleColor[1]; - resolveColor[2] += sampleColor[2]; - resolveColor[3] += sampleColor[3]; - } - - // Divide by numSamples to average - resolveColor[0] *= oneOverNumSamples; - resolveColor[1] *= oneOverNumSamples; - resolveColor[2] *= oneOverNumSamples; - resolveColor[3] *= oneOverNumSamples; - - // Use the resolve surface state - SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress; - uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry), - pResolveSurface->arrayIndex + renderTargetArrayIndex, pResolveSurface->arrayIndex + renderTargetArrayIndex, - 0, pResolveSurface->lod, pResolveSurface); - { - ConvertPixelFromFloat<DstFormat>(pDst, resolveColor); - } - } - } - } - } - -}; - -template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat> -{}; - 
-////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> -{ - typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; - static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; - const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; - - uint8_t* ppDsts[] = - { - pDst, // row 0, col 0 - pDst + pDstSurface->pitch, // row 1, col 0 - pDst + dx / 2, // row 0, col 1 - pDst + pDstSurface->pitch + dx / 2 // row 
1, col 1 - }; - - for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) - { - for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) - { - ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); - - pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; - - ppDsts[0] += dx; - ppDsts[1] += dx; - ppDsts[2] += dx; - ppDsts[3] += dx; - } - - ppDsts[0] += dy; - ppDsts[1] += dy; - ppDsts[2] += dy; - ppDsts[3] += dy; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> -{ - typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; - static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. 
- INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; - const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; - - uint8_t* ppDsts[] = - { - pDst, // row 0, col 0 - pDst + pDstSurface->pitch, // row 1, col 0 - pDst + dx / 2, // row 0, col 1 - pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 - }; - - for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) - { - for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) - { - ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); - - pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; - - ppDsts[0] += dx; - ppDsts[1] += dx; - ppDsts[2] += dx; - ppDsts[3] += dx; - } - - ppDsts[0] += dy; - ppDsts[1] += dy; - ppDsts[2] += dy; - ppDsts[3] += dy; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> -{ - typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> 
GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; - static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; - const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; - - uint8_t* ppDsts[] = - { - pDst, // row 0, col 0 - pDst + pDstSurface->pitch, // row 1, col 0 - pDst + dx / 2, // row 0, col 1 - pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 - }; - - for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) - { - for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) - { - ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); - - pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; - - ppDsts[0] += dx; - ppDsts[1] += dx; - ppDsts[2] += dx; - ppDsts[3] += dx; - } - - ppDsts[0] += dy; - ppDsts[1] += dy; - ppDsts[2] += dy; - ppDsts[3] 
+= dy; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> -{ - typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; - static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; - static const size_t MAX_DST_COLUMN_BYTES = 16; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; - const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; - - // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) - 
static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets"); - - uint8_t *ppDsts[] = - { - pDst, // row 0, col 0 - pDst + pDstSurface->pitch, // row 1, col 0 - pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 - pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 - pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3 - }; - - for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) - { - // Raster tile width is same as simd16 tile width - static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); - - ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); - - pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; - - for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) - { - ppDsts[i] += dy; - } - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> -{ - typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; - static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; - static const size_t MAX_DST_COLUMN_BYTES = 16; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. 
- INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; - const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; - - // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) - static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets"); - - uint8_t* ppDsts[] = - { - pDst, // row 0, col 0 - pDst + pDstSurface->pitch, // row 1, col 0 - pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 - pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 - pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3 - pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4 - pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5, // row 1, col 5 - pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6 - pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7 - pDst + pDstSurface->pitch + 
MAX_DST_COLUMN_BYTES * 7, // row 1, col 7 - }; - - for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) - { - // Raster tile width is same as simd16 tile width - static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); - - ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); - - pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; - - for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) - { - ppDsts[i] += dy; - } - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> -{ - typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. 
- INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - static const uint32_t DestRowWidthBytes = 16; // 16B rows - - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. - // We can compute the offsets to each column within the raster tile once and increment from these. - // There will be 4 8x2 simd tiles in an 8x8 raster tile. - uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; - - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
- uint8_t *ppDsts[] = - { - pDst, - pDst + DestRowWidthBytes, - pDst + DestRowWidthBytes / 4, - pDst + DestRowWidthBytes + DestRowWidthBytes / 4 - }; - - for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) - { - // Raster tile width is same as simd16 tile width - static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); - - ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); - - pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; - - ppDsts[0] += dy; - ppDsts[1] += dy; - ppDsts[2] += dy; - ppDsts[3] += dy; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> -{ - typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. 
- INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - static const uint32_t DestRowWidthBytes = 16; // 16B rows - - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. - // We can compute the offsets to each column within the raster tile once and increment from these. - // There will be 4 8x2 simd tiles in an 8x8 raster tile. - uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; - - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
- uint8_t *ppDsts[] = - { - pDst, - pDst + DestRowWidthBytes, - pDst + DestRowWidthBytes / 2, - pDst + DestRowWidthBytes + DestRowWidthBytes / 2 - }; - - for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) - { - // Raster tile width is same as simd16 tile width - static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); - - ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); - - pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; - - ppDsts[0] += dy; - ppDsts[1] += dy; - ppDsts[2] += dy; - ppDsts[3] += dy; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> -{ - typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; - static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. 
- INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - static const uint32_t DestRowWidthBytes = 512; // 512B rows - - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. - // We can compute the offsets to each column within the raster tile once and increment from these. - uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; - const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; - - uint8_t* ppDsts[] = - { - pDst, // row 0, col 0 - pDst + DestRowWidthBytes, // row 1, col 0 - pDst + dx / 2, // row 0, col 1 - pDst + DestRowWidthBytes + dx / 2 // row 1, col 1 - }; - - for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) - { - for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) - { - ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); - - pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; - - ppDsts[0] += dx; - ppDsts[1] += dx; - ppDsts[2] += dx; - ppDsts[3] += dx; - } - - ppDsts[0] += dy; - ppDsts[1] += dy; - ppDsts[2] += dy; - ppDsts[3] += dy; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp 
-////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> -{ - typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - static const uint32_t DestRowWidthBytes = 16; // 16B rows - static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. - - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. - // We can compute the offsets to each column within the raster tile once and increment from these. - // There will be 4 8x2 simd tiles in an 8x8 raster tile. 
- uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) - const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; - - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. - uint8_t *ppDsts[] = - { - pDst, // row 0, col 0 - pDst + DestRowWidthBytes, // row 1, col 0 - pDst + DestColumnBytes, // row 0, col 1 - pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1 - }; - - for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) - { - // Raster tile width is same as simd16 tile width - static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); - - ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); - - pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; - - ppDsts[0] += dy; - ppDsts[1] += dy; - ppDsts[2] += dy; - ppDsts[3] += dy; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> -{ - typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. 
- INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - static const uint32_t DestRowWidthBytes = 16; // 16B rows - static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. - - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. - // We can compute the offsets to each column within the raster tile once and increment from these. - // There will be 4 8x2 simd tiles in an 8x8 raster tile. - uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) - const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; - - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
- uint8_t *ppDsts[] = - { - pDst, // row 0, col 0 - pDst + DestRowWidthBytes, // row 1, col 0 - pDst + DestColumnBytes, // row 0, col 1 - pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1 - pDst + DestColumnBytes * 2, // row 0, col 2 - pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2 - pDst + DestColumnBytes * 3, // row 0, col 3 - pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3 - }; - - for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) - { - // Raster tile width is same as simd16 tile width - static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); - - ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); - - pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; - - for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) - { - ppDsts[i] += dy; - } - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> -{ - typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. 
- INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - static const uint32_t DestRowWidthBytes = 16; // 16B rows - static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. - - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. - // We can compute the offsets to each column within the raster tile once and increment from these. - // There will be 4 8x2 simd tiles in an 8x8 raster tile. - uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) - const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; - - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
- uint8_t *ppDsts[] = - { - pDst, // row 0, col 0 - pDst + DestRowWidthBytes, // row 1, col 0 - pDst + DestColumnBytes, // row 0, col 1 - pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1 - pDst + DestColumnBytes * 2, // row 0, col 2 - pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2 - pDst + DestColumnBytes * 3, // row 0, col 3 - pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3 - pDst + DestColumnBytes * 4, // row 0, col 4 - pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4 - pDst + DestColumnBytes * 5, // row 0, col 5 - pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5 - pDst + DestColumnBytes * 6, // row 0, col 6 - pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6 - pDst + DestColumnBytes * 7, // row 0, col 7 - pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7 - }; - - for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) - { - // Raster tile width is same as simd16 tile width - static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); - - ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); - - pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; - - for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) - { - ppDsts[i] += dy; - } - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// StoreMacroTile - Stores a macro tile which consists of raster tiles. -////////////////////////////////////////////////////////////////////////// -template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> -struct StoreMacroTile -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores a macrotile to the destination surface using safe implementation. - /// @param pSrc - Pointer to macro tile. 
- /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to macro tile - static void StoreGeneric( - uint8_t *pSrcHotTile, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) - { - PFN_STORE_TILES_INTERNAL pfnStore; - pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store; - - // Store each raster tile from the hot tile to the destination surface. - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) - { - pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); - pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); - } - } - } - - } - - typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t); - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores a macrotile to the destination surface. - /// @param pSrc - Pointer to macro tile. 
- /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to macro tile - static void Store( - uint8_t *pSrcHotTile, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) - { - PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES]; - - for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) - { - size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>( - 0, - 0, - pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces - pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays - sampleNum, - pDstSurface->lod, - pDstSurface); - - // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear - bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) || - (pDstSurface->bInterleavedSamples); - - pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store; - } - - // Save original for pSrcHotTile resolve. - uint8_t *pResolveSrcHotTile = pSrcHotTile; - - // Store each raster tile from the hot tile to the destination surface. - for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) - { - pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); - pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); - } - } - } - - if (pDstSurface->xpAuxBaseAddress) - { - uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); - // Store each raster tile from the hot tile to the destination surface. 
- for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - StoreRasterTile<TTraits, SrcFormat, DstFormat>::Resolve(pResolveSrcHotTile, pDstSurface, (x + col), (y + row), sampleOffset, renderTargetArrayIndex); - pResolveSrcHotTile += sampleOffset * pDstSurface->numSamples; - } - } - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// InitStoreTilesTable - Helper for setting up the tables. -template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT> -void InitStoreTilesTableColor_Half1( - PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT]) -{ - table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store; - table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store; - table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store; - table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store; - table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store; - table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store; - table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store; - table[TTileMode][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store; - table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store; - table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, 
R32G32B32_SSCALED>::Store; - table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store; - table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store; - table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store; - table[TTileMode][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store; - table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store; - table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store; - table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store; - table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store; - table[TTileMode][R32G32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store; - table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store; - table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store; - table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store; - table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store; - table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store; - table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, 
R16G16B16A16_USCALED>::Store; - table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store; - table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store; - table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store; - table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store; - table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric; - table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric; - table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric; - table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store; - table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store; - table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store; - table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store; - table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store; - table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store; - table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store; - table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store; - table[TTileMode][R16G16_UINT] = 
StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store; - table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store; - table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric; - table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric; - table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric; - table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric; - table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store; - table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store; - table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store; - table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric; - table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric; - table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store; - table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store; - table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store; - table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store; - table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, 
R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store; -} - -template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT> -void InitStoreTilesTableColor_Half2( - PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT]) -{ - table[TTileMode][R9G9B9E5_SHAREDEXP] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric; - table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric; - table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric; - table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store; - table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store; - table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store; - table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store; - table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store; - table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store; - table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store; - table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric; - table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric; - table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric; - table[TTileMode][B4G4R4A4_UNORM] = 
StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric; - table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric; - table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store; - table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store; - table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store; - table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store; - table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store; - table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store; - table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store; - table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store; - table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store; - table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store; - table[TTileMode][A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store; - table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric; - table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric; - table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store; - table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, 
R8G8_USCALED>::Store; - table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store; - table[TTileMode][R16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store; - table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric; - table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric; - table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store; - table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store; - table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store; - table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store; - table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store; - table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store; - table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store; - table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store; - table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store; - table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store; - table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store; - table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store; - table[TTileMode][R16G16B16_UNORM] = 
StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store; - table[TTileMode][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store; - table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store; - table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store; - table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store; - table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store; - table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store; - table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric; - table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric; - table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric; - table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric; - table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric; - table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric; - table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric; - table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric; - 
table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric; - table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store; - table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store; -} - -////////////////////////////////////////////////////////////////////////// -/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. -template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT> -void InitStoreTilesTableDepth( - PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) -{ - table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store; - table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store; - table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store; - table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store; -} - -template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT> -void InitStoreTilesTableStencil( - PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) -{ - table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store; -} - - -////////////////////////////////////////////////////////////////////////// -/// @brief Deswizzles and stores a full hottile to a render surface -/// @param hPrivateContext - Handle to private DC -/// @param srcFormat - Format for hot tile. -/// @param renderTargetIndex - Index to destination render target -/// @param x, y - Coordinates to raster tile. 
-/// @param pSrcHotTile - Pointer to Hot Tile -void SwrStoreHotTileToSurface( - HANDLE hWorkerPrivateData, - SWR_SURFACE_STATE *pDstSurface, - BucketManager* pBucketMgr, - SWR_FORMAT srcFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, - uint8_t *pSrcHotTile); diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear.cpp deleted file mode 100644 index c72063f6f1d..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file StoreTile_Linear.cpp -* -* @brief Functionality for Store. 
-* -******************************************************************************/ -#include "StoreTile.h" - -void InitStoreTilesTable_Linear_1() -{ - InitStoreTilesTableColor_Half1<SWR_TILE_NONE>(sStoreTilesTableColor); - InitStoreTilesTableDepth<SWR_TILE_NONE>(sStoreTilesTableDepth); - InitStoreTilesTableStencil<SWR_TILE_NONE>(sStoreTilesTableStencil); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear2.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear2.cpp deleted file mode 100644 index 035e685e261..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear2.cpp +++ /dev/null @@ -1,33 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file StoreTile_Linear.cpp -* -* @brief Functionality for Store. 
-* -******************************************************************************/ -#include "StoreTile.h" - -void InitStoreTilesTable_Linear_2() -{ - InitStoreTilesTableColor_Half2<SWR_TILE_NONE>(sStoreTilesTableColor); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileW.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileW.cpp deleted file mode 100644 index ee4d99d1da0..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileW.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file StoreTile_TileW.cpp -* -* @brief Functionality for Store. 
-* -******************************************************************************/ -#include "StoreTile.h" - -void InitStoreTilesTable_TileW() -{ - InitStoreTilesTableStencil<SWR_TILE_MODE_WMAJOR>(sStoreTilesTableStencil); - // special color hot tile -> 8-bit WMAJOR - sStoreTilesTableColor[SWR_TILE_MODE_WMAJOR][R8_UINT] = StoreMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store; -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX.cpp deleted file mode 100644 index 7f49a432e92..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX.cpp +++ /dev/null @@ -1,33 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. 
-* -* @file StoreTile_TIleX.cpp -* -* @brief Functionality for Store. -* -******************************************************************************/ -#include "StoreTile.h" - -void InitStoreTilesTable_TileX_1() -{ - InitStoreTilesTableColor_Half1<SWR_TILE_MODE_XMAJOR>(sStoreTilesTableColor); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX2.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX2.cpp deleted file mode 100644 index 7e36ebececb..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX2.cpp +++ /dev/null @@ -1,33 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file StoreTile_TIleX.cpp -* -* @brief Functionality for Store. 
-* -******************************************************************************/ -#include "StoreTile.h" - -void InitStoreTilesTable_TileX_2() -{ - InitStoreTilesTableColor_Half2<SWR_TILE_MODE_XMAJOR>(sStoreTilesTableColor); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY.cpp deleted file mode 100644 index dade03f2523..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file StoreTile_TileY.cpp -* -* @brief Functionality for Store. 
-* -******************************************************************************/ -#include "StoreTile.h" - -void InitStoreTilesTable_TileY_1() -{ - InitStoreTilesTableColor_Half1<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableColor); - InitStoreTilesTableDepth<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableDepth); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY2.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY2.cpp deleted file mode 100644 index b3ac76759fd..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY2.cpp +++ /dev/null @@ -1,33 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2016 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file StoreTile_TileY.cpp -* -* @brief Functionality for Store. 
-* -******************************************************************************/ -#include "StoreTile.h" - -void InitStoreTilesTable_TileY_2() -{ - InitStoreTilesTableColor_Half2<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableColor); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/SurfaceState.h b/src/gallium/drivers/swr/rasterizer/memory/SurfaceState.h deleted file mode 100644 index 6b1b78eee46..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/SurfaceState.h +++ /dev/null @@ -1,66 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2014-2019 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. 
-* -* @file SurfaceState.h -* -* @brief Common definitions for surface state -* -******************************************************************************/ -#pragma once - -#include "core/state.h" - -////////////////////////////////////////////////////////////////////////// -/// SWR_SURFACE_STATE -////////////////////////////////////////////////////////////////////////// -struct SWR_SURFACE_STATE -{ - gfxptr_t xpBaseAddress; - SWR_SURFACE_TYPE type; // @llvm_enum - SWR_FORMAT format; // @llvm_enum - uint32_t width; - uint32_t height; - uint32_t depth; - uint32_t numSamples; - uint32_t samplePattern; - uint32_t pitch; - uint32_t qpitch; - uint32_t minLod; // for sampled surfaces, the most detailed LOD that can be accessed by sampler - uint32_t maxLod; // for sampled surfaces, the max LOD that can be accessed - float resourceMinLod; // for sampled surfaces, the most detailed fractional mip that can be - // accessed by sampler - uint32_t lod; // for render targets, the lod being rendered to - uint32_t arrayIndex; // for render targets, the array index being rendered to for arrayed surfaces - SWR_TILE_MODE tileMode; // @llvm_enum - uint32_t halign; - uint32_t valign; - uint32_t xOffset; - uint32_t yOffset; - - uint32_t lodOffsets[2][15]; // lod offsets for sampled surfaces - - gfxptr_t xpAuxBaseAddress; // Used for compression, append/consume counter, etc. - SWR_AUX_MODE auxMode; // @llvm_enum - - - bool bInterleavedSamples; // are MSAA samples stored interleaved or planar -};
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h deleted file mode 100644 index 90143718eb8..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h +++ /dev/null @@ -1,697 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file TilingFunctions.h -* -* @brief Tiling functions. 
-* -******************************************************************************/ -#pragma once - -#include "core/state.h" -#include "core/format_traits.h" -#include "memory/tilingtraits.h" -#include "memory/SurfaceState.h" - -#include <algorithm> - -#define MAX_NUM_LOD 15 - -#define GFX_ALIGN(x, a) (((x) + ((a) - 1)) - (((x) + ((a) - 1)) & ((a) - 1))) // Alt implementation with bitwise not (~) has issue with uint32 align used with 64-bit value, since ~'ed value will remain 32-bit. - -////////////////////////////////////////////////////////////////////////// -/// SimdTile SSE(2x2), AVX(4x2), or AVX-512(4x4?) -////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT HotTileFormat, SWR_FORMAT SrcOrDstFormat> -struct SimdTile -{ - // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa ) - float color[FormatTraits<HotTileFormat>::numComps][KNOB_SIMD_WIDTH]; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve color from simd. - /// @param index - linear index to color within simd. - /// @param outputColor - output color - INLINE void GetSwizzledColor( - uint32_t index, - float outputColor[4]) - { - // SOA pattern for 2x2 is a subset of 4x2. - // 0 1 4 5 - // 2 3 6 7 - // The offset converts pattern to linear -#if (SIMD_TILE_X_DIM == 4) - static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 }; -#elif (SIMD_TILE_X_DIM == 2) - static const uint32_t offset[] = { 0, 1, 2, 3 }; -#endif - - for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i) - { - outputColor[i] = this->color[FormatTraits<SrcOrDstFormat>::swizzle(i)][offset[index]]; - } - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve color from simd. - /// @param index - linear index to color within simd. 
- /// @param outputColor - output color - INLINE void SetSwizzledColor( - uint32_t index, - const float src[4]) - { - // SOA pattern for 2x2 is a subset of 4x2. - // 0 1 4 5 - // 2 3 6 7 - // The offset converts pattern to linear -#if (SIMD_TILE_X_DIM == 4) - static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 }; -#elif (SIMD_TILE_X_DIM == 2) - static const uint32_t offset[] = { 0, 1, 2, 3 }; -#endif - - // Only loop over the components needed for destination. - for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i) - { - this->color[i][offset[index]] = src[i]; - } - } -}; - -template<> -struct SimdTile <R8_UINT,R8_UINT> -{ - // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa ) - uint8_t color[FormatTraits<R8_UINT>::numComps][KNOB_SIMD_WIDTH]; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve color from simd. - /// @param index - linear index to color within simd. - /// @param outputColor - output color - INLINE void GetSwizzledColor( - uint32_t index, - float outputColor[4]) - { - // SOA pattern for 2x2 is a subset of 4x2. - // 0 1 4 5 - // 2 3 6 7 - // The offset converts pattern to linear -#if (SIMD_TILE_X_DIM == 4) - static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 }; -#elif (SIMD_TILE_X_DIM == 2) - static const uint32_t offset[] = { 0, 1, 2, 3 }; -#endif - - for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i) - { - uint32_t src = this->color[FormatTraits<R8_UINT>::swizzle(i)][offset[index]]; - outputColor[i] = *(float*)&src; - } - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve color from simd. - /// @param index - linear index to color within simd. - /// @param outputColor - output color - INLINE void SetSwizzledColor( - uint32_t index, - const float src[4]) - { - // SOA pattern for 2x2 is a subset of 4x2. 
- // 0 1 4 5 - // 2 3 6 7 - // The offset converts pattern to linear -#if (SIMD_TILE_X_DIM == 4) - static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 }; -#elif (SIMD_TILE_X_DIM == 2) - static const uint32_t offset[] = { 0, 1, 2, 3 }; -#endif - - // Only loop over the components needed for destination. - for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i) - { - this->color[i][offset[index]] = *(uint8_t*)&src[i]; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// SimdTile 8x2 for AVX-512 -////////////////////////////////////////////////////////////////////////// - -template<SWR_FORMAT HotTileFormat, SWR_FORMAT SrcOrDstFormat> -struct SimdTile_16 -{ - // SimdTile is SOA (e.g. rrrrrrrrrrrrrrrr gggggggggggggggg bbbbbbbbbbbbbbbb aaaaaaaaaaaaaaaa ) - float color[FormatTraits<HotTileFormat>::numComps][KNOB_SIMD16_WIDTH]; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve color from simd. - /// @param index - linear index to color within simd. - /// @param outputColor - output color - INLINE void GetSwizzledColor( - uint32_t index, - float outputColor[4]) - { - // SOA pattern for 8x2.. - // 0 1 4 5 8 9 C D - // 2 3 6 7 A B E F - // The offset converts pattern to linear - static const uint32_t offset[KNOB_SIMD16_WIDTH] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }; - - for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i) - { - outputColor[i] = this->color[FormatTraits<SrcOrDstFormat>::swizzle(i)][offset[index]]; - } - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve color from simd. - /// @param index - linear index to color within simd. - /// @param outputColor - output color - INLINE void SetSwizzledColor( - uint32_t index, - const float src[4]) - { - // SOA pattern for 8x2.. 
- // 0 1 4 5 8 9 C D - // 2 3 6 7 A B E F - // The offset converts pattern to linear - static const uint32_t offset[KNOB_SIMD16_WIDTH] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }; - - for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i) - { - this->color[i][offset[index]] = src[i]; - } - } -}; - -template<> -struct SimdTile_16 <R8_UINT, R8_UINT> -{ - // SimdTile is SOA (e.g. rrrrrrrrrrrrrrrr gggggggggggggggg bbbbbbbbbbbbbbbb aaaaaaaaaaaaaaaa ) - uint8_t color[FormatTraits<R8_UINT>::numComps][KNOB_SIMD16_WIDTH]; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve color from simd. - /// @param index - linear index to color within simd. - /// @param outputColor - output color - INLINE void GetSwizzledColor( - uint32_t index, - float outputColor[4]) - { - // SOA pattern for 8x2.. - // 0 1 4 5 8 9 C D - // 2 3 6 7 A B E F - // The offset converts pattern to linear - static const uint32_t offset[KNOB_SIMD16_WIDTH] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }; - - for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i) - { - uint32_t src = this->color[FormatTraits<R8_UINT>::swizzle(i)][offset[index]]; - outputColor[i] = *(float*)&src; - } - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve color from simd. - /// @param index - linear index to color within simd. - /// @param outputColor - output color - INLINE void SetSwizzledColor( - uint32_t index, - const float src[4]) - { - // SOA pattern for 8x2.. 
- // 0 1 4 5 8 9 C D - // 2 3 6 7 A B E F - // The offset converts pattern to linear - static const uint32_t offset[KNOB_SIMD16_WIDTH] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }; - - for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i) - { - this->color[i][offset[index]] = *(uint8_t*)&src[i]; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Computes lod offset for 1D surface at specified lod. -/// @param baseWidth - width of basemip (mip 0). -/// @param hAlign - horizontal alignment per miip, in texels -/// @param lod - lod index -/// @param offset - output offset. -INLINE void ComputeLODOffset1D( - const SWR_FORMAT_INFO& info, - uint32_t baseWidth, - uint32_t hAlign, - uint32_t lod, - uint32_t &offset) -{ - if (lod == 0) - { - offset = 0; - } - else - { - uint32_t curWidth = baseWidth; - // @note hAlign is already in blocks for compressed formats so upconvert - // so that we have the desired alignment post-divide. - if (info.isBC) - { - hAlign *= info.bcWidth; - } - - offset = GFX_ALIGN(curWidth, hAlign); - for (uint32_t l = 1; l < lod; ++l) - { - curWidth = std::max<uint32_t>(curWidth >> 1, 1U); - offset += GFX_ALIGN(curWidth, hAlign); - } - - if (info.isSubsampled || info.isBC) - { - offset /= info.bcWidth; - } - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Computes x lod offset for 2D surface at specified lod. -/// @param baseWidth - width of basemip (mip 0). -/// @param hAlign - horizontal alignment per mip, in texels -/// @param lod - lod index -/// @param offset - output offset. -INLINE void ComputeLODOffsetX( - const SWR_FORMAT_INFO& info, - uint32_t baseWidth, - uint32_t hAlign, - uint32_t lod, - uint32_t &offset) -{ - if (lod < 2) - { - offset = 0; - } - else - { - uint32_t curWidth = baseWidth; - // @note hAlign is already in blocks for compressed formats so upconvert - // so that we have the desired alignment post-divide. 
- if (info.isBC) - { - hAlign *= info.bcWidth; - } - - curWidth = std::max<uint32_t>(curWidth >> 1, 1U); - curWidth = GFX_ALIGN(curWidth, hAlign); - - if (info.isSubsampled || info.isBC) - { - curWidth /= info.bcWidth; - } - - offset = curWidth; - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Computes y lod offset for 2D surface at specified lod. -/// @param baseWidth - width of basemip (mip 0). -/// @param vAlign - vertical alignment per mip, in rows -/// @param lod - lod index -/// @param offset - output offset. -INLINE void ComputeLODOffsetY( - const SWR_FORMAT_INFO& info, - uint32_t baseHeight, - uint32_t vAlign, - uint32_t lod, - uint32_t &offset) -{ - if (lod == 0) - { - offset = 0; - } - else - { - offset = 0; - uint32_t mipHeight = baseHeight; - - // @note vAlign is already in blocks for compressed formats so upconvert - // so that we have the desired alignment post-divide. - if (info.isBC) - { - vAlign *= info.bcHeight; - } - - for (uint32_t l = 1; l <= lod; ++l) - { - uint32_t alignedMipHeight = GFX_ALIGN(mipHeight, vAlign); - offset += ((l != 2) ? alignedMipHeight : 0); - mipHeight = std::max<uint32_t>(mipHeight >> 1, 1U); - } - - if (info.isBC) - { - offset /= info.bcHeight; - } - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Computes 1D surface offset -/// @param x - offset from start of array slice at given lod. -/// @param array - array slice index -/// @param lod - lod index -/// @param pState - surface state -/// @param xOffsetBytes - output offset in bytes. 
-template<bool UseCachedOffsets> -INLINE void ComputeSurfaceOffset1D( - uint32_t x, - uint32_t array, - uint32_t lod, - const SWR_SURFACE_STATE *pState, - uint32_t &xOffsetBytes) -{ - const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format); - uint32_t lodOffset; - - if (UseCachedOffsets) - { - lodOffset = pState->lodOffsets[0][lod]; - } - else - { - ComputeLODOffset1D(info, pState->width, pState->halign, lod, lodOffset); - } - - xOffsetBytes = (array * pState->qpitch + lodOffset + x) * info.Bpp; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Adjusts the array slice for legacy TileY MSAA -/// @param pState - surface state -/// @param array - array slice index -/// @param sampleNum - requested sample -INLINE void AdjustCoordsForMSAA(const SWR_SURFACE_STATE *pState, uint32_t& x, uint32_t& y, uint32_t& arrayIndex, uint32_t sampleNum) -{ - /// @todo: might want to templatize adjusting for sample slices when we support tileYS/tileYF. - if((pState->tileMode == SWR_TILE_MODE_YMAJOR || - pState->tileMode == SWR_TILE_MODE_WMAJOR) && - pState->bInterleavedSamples) - { - uint32_t newX, newY, newSampleX, newSampleY; - switch(pState->numSamples) - { - case 1: - newX = x; - newY = y; - newSampleX = newSampleY = 0; - break; - case 2: - { - assert(pState->type == SURFACE_2D); - static const uint32_t xMask = 0xFFFFFFFD; - static const uint32_t sampleMaskX = 0x1; - newX = pdep_u32(x, xMask); - newY = y; - newSampleX = pext_u32(sampleNum, sampleMaskX); - newSampleY = 0; - } - break; - case 4: - { - assert(pState->type == SURFACE_2D); - static const uint32_t mask = 0xFFFFFFFD; - static const uint32_t sampleMaskX = 0x1; - static const uint32_t sampleMaskY = 0x2; - newX = pdep_u32(x, mask); - newY = pdep_u32(y, mask); - newSampleX = pext_u32(sampleNum, sampleMaskX); - newSampleY = pext_u32(sampleNum, sampleMaskY); - } - break; - case 8: - { - assert(pState->type == SURFACE_2D); - static const uint32_t xMask = 0xFFFFFFF9; - static const 
uint32_t yMask = 0xFFFFFFFD; - static const uint32_t sampleMaskX = 0x5; - static const uint32_t sampleMaskY = 0x2; - newX = pdep_u32(x, xMask); - newY = pdep_u32(y, yMask); - newSampleX = pext_u32(sampleNum, sampleMaskX); - newSampleY = pext_u32(sampleNum, sampleMaskY); - } - break; - case 16: - { - assert(pState->type == SURFACE_2D); - static const uint32_t mask = 0xFFFFFFF9; - static const uint32_t sampleMaskX = 0x5; - static const uint32_t sampleMaskY = 0xA; - newX = pdep_u32(x, mask); - newY = pdep_u32(y, mask); - newSampleX = pext_u32(sampleNum, sampleMaskX); - newSampleY = pext_u32(sampleNum, sampleMaskY); - } - break; - default: - assert(0 && "Unsupported sample count"); - newX = newY = 0; - newSampleX = newSampleY = 0; - break; - } - x = newX | (newSampleX << 1); - y = newY | (newSampleY << 1); - } - else if(pState->tileMode == SWR_TILE_MODE_YMAJOR || - pState->tileMode == SWR_TILE_NONE) - { - uint32_t sampleShift; - switch(pState->numSamples) - { - case 1: - assert(sampleNum == 0); - sampleShift = 0; - break; - case 2: - assert(pState->type == SURFACE_2D); - sampleShift = 1; - break; - case 4: - assert(pState->type == SURFACE_2D); - sampleShift = 2; - break; - case 8: - assert(pState->type == SURFACE_2D); - sampleShift = 3; - break; - case 16: - assert(pState->type == SURFACE_2D); - sampleShift = 4; - break; - default: - assert(0 && "Unsupported sample count"); - sampleShift = 0; - break; - } - arrayIndex = (arrayIndex << sampleShift) | sampleNum; - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Computes 2D surface offset -/// @param x - horizontal offset from start of array slice and lod. -/// @param y - vertical offset from start of array slice and lod. -/// @param array - array slice index -/// @param lod - lod index -/// @param pState - surface state -/// @param xOffsetBytes - output x offset in bytes. -/// @param yOffsetRows - output y offset in bytes. 
-template<bool UseCachedOffsets> -INLINE void ComputeSurfaceOffset2D(uint32_t x, uint32_t y, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows) -{ - const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format); - uint32_t lodOffsetX, lodOffsetY; - - if (UseCachedOffsets) - { - lodOffsetX = pState->lodOffsets[0][lod]; - lodOffsetY = pState->lodOffsets[1][lod]; - } - else - { - ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX); - ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY); - } - - AdjustCoordsForMSAA(pState, x, y, array, sampleNum); - xOffsetBytes = (x + lodOffsetX + pState->xOffset) * info.Bpp; - yOffsetRows = (array * pState->qpitch) + lodOffsetY + y + pState->yOffset; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Computes 3D surface offset -/// @param x - horizontal offset from start of array slice and lod. -/// @param y - vertical offset from start of array slice and lod. -/// @param z - depth offset from start of array slice and lod. -/// @param lod - lod index -/// @param pState - surface state -/// @param xOffsetBytes - output x offset in bytes. -/// @param yOffsetRows - output y offset in rows. -/// @param zOffsetSlices - output y offset in slices. 
-template<bool UseCachedOffsets> -INLINE void ComputeSurfaceOffset3D(uint32_t x, uint32_t y, uint32_t z, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows, uint32_t &zOffsetSlices) -{ - const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format); - uint32_t lodOffsetX, lodOffsetY; - - if (UseCachedOffsets) - { - lodOffsetX = pState->lodOffsets[0][lod]; - lodOffsetY = pState->lodOffsets[1][lod]; - } - else - { - ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX); - ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY); - } - - xOffsetBytes = (x + lodOffsetX) * info.Bpp; - yOffsetRows = lodOffsetY + y; - zOffsetSlices = z; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Swizzles the linear x,y offsets depending on surface tiling mode -/// and returns final surface address -/// @param xOffsetBytes - x offset from base of surface in bytes -/// @param yOffsetRows - y offset from base of surface in rows -/// @param pState - pointer to the surface state -template<typename TTraits> -INLINE uint32_t ComputeTileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState) -{ - return ComputeOffset2D<TTraits>(pState->pitch, xOffsetBytes, yOffsetRows); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Swizzles the linear x,y offsets depending on surface tiling mode -/// and returns final surface address -/// @param xOffsetBytes - x offset from base of surface in bytes -/// @param yOffsetRows - y offset from base of surface in rows -/// @param pState - pointer to the surface state -template<typename TTraits> -INLINE uint32_t ComputeTileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState) -{ - return ComputeOffset3D<TTraits>(pState->qpitch, pState->pitch, xOffsetBytes, yOffsetRows, zOffsetSlices); -} - 
-////////////////////////////////////////////////////////////////////////// -/// @brief Swizzles the linear x,y offsets depending on surface tiling mode -/// and returns final surface address -/// @param xOffsetBytes - x offset from base of surface in bytes -/// @param yOffsetRows - y offset from base of surface in rows -/// @param pState - pointer to the surface state -INLINE -uint32_t TileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState) -{ - switch (pState->tileMode) - { - case SWR_TILE_NONE: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_NONE, 32> >(xOffsetBytes, yOffsetRows, pState); - case SWR_TILE_SWRZ: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_SWRZ, 32> >(xOffsetBytes, yOffsetRows, pState); - case SWR_TILE_MODE_XMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_XMAJOR, 8> >(xOffsetBytes, yOffsetRows, pState); - case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(xOffsetBytes, yOffsetRows, pState); - case SWR_TILE_MODE_WMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_WMAJOR, 8> >(xOffsetBytes, yOffsetRows, pState); - default: SWR_INVALID("Unsupported tiling mode"); - } - return 0; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode -/// and returns final surface address -/// @param xOffsetBytes - x offset from base of surface in bytes -/// @param yOffsetRows - y offset from base of surface in rows -/// @param zOffsetSlices - z offset from base of surface in slices -/// @param pState - pointer to the surface state -INLINE -uint32_t TileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState) -{ - switch (pState->tileMode) - { - case SWR_TILE_NONE: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_NONE, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState); - case SWR_TILE_SWRZ: 
return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_SWRZ, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState); - case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState); - default: SWR_INVALID("Unsupported tiling mode"); - } - return 0; -} - -template<bool UseCachedOffsets> -INLINE -uint32_t ComputeSurfaceOffset(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState) -{ - uint32_t offsetX = 0, offsetY = 0, offsetZ = 0; - switch (pState->type) - { - case SURFACE_BUFFER: - case SURFACE_STRUCTURED_BUFFER: - offsetX = x * pState->pitch; - return offsetX; - break; - case SURFACE_1D: - ComputeSurfaceOffset1D<UseCachedOffsets>(x, array, lod, pState, offsetX); - return TileSwizzle2D(offsetX, 0, pState); - break; - case SURFACE_2D: - ComputeSurfaceOffset2D<UseCachedOffsets>(x, y, array, sampleNum, lod, pState, offsetX, offsetY); - return TileSwizzle2D(offsetX, offsetY, pState); - case SURFACE_3D: - ComputeSurfaceOffset3D<UseCachedOffsets>(x, y, z, lod, pState, offsetX, offsetY, offsetZ); - return TileSwizzle3D(offsetX, offsetY, offsetZ, pState); - break; - case SURFACE_CUBE: - ComputeSurfaceOffset2D<UseCachedOffsets>(x, y, array, sampleNum, lod, pState, offsetX, offsetY); - return TileSwizzle2D(offsetX, offsetY, pState); - break; - default: SWR_INVALID("Unsupported format"); - } - - return 0; -} - -typedef void*(*PFN_COMPUTESURFADDR)(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, const SWR_SURFACE_STATE*); - -////////////////////////////////////////////////////////////////////////// -/// @brief Computes surface address at the given location and lod -/// @param x - x location in pixels -/// @param y - y location in rows -/// @param z - z location for 3D surfaces -/// @param array - array slice for 1D and 2D surfaces -/// @param lod - level of detail -/// @param pState - pointer to the surface state 
-template<bool UseCachedOffsets, bool IsRead> -INLINE -void* ComputeSurfaceAddress(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState) -{ - return (void*)(pState->xpBaseAddress + ComputeSurfaceOffset<UseCachedOffsets>(x, y, z, array, sampleNum, lod, pState)); -} diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h deleted file mode 100644 index c2a87d85dd1..00000000000 --- a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h +++ /dev/null @@ -1,207 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file tilingtraits.h -* -* @brief Tiling traits. 
-* -******************************************************************************/ -#pragma once - -#include "core/state.h" -#include "common/intrin.h" - -template<SWR_TILE_MODE mode, int> -struct TilingTraits -{ - static const SWR_TILE_MODE TileMode{ mode }; - static UINT GetCu() { SWR_NOT_IMPL; return 0; } - static UINT GetCv() { SWR_NOT_IMPL; return 0; } - static UINT GetCr() { SWR_NOT_IMPL; return 0; } - static UINT GetTileIDShift() { SWR_NOT_IMPL; return 0; } - - /// @todo correct pdep shifts for all rastertile dims. Unused for now - static UINT GetPdepX() { SWR_NOT_IMPL; return 0x37; } - static UINT GetPdepY() { SWR_NOT_IMPL; return 0xC8; } -}; - -template<int X> struct TilingTraits <SWR_TILE_NONE, X> -{ - static const SWR_TILE_MODE TileMode{ SWR_TILE_NONE }; - static UINT GetCu() { return 0; } - static UINT GetCv() { return 0; } - static UINT GetCr() { return 0; } - static UINT GetTileIDShift() { return 0; } - static UINT GetPdepX() { return 0x00; } - static UINT GetPdepY() { return 0x00; } -}; - -template<> struct TilingTraits <SWR_TILE_SWRZ, 8> -{ - static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; - static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT; } - static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } - static UINT GetCr() { return 0; } - static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT; } - - /// @todo correct pdep shifts for all rastertile dims. 
Unused for now - static UINT GetPdepX() { SWR_NOT_IMPL; return 0x00; } - static UINT GetPdepY() { SWR_NOT_IMPL; return 0x00; } -}; - -template<> struct TilingTraits <SWR_TILE_SWRZ, 32> -{ - static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; - static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 2; } - static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } - static UINT GetCr() { return 0; } - static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 2; } - - static UINT GetPdepX() { return 0x37; } - static UINT GetPdepY() { return 0xC8; } -}; - -template<> struct TilingTraits <SWR_TILE_SWRZ, 128> -{ - static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; - static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 4; } - static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } - static UINT GetCr() { return 0; } - static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 4; } - - /// @todo correct pdep shifts for all rastertile dims. Unused for now - static UINT GetPdepX() { SWR_NOT_IMPL; return 0x37; } - static UINT GetPdepY() { SWR_NOT_IMPL; return 0xC8; } -}; - -// y-major tiling layout unaffected by element size -template<int X> struct TilingTraits <SWR_TILE_MODE_YMAJOR, X> -{ - static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_YMAJOR }; - static UINT GetCu() { return 7; } - static UINT GetCv() { return 5; } - static UINT GetCr() { return 0; } - static UINT GetTileIDShift() { return 12; } - - static UINT GetPdepX() { return 0xe0f; } - static UINT GetPdepY() { return 0x1f0; } -}; - -// x-major tiling layout unaffected by element size -template<int X> struct TilingTraits <SWR_TILE_MODE_XMAJOR, X> -{ - static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_XMAJOR }; - static UINT GetCu() { return 9; } - static UINT GetCv() { return 3; } - static UINT GetCr() { return 0; } - static UINT GetTileIDShift() { return 12; } - - static UINT GetPdepX() { return 0x1ff; } - static UINT GetPdepY() { return 0xe00; } -}; - 
-template<int X> struct TilingTraits <SWR_TILE_MODE_WMAJOR, X> -{ - static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_WMAJOR }; - static UINT GetCu() { return 6; } - static UINT GetCv() { return 6; } - static UINT GetCr() { return 0; } - static UINT GetTileIDShift() { return 12; } - - static UINT GetPdepX() { return 0xe15; } - static UINT GetPdepY() { return 0x1ea; } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Computes the tileID for 2D tiled surfaces -/// @param pitch - surface pitch in bytes -/// @param tileX - x offset in tiles -/// @param tileY - y offset in tiles -template<typename TTraits> -INLINE UINT ComputeTileOffset2D(UINT pitch, UINT tileX, UINT tileY) -{ - UINT tileID = tileY * (pitch >> TTraits::GetCu()) + tileX; - return tileID << TTraits::GetTileIDShift(); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Computes the tileID for 3D tiled surfaces -/// @param qpitch - surface qpitch in rows -/// @param pitch - surface pitch in bytes -/// @param tileX - x offset in tiles -/// @param tileY - y offset in tiles -/// @param tileZ - y offset in tiles -template<typename TTraits> -INLINE UINT ComputeTileOffset3D(UINT qpitch, UINT pitch, UINT tileX, UINT tileY, UINT tileZ) -{ - UINT tileID = (tileZ * (qpitch >> TTraits::GetCv()) + tileY) * (pitch >> TTraits::GetCu()) + tileX; - return tileID << TTraits::GetTileIDShift(); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Computes the byte offset for 2D tiled surfaces -/// @param pitch - surface pitch in bytes -/// @param x - x offset in bytes -/// @param y - y offset in rows -template<typename TTraits> -INLINE UINT ComputeOffset2D(UINT pitch, UINT x, UINT y) -{ - UINT tileID = ComputeTileOffset2D<TTraits>(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv()); - UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX()); - UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY()); - return (tileID 
| xSwizzle | ySwizzle); -} - -#if KNOB_ARCH <= KNOB_ARCH_AVX -////////////////////////////////////////////////////////////////////////// -/// @brief Computes the byte offset for 2D tiled surfaces. Specialization -/// for tile-y surfaces that uses bit twiddling instead of pdep emulation. -/// @param pitch - surface pitch in bytes -/// @param x - x offset in bytes -/// @param y - y offset in rows -template<> -INLINE UINT ComputeOffset2D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(UINT pitch, UINT x, UINT y) -{ - typedef TilingTraits<SWR_TILE_MODE_YMAJOR, 32> TTraits; - - UINT tileID = ComputeTileOffset2D<TTraits>(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv()); - UINT xSwizzle = ((x << 5) & 0xe00) | (x & 0xf); - UINT ySwizzle = (y << 4) & 0x1f0; - return (tileID | xSwizzle | ySwizzle); -} -#endif - -////////////////////////////////////////////////////////////////////////// -/// @brief Computes the byte offset for 3D tiled surfaces -/// @param qpitch - depth pitch in rows -/// @param pitch - surface pitch in bytes -/// @param x - x offset in bytes -/// @param y - y offset in rows -/// @param z - y offset in slices -template<typename TTraits> -INLINE UINT ComputeOffset3D(UINT qpitch, UINT pitch, UINT x, UINT y, UINT z) -{ - UINT tileID = ComputeTileOffset3D<TTraits>(qpitch, pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv(), z >> TTraits::GetCr()); - UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX()); - UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY()); - return (tileID | xSwizzle | ySwizzle); -} diff --git a/src/gallium/drivers/swr/swr_clear.cpp b/src/gallium/drivers/swr/swr_clear.cpp deleted file mode 100644 index d579cbdde9f..00000000000 --- a/src/gallium/drivers/swr/swr_clear.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ***************************************************************************/ - -#include "swr_context.h" -#include "swr_query.h" - -static void -swr_clear(struct pipe_context *pipe, - unsigned buffers, - const struct pipe_scissor_state *scissor_state, - const union pipe_color_union *color, - double depth, - unsigned stencil) -{ - struct swr_context *ctx = swr_context(pipe); - struct pipe_framebuffer_state *fb = &ctx->framebuffer; - - UINT clearMask = 0; - unsigned layers = 0; - - if (!swr_check_render_cond(pipe)) - return; - - swr_update_derived(pipe); - - if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) { - for (unsigned i = 0; i < fb->nr_cbufs; ++i) - if (fb->cbufs[i] && (buffers & (PIPE_CLEAR_COLOR0 << i))) { - clearMask |= (SWR_ATTACHMENT_COLOR0_BIT << i); - layers = std::max(layers, fb->cbufs[i]->u.tex.last_layer - - fb->cbufs[i]->u.tex.first_layer + 1u); - } - } - - if (buffers & PIPE_CLEAR_DEPTH && fb->zsbuf) { - clearMask |= SWR_ATTACHMENT_DEPTH_BIT; - layers = std::max(layers, fb->zsbuf->u.tex.last_layer - - fb->zsbuf->u.tex.first_layer + 1u); - } - - if (buffers & PIPE_CLEAR_STENCIL && fb->zsbuf) { - clearMask |= SWR_ATTACHMENT_STENCIL_BIT; - layers = std::max(layers, fb->zsbuf->u.tex.last_layer - - fb->zsbuf->u.tex.first_layer + 1u); - } - -#if 0 // XXX HACK, override clear color alpha. On ubuntu, clears are - // transparent. - ((union pipe_color_union *)color)->f[3] = 1.0; /* cast off your const'd-ness */ -#endif - - /* - * Always clear full surface. When GL_SCISSOR_TEST is enabled - * glClear is handled by state tracker and there is no need to do this here - */ - SWR_RECT clear_rect = {0, 0, (int32_t)fb->width, (int32_t)fb->height}; - - for (unsigned i = 0; i < layers; ++i) { - swr_update_draw_context(ctx); - ctx->api.pfnSwrClearRenderTarget(ctx->swrContext, clearMask, i, - color->f, depth, stencil, - clear_rect); - - // Mask out the attachments that are out of layers. 
- if (fb->zsbuf && - (fb->zsbuf->u.tex.last_layer <= fb->zsbuf->u.tex.first_layer + i)) - clearMask &= ~(SWR_ATTACHMENT_DEPTH_BIT | SWR_ATTACHMENT_STENCIL_BIT); - for (unsigned c = 0; c < fb->nr_cbufs; ++c) { - const struct pipe_surface *sf = fb->cbufs[c]; - if (sf && (sf->u.tex.last_layer <= sf->u.tex.first_layer + i)) - clearMask &= ~(SWR_ATTACHMENT_COLOR0_BIT << c); - } - } -} - -void -swr_clear_init(struct pipe_context *pipe) -{ - pipe->clear = swr_clear; -} diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp deleted file mode 100644 index 08637dba1d5..00000000000 --- a/src/gallium/drivers/swr/swr_context.cpp +++ /dev/null @@ -1,595 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ***************************************************************************/ - -#include "swr_context.h" -#include "swr_memory.h" -#include "swr_screen.h" -#include "swr_resource.h" -#include "swr_scratch.h" -#include "swr_query.h" -#include "swr_fence.h" - -#include "util/u_memory.h" -#include "util/u_inlines.h" -#include "util/format/u_format.h" -#include "util/u_atomic.h" -#include "util/u_upload_mgr.h" -#include "util/u_transfer.h" -#include "util/u_surface.h" - -#include "api.h" -#include "backend.h" -#include "knobs.h" - -static struct pipe_surface * -swr_create_surface(struct pipe_context *pipe, - struct pipe_resource *pt, - const struct pipe_surface *surf_tmpl) -{ - struct pipe_surface *ps; - - ps = CALLOC_STRUCT(pipe_surface); - if (ps) { - pipe_reference_init(&ps->reference, 1); - pipe_resource_reference(&ps->texture, pt); - ps->context = pipe; - ps->format = surf_tmpl->format; - if (pt->target != PIPE_BUFFER) { - assert(surf_tmpl->u.tex.level <= pt->last_level); - ps->width = u_minify(pt->width0, surf_tmpl->u.tex.level); - ps->height = u_minify(pt->height0, surf_tmpl->u.tex.level); - ps->u.tex.level = surf_tmpl->u.tex.level; - ps->u.tex.first_layer = surf_tmpl->u.tex.first_layer; - ps->u.tex.last_layer = surf_tmpl->u.tex.last_layer; - } else { - /* setting width as number of elements should get us correct - * renderbuffer width */ - ps->width = surf_tmpl->u.buf.last_element - - surf_tmpl->u.buf.first_element + 1; - ps->height = pt->height0; - ps->u.buf.first_element = surf_tmpl->u.buf.first_element; - ps->u.buf.last_element = surf_tmpl->u.buf.last_element; - assert(ps->u.buf.first_element <= ps->u.buf.last_element); - assert(ps->u.buf.last_element < ps->width); - } - } - return ps; -} - -static void -swr_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surf) -{ - assert(surf->texture); - struct pipe_resource *resource = surf->texture; - - /* If the resource has been drawn to, store tiles. 
*/ - swr_store_dirty_resource(pipe, resource, SWR_TILE_RESOLVED); - - pipe_resource_reference(&resource, NULL); - FREE(surf); -} - - -static void * -swr_transfer_map(struct pipe_context *pipe, - struct pipe_resource *resource, - unsigned level, - unsigned usage, - const struct pipe_box *box, - struct pipe_transfer **transfer) -{ - struct swr_screen *screen = swr_screen(pipe->screen); - struct swr_resource *spr = swr_resource(resource); - struct pipe_transfer *pt; - enum pipe_format format = resource->format; - - assert(resource); - assert(level <= resource->last_level); - - /* If mapping an attached rendertarget, store tiles to surface and set - * postStoreTileState to SWR_TILE_INVALID so tiles get reloaded on next use - * and nothing needs to be done at unmap. */ - swr_store_dirty_resource(pipe, resource, SWR_TILE_INVALID); - - if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) { - /* If resource is in use, finish fence before mapping. - * Unless requested not to block, then if not done return NULL map */ - if (usage & PIPE_MAP_DONTBLOCK) { - if (swr_is_fence_pending(screen->flush_fence)) - return NULL; - } else { - if (spr->status) { - /* But, if there's no fence pending, submit one. - * XXX: Remove once draw timestamps are finished. 
*/ - if (!swr_is_fence_pending(screen->flush_fence)) - swr_fence_submit(swr_context(pipe), screen->flush_fence); - - swr_fence_finish(pipe->screen, NULL, screen->flush_fence, 0); - swr_resource_unused(resource); - } - } - } - - pt = CALLOC_STRUCT(pipe_transfer); - if (!pt) - return NULL; - pipe_resource_reference(&pt->resource, resource); - pt->usage = (pipe_map_flags)usage; - pt->level = level; - pt->box = *box; - pt->stride = spr->swr.pitch; - pt->layer_stride = spr->swr.qpitch * spr->swr.pitch; - - /* if we're mapping the depth/stencil, copy in stencil for the section - * being read in - */ - if (usage & PIPE_MAP_READ && spr->has_depth && spr->has_stencil) { - size_t zbase, sbase; - for (int z = box->z; z < box->z + box->depth; z++) { - zbase = (z * spr->swr.qpitch + box->y) * spr->swr.pitch + - spr->mip_offsets[level]; - sbase = (z * spr->secondary.qpitch + box->y) * spr->secondary.pitch + - spr->secondary_mip_offsets[level]; - for (int y = box->y; y < box->y + box->height; y++) { - if (spr->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT) { - for (int x = box->x; x < box->x + box->width; x++) - ((uint8_t*)(spr->swr.xpBaseAddress))[zbase + 4 * x + 3] = - ((uint8_t*)(spr->secondary.xpBaseAddress))[sbase + x]; - } else if (spr->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { - for (int x = box->x; x < box->x + box->width; x++) - ((uint8_t*)(spr->swr.xpBaseAddress))[zbase + 8 * x + 4] = - ((uint8_t*)(spr->secondary.xpBaseAddress))[sbase + x]; - } - zbase += spr->swr.pitch; - sbase += spr->secondary.pitch; - } - } - } - - unsigned offset = box->z * pt->layer_stride + - util_format_get_nblocksy(format, box->y) * pt->stride + - util_format_get_stride(format, box->x); - - *transfer = pt; - - return (void*)(spr->swr.xpBaseAddress + offset + spr->mip_offsets[level]); -} - -static void -swr_transfer_flush_region(struct pipe_context *pipe, - struct pipe_transfer *transfer, - const struct pipe_box *flush_box) -{ - assert(transfer->resource); - assert(transfer->usage & 
PIPE_MAP_WRITE); - - struct swr_resource *spr = swr_resource(transfer->resource); - if (!spr->has_depth || !spr->has_stencil) - return; - - size_t zbase, sbase; - struct pipe_box box = *flush_box; - box.x += transfer->box.x; - box.y += transfer->box.y; - box.z += transfer->box.z; - for (int z = box.z; z < box.z + box.depth; z++) { - zbase = (z * spr->swr.qpitch + box.y) * spr->swr.pitch + - spr->mip_offsets[transfer->level]; - sbase = (z * spr->secondary.qpitch + box.y) * spr->secondary.pitch + - spr->secondary_mip_offsets[transfer->level]; - for (int y = box.y; y < box.y + box.height; y++) { - if (spr->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT) { - for (int x = box.x; x < box.x + box.width; x++) - ((uint8_t*)(spr->secondary.xpBaseAddress))[sbase + x] = - ((uint8_t*)(spr->swr.xpBaseAddress))[zbase + 4 * x + 3]; - } else if (spr->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { - for (int x = box.x; x < box.x + box.width; x++) - ((uint8_t*)(spr->secondary.xpBaseAddress))[sbase + x] = - ((uint8_t*)(spr->swr.xpBaseAddress))[zbase + 8 * x + 4]; - } - zbase += spr->swr.pitch; - sbase += spr->secondary.pitch; - } - } -} - -static void -swr_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *transfer) -{ - assert(transfer->resource); - - struct swr_resource *spr = swr_resource(transfer->resource); - /* if we're mapping the depth/stencil, copy in stencil for the section - * being written out - */ - if (transfer->usage & PIPE_MAP_WRITE && - !(transfer->usage & PIPE_MAP_FLUSH_EXPLICIT) && - spr->has_depth && spr->has_stencil) { - struct pipe_box box; - u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, - transfer->box.depth, &box); - swr_transfer_flush_region(pipe, transfer, &box); - } - - pipe_resource_reference(&transfer->resource, NULL); - FREE(transfer); -} - - -static void -swr_resource_copy(struct pipe_context *pipe, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, - unsigned dsty, - unsigned dstz, - struct pipe_resource 
*src, - unsigned src_level, - const struct pipe_box *src_box) -{ - struct swr_screen *screen = swr_screen(pipe->screen); - - /* If either the src or dst is a renderTarget, store tiles before copy */ - swr_store_dirty_resource(pipe, src, SWR_TILE_RESOLVED); - swr_store_dirty_resource(pipe, dst, SWR_TILE_RESOLVED); - - swr_fence_finish(pipe->screen, NULL, screen->flush_fence, 0); - swr_resource_unused(src); - swr_resource_unused(dst); - - if ((dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) - || (dst->target != PIPE_BUFFER && src->target != PIPE_BUFFER)) { - util_resource_copy_region( - pipe, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box); - return; - } - - debug_printf("unhandled swr_resource_copy\n"); -} - - -static void -swr_blit(struct pipe_context *pipe, const struct pipe_blit_info *blit_info) -{ - struct swr_context *ctx = swr_context(pipe); - /* Make a copy of the const blit_info, so we can modify it */ - struct pipe_blit_info info = *blit_info; - - if (info.render_condition_enable && !swr_check_render_cond(pipe)) - return; - - if (info.src.resource->nr_samples > 1 && info.dst.resource->nr_samples <= 1 - && !util_format_is_depth_or_stencil(info.src.resource->format) - && !util_format_is_pure_integer(info.src.resource->format)) { - debug_printf("swr_blit: color resolve : %d -> %d\n", - info.src.resource->nr_samples, info.dst.resource->nr_samples); - - /* Resolve is done as part of the surface store. */ - swr_store_dirty_resource(pipe, info.src.resource, SWR_TILE_RESOLVED); - - struct pipe_resource *src_resource = info.src.resource; - struct pipe_resource *resolve_target = - swr_resource(src_resource)->resolve_target; - - /* The resolve target becomes the new source for the blit. 
*/ - info.src.resource = resolve_target; - } - - if (util_try_blit_via_copy_region(pipe, &info, ctx->render_cond_query != NULL)) { - return; /* done */ - } - - if (info.mask & PIPE_MASK_S) { - debug_printf("swr: cannot blit stencil, skipping\n"); - info.mask &= ~PIPE_MASK_S; - } - - if (!util_blitter_is_blit_supported(ctx->blitter, &info)) { - debug_printf("swr: blit unsupported %s -> %s\n", - util_format_short_name(info.src.resource->format), - util_format_short_name(info.dst.resource->format)); - return; - } - - if (ctx->active_queries) { - ctx->api.pfnSwrEnableStatsFE(ctx->swrContext, FALSE); - ctx->api.pfnSwrEnableStatsBE(ctx->swrContext, FALSE); - } - - util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vertex_buffer); - util_blitter_save_vertex_elements(ctx->blitter, (void *)ctx->velems); - util_blitter_save_vertex_shader(ctx->blitter, (void *)ctx->vs); - util_blitter_save_geometry_shader(ctx->blitter, (void*)ctx->gs); - util_blitter_save_tessctrl_shader(ctx->blitter, (void*)ctx->tcs); - util_blitter_save_tesseval_shader(ctx->blitter, (void*)ctx->tes); - util_blitter_save_so_targets( - ctx->blitter, - ctx->num_so_targets, - (struct pipe_stream_output_target **)ctx->so_targets); - util_blitter_save_rasterizer(ctx->blitter, (void *)ctx->rasterizer); - util_blitter_save_viewport(ctx->blitter, &ctx->viewports[0]); - util_blitter_save_scissor(ctx->blitter, &ctx->scissors[0]); - util_blitter_save_fragment_shader(ctx->blitter, ctx->fs); - util_blitter_save_blend(ctx->blitter, (void *)ctx->blend); - util_blitter_save_depth_stencil_alpha(ctx->blitter, - (void *)ctx->depth_stencil); - util_blitter_save_stencil_ref(ctx->blitter, &ctx->stencil_ref); - util_blitter_save_sample_mask(ctx->blitter, ctx->sample_mask, 0); - util_blitter_save_framebuffer(ctx->blitter, &ctx->framebuffer); - util_blitter_save_fragment_sampler_states( - ctx->blitter, - ctx->num_samplers[PIPE_SHADER_FRAGMENT], - (void **)ctx->samplers[PIPE_SHADER_FRAGMENT]); - 
util_blitter_save_fragment_sampler_views( - ctx->blitter, - ctx->num_sampler_views[PIPE_SHADER_FRAGMENT], - ctx->sampler_views[PIPE_SHADER_FRAGMENT]); - util_blitter_save_render_condition(ctx->blitter, - ctx->render_cond_query, - ctx->render_cond_cond, - ctx->render_cond_mode); - - util_blitter_blit(ctx->blitter, &info); - - if (ctx->active_queries) { - ctx->api.pfnSwrEnableStatsFE(ctx->swrContext, TRUE); - ctx->api.pfnSwrEnableStatsBE(ctx->swrContext, TRUE); - } -} - - -static void -swr_destroy(struct pipe_context *pipe) -{ - struct swr_context *ctx = swr_context(pipe); - struct swr_screen *screen = swr_screen(pipe->screen); - - if (ctx->blitter) - util_blitter_destroy(ctx->blitter); - - for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { - if (ctx->framebuffer.cbufs[i]) { - struct swr_resource *res = swr_resource(ctx->framebuffer.cbufs[i]->texture); - /* NULL curr_pipe, so we don't have a reference to a deleted pipe */ - res->curr_pipe = NULL; - pipe_surface_reference(&ctx->framebuffer.cbufs[i], NULL); - } - } - - if (ctx->framebuffer.zsbuf) { - struct swr_resource *res = swr_resource(ctx->framebuffer.zsbuf->texture); - /* NULL curr_pipe, so we don't have a reference to a deleted pipe */ - res->curr_pipe = NULL; - pipe_surface_reference(&ctx->framebuffer.zsbuf, NULL); - } - - for (unsigned i = 0; i < ARRAY_SIZE(ctx->sampler_views[0]); i++) { - pipe_sampler_view_reference(&ctx->sampler_views[PIPE_SHADER_FRAGMENT][i], NULL); - } - - for (unsigned i = 0; i < ARRAY_SIZE(ctx->sampler_views[0]); i++) { - pipe_sampler_view_reference(&ctx->sampler_views[PIPE_SHADER_VERTEX][i], NULL); - } - - if (ctx->pipe.stream_uploader) - u_upload_destroy(ctx->pipe.stream_uploader); - - /* Idle core after destroying buffer resources, but before deleting - * context. 
Destroying resources has potentially called StoreTiles.*/ - ctx->api.pfnSwrWaitForIdle(ctx->swrContext); - - if (ctx->swrContext) - ctx->api.pfnSwrDestroyContext(ctx->swrContext); - - delete ctx->blendJIT; - - swr_destroy_scratch_buffers(ctx); - - - /* Only update screen->pipe if current context is being destroyed */ - assert(screen); - if (screen->pipe == pipe) - screen->pipe = NULL; - - AlignedFree(ctx); -} - - -static void -swr_render_condition(struct pipe_context *pipe, - struct pipe_query *query, - bool condition, - enum pipe_render_cond_flag mode) -{ - struct swr_context *ctx = swr_context(pipe); - - ctx->render_cond_query = query; - ctx->render_cond_mode = mode; - ctx->render_cond_cond = condition; -} - - -static void -swr_flush_resource(struct pipe_context *ctx, struct pipe_resource *resource) -{ - // NOOP -} - -static void -swr_UpdateStats(HANDLE hPrivateContext, const SWR_STATS *pStats) -{ - swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; - - if (!pDC) - return; - - struct swr_query_result *pqr = pDC->pStats; - - SWR_STATS *pSwrStats = &pqr->core; - - pSwrStats->DepthPassCount += pStats->DepthPassCount; - pSwrStats->PsInvocations += pStats->PsInvocations; - pSwrStats->CsInvocations += pStats->CsInvocations; -} - -static void -swr_UpdateStatsFE(HANDLE hPrivateContext, const SWR_STATS_FE *pStats) -{ - swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; - - if (!pDC) - return; - - struct swr_query_result *pqr = pDC->pStats; - - SWR_STATS_FE *pSwrStats = &pqr->coreFE; - p_atomic_add(&pSwrStats->IaVertices, pStats->IaVertices); - p_atomic_add(&pSwrStats->IaPrimitives, pStats->IaPrimitives); - p_atomic_add(&pSwrStats->VsInvocations, pStats->VsInvocations); - p_atomic_add(&pSwrStats->HsInvocations, pStats->HsInvocations); - p_atomic_add(&pSwrStats->DsInvocations, pStats->DsInvocations); - p_atomic_add(&pSwrStats->GsInvocations, pStats->GsInvocations); - p_atomic_add(&pSwrStats->CInvocations, pStats->CInvocations); - 
p_atomic_add(&pSwrStats->CPrimitives, pStats->CPrimitives); - p_atomic_add(&pSwrStats->GsPrimitives, pStats->GsPrimitives); - - for (unsigned i = 0; i < 4; i++) { - p_atomic_add(&pSwrStats->SoPrimStorageNeeded[i], - pStats->SoPrimStorageNeeded[i]); - p_atomic_add(&pSwrStats->SoNumPrimsWritten[i], - pStats->SoNumPrimsWritten[i]); - } -} - -static void -swr_UpdateStreamOut(HANDLE hPrivateContext, uint64_t numPrims) -{ - swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; - - if (!pDC) - return; - - if (pDC->soPrims) - *pDC->soPrims += numPrims; -} - -struct pipe_context * -swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags) -{ - struct swr_context *ctx = (struct swr_context *) - AlignedMalloc(sizeof(struct swr_context), KNOB_SIMD_BYTES); - memset((void*)ctx, 0, sizeof(struct swr_context)); - - swr_screen(p_screen)->pfnSwrGetInterface(ctx->api); - swr_screen(p_screen)->pfnSwrGetTileInterface(ctx->tileApi); - ctx->swrDC.pAPI = &ctx->api; - ctx->swrDC.pTileAPI = &ctx->tileApi; - - ctx->blendJIT = - new std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC>; - - ctx->max_draws_in_flight = KNOB_MAX_DRAWS_IN_FLIGHT; - - SWR_CREATECONTEXT_INFO createInfo {0}; - - createInfo.privateStateSize = sizeof(swr_draw_context); - createInfo.pfnLoadTile = swr_LoadHotTile; - createInfo.pfnStoreTile = swr_StoreHotTile; - createInfo.pfnUpdateStats = swr_UpdateStats; - createInfo.pfnUpdateStatsFE = swr_UpdateStatsFE; - createInfo.pfnUpdateStreamOut = swr_UpdateStreamOut; - createInfo.pfnMakeGfxPtr = swr_MakeGfxPtr; - - SWR_THREADING_INFO threadingInfo {0}; - - threadingInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS; - threadingInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES; - threadingInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE; - threadingInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE; - threadingInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED; - - // Use non-standard settings for KNL - if (swr_screen(p_screen)->is_knl) - { - if 
(nullptr == getenv("KNOB_MAX_THREADS_PER_CORE")) - threadingInfo.MAX_THREADS_PER_CORE = 2; - - if (nullptr == getenv("KNOB_MAX_DRAWS_IN_FLIGHT")) - { - ctx->max_draws_in_flight = 2048; - createInfo.MAX_DRAWS_IN_FLIGHT = ctx->max_draws_in_flight; - } - } - - createInfo.pThreadInfo = &threadingInfo; - - ctx->swrContext = ctx->api.pfnSwrCreateContext(&createInfo); - - ctx->api.pfnSwrInit(); - - if (ctx->swrContext == NULL) - goto fail; - - ctx->pipe.screen = p_screen; - ctx->pipe.destroy = swr_destroy; - ctx->pipe.priv = priv; - ctx->pipe.create_surface = swr_create_surface; - ctx->pipe.surface_destroy = swr_surface_destroy; - ctx->pipe.buffer_map = swr_transfer_map; - ctx->pipe.buffer_unmap = swr_transfer_unmap; - ctx->pipe.texture_map = swr_transfer_map; - ctx->pipe.texture_unmap = swr_transfer_unmap; - ctx->pipe.transfer_flush_region = swr_transfer_flush_region; - - ctx->pipe.buffer_subdata = u_default_buffer_subdata; - ctx->pipe.texture_subdata = u_default_texture_subdata; - - ctx->pipe.clear_texture = util_clear_texture; - ctx->pipe.resource_copy_region = swr_resource_copy; - ctx->pipe.flush_resource = swr_flush_resource; - ctx->pipe.render_condition = swr_render_condition; - - swr_state_init(&ctx->pipe); - swr_clear_init(&ctx->pipe); - swr_draw_init(&ctx->pipe); - swr_query_init(&ctx->pipe); - - ctx->pipe.stream_uploader = u_upload_create_default(&ctx->pipe); - if (!ctx->pipe.stream_uploader) - goto fail; - ctx->pipe.const_uploader = ctx->pipe.stream_uploader; - - ctx->pipe.blit = swr_blit; - ctx->blitter = util_blitter_create(&ctx->pipe); - if (!ctx->blitter) - goto fail; - - swr_init_scratch_buffers(ctx); - - return &ctx->pipe; - -fail: - /* Should really validate the init steps and fail gracefully */ - swr_destroy(&ctx->pipe); - return NULL; -} diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h deleted file mode 100644 index 11578764c23..00000000000 --- a/src/gallium/drivers/swr/swr_context.h +++ /dev/null @@ -1,236 
+0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ***************************************************************************/ - -#ifndef SWR_CONTEXT_H -#define SWR_CONTEXT_H - -#include "common/os.h" - -#include "pipe/p_context.h" -#include "pipe/p_state.h" -#include "util/u_blitter.h" -#include "rasterizer/memory/SurfaceState.h" -#include "rasterizer/memory/InitMemory.h" -#include "jit_api.h" -#include "swr_state.h" -#include <unordered_map> - -#define SWR_NEW_BLEND (1 << 0) -#define SWR_NEW_RASTERIZER (1 << 1) -#define SWR_NEW_DEPTH_STENCIL_ALPHA (1 << 2) -#define SWR_NEW_SAMPLER (1 << 3) -#define SWR_NEW_SAMPLER_VIEW (1 << 4) -#define SWR_NEW_VS (1 << 5) -#define SWR_NEW_FS (1 << 6) -#define SWR_NEW_GS (1 << 7) -#define SWR_NEW_VSCONSTANTS (1 << 8) -#define SWR_NEW_FSCONSTANTS (1 << 9) -#define SWR_NEW_GSCONSTANTS (1 << 10) -#define SWR_NEW_VERTEX (1 << 11) -#define SWR_NEW_STIPPLE (1 << 12) -#define SWR_NEW_SCISSOR (1 << 13) -#define SWR_NEW_VIEWPORT (1 << 14) -#define SWR_NEW_FRAMEBUFFER (1 << 15) -#define SWR_NEW_CLIP (1 << 16) -#define SWR_NEW_SO (1 << 17) -#define SWR_BLOCK_CLIENT_DRAW ( 1 << 18) // Indicates client draw will block -#define SWR_NEW_TCS (1 << 19) -#define SWR_NEW_TES (1 << 20) -#define SWR_NEW_TS (1 << 21) -#define SWR_NEW_TCSCONSTANTS (1 << 22) -#define SWR_NEW_TESCONSTANTS (1 << 23) - -namespace std -{ -template <> struct hash<BLEND_COMPILE_STATE> { - std::size_t operator()(const BLEND_COMPILE_STATE &k) const - { - return util_hash_crc32(&k, sizeof(k)); - } -}; -}; - -struct swr_jit_texture { - uint32_t width; // same as number of elements - uint32_t height; - uint32_t depth; // doubles as array size - uint32_t first_level; - uint32_t last_level; - const uint8_t *base_ptr; - uint32_t num_samples; - uint32_t sample_stride; - uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS]; - uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS]; - uint32_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; -}; - -struct swr_jit_sampler { - float min_lod; - float max_lod; - float lod_bias; - float border_color[4]; -}; - -struct 
swr_draw_context { - const float *constantVS[PIPE_MAX_CONSTANT_BUFFERS]; - uint32_t num_constantsVS[PIPE_MAX_CONSTANT_BUFFERS]; - const float *constantFS[PIPE_MAX_CONSTANT_BUFFERS]; - uint32_t num_constantsFS[PIPE_MAX_CONSTANT_BUFFERS]; - const float *constantGS[PIPE_MAX_CONSTANT_BUFFERS]; - uint32_t num_constantsGS[PIPE_MAX_CONSTANT_BUFFERS]; - const float *constantTCS[PIPE_MAX_CONSTANT_BUFFERS]; - uint32_t num_constantsTCS[PIPE_MAX_CONSTANT_BUFFERS]; - const float *constantTES[PIPE_MAX_CONSTANT_BUFFERS]; - uint32_t num_constantsTES[PIPE_MAX_CONSTANT_BUFFERS]; - - swr_jit_texture texturesVS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; - swr_jit_sampler samplersVS[PIPE_MAX_SAMPLERS]; - swr_jit_texture texturesFS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; - swr_jit_sampler samplersFS[PIPE_MAX_SAMPLERS]; - swr_jit_texture texturesGS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; - swr_jit_sampler samplersGS[PIPE_MAX_SAMPLERS]; - swr_jit_texture texturesTCS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; - swr_jit_sampler samplersTCS[PIPE_MAX_SAMPLERS]; - swr_jit_texture texturesTES[PIPE_MAX_SHADER_SAMPLER_VIEWS]; - swr_jit_sampler samplersTES[PIPE_MAX_SAMPLERS]; - - float userClipPlanes[PIPE_MAX_CLIP_PLANES][4]; - - uint32_t polyStipple[32]; - - SWR_SURFACE_STATE renderTargets[SWR_NUM_ATTACHMENTS]; - struct swr_query_result *pStats; // @llvm_struct - SWR_INTERFACE *pAPI; // @llvm_struct - Needed for the swr_memory callbacks - SWR_TILE_INTERFACE *pTileAPI; // @llvm_struct - Needed for the swr_memory callbacks - - uint64_t* soPrims; //number of primitives written to StreamOut buffer -}; - -/* gen_llvm_types FINI */ - -struct swr_context { - struct pipe_context pipe; /**< base class */ - - HANDLE swrContext; - - SWR_TS_STATE tsState; - - /** Constant state objects */ - struct swr_blend_state *blend; - struct pipe_sampler_state *samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; - struct pipe_depth_stencil_alpha_state *depth_stencil; - struct pipe_rasterizer_state *rasterizer; - - struct swr_vertex_shader *vs; - struct 
swr_fragment_shader *fs; - struct swr_geometry_shader *gs; - struct swr_tess_control_shader *tcs; - struct swr_tess_evaluation_shader *tes; - struct swr_vertex_element_state *velems; - - /** Other rendering state */ - struct pipe_blend_color blend_color; - struct pipe_stencil_ref stencil_ref; - struct pipe_clip_state clip; - struct pipe_constant_buffer - constants[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS]; - struct pipe_framebuffer_state framebuffer; - struct swr_poly_stipple poly_stipple; - struct pipe_scissor_state scissors[KNOB_NUM_VIEWPORTS_SCISSORS]; - SWR_RECT swr_scissors[KNOB_NUM_VIEWPORTS_SCISSORS]; - struct pipe_sampler_view * - sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; - - struct pipe_viewport_state viewports[KNOB_NUM_VIEWPORTS_SCISSORS]; - struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; - - struct blitter_context *blitter; - - /** Conditional query object and mode */ - struct pipe_query *render_cond_query; - enum pipe_render_cond_flag render_cond_mode; - bool render_cond_cond; - unsigned active_queries; - - unsigned num_vertex_buffers; - unsigned num_samplers[PIPE_SHADER_TYPES]; - unsigned num_sampler_views[PIPE_SHADER_TYPES]; - - unsigned sample_mask; - - // streamout - pipe_stream_output_target *so_targets[MAX_SO_STREAMS]; - uint32_t num_so_targets; - uint64_t so_primCounter; // number of primitives written to StreamOut buffer - - /* Temp storage for user_buffer constants */ - struct swr_scratch_buffers *scratch; - - // blend jit functions - std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC> *blendJIT; - - /* Derived SWR API DrawState */ - struct swr_derived_state derived; - - /* SWR private state - draw context */ - struct swr_draw_context swrDC; - - unsigned dirty; /**< Mask of SWR_NEW_x flags */ - - SWR_INTERFACE api; - SWR_TILE_INTERFACE tileApi; - - uint32_t max_draws_in_flight; - uint8_t patch_vertices; -}; - -static INLINE struct swr_context * -swr_context(struct pipe_context *pipe) -{ - return 
(struct swr_context *)pipe; -} - -static INLINE void -swr_update_draw_context(struct swr_context *ctx, - struct swr_query_result *pqr = nullptr) -{ - swr_draw_context *pDC = - (swr_draw_context *)ctx->api.pfnSwrGetPrivateContextState(ctx->swrContext); - if (pqr) - ctx->swrDC.pStats = pqr; - memcpy(pDC, &ctx->swrDC, sizeof(swr_draw_context)); -} - -struct pipe_context *swr_create_context(struct pipe_screen *, void *priv, unsigned flags); - -void swr_state_init(struct pipe_context *pipe); - -void swr_clear_init(struct pipe_context *pipe); - -void swr_draw_init(struct pipe_context *pipe); - -void swr_finish(struct pipe_context *pipe); - -void swr_do_msaa_resolve(struct pipe_resource *src_resource, - struct pipe_resource *dst_resource); -#endif diff --git a/src/gallium/drivers/swr/swr_draw.cpp b/src/gallium/drivers/swr/swr_draw.cpp deleted file mode 100644 index 4b42a8e0390..00000000000 --- a/src/gallium/drivers/swr/swr_draw.cpp +++ /dev/null @@ -1,399 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ***************************************************************************/ - -#include "swr_screen.h" -#include "swr_context.h" -#include "swr_resource.h" -#include "swr_fence.h" -#include "swr_query.h" -#include "jit_api.h" - -#include "util/u_draw.h" -#include "util/u_prim.h" - -#include <algorithm> -#include <iostream> -/* - * Draw vertex arrays, with optional indexing, optional instancing. - */ -static void -swr_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info, - unsigned drawid_offset, - const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws) -{ - if (num_draws > 1) { - struct pipe_draw_info tmp_info = *info; - unsigned drawid = drawid_offset; - - for (unsigned i = 0; i < num_draws; i++) { - swr_draw_vbo(pipe, &tmp_info, drawid, indirect, &draws[i], 1); - if (tmp_info.increment_draw_id) - drawid++; - } - return; - } - - if (!indirect && (!draws[0].count || !info->instance_count)) - return; - - struct swr_context *ctx = swr_context(pipe); - - if (!indirect && - !info->primitive_restart && - !u_trim_pipe_prim(info->mode, (unsigned*)&draws[0].count)) - return; - - if (!swr_check_render_cond(pipe)) - return; - - if (indirect && indirect->buffer) { - util_draw_indirect(pipe, info, indirect); - return; - } - - /* If indexed draw, force vertex validation since index buffer comes - * from draw info. */ - if (info->index_size) - ctx->dirty |= SWR_NEW_VERTEX; - - /* Update derived state, pass draw info to update function. 
*/ - swr_update_derived(pipe, info, draws); - - swr_update_draw_context(ctx); - - struct pipe_draw_info resolved_info; - struct pipe_draw_start_count_bias resolved_draw; - /* DrawTransformFeedback */ - if (indirect && indirect->count_from_stream_output) { - // trick copied from softpipe to modify const struct *info - memcpy(&resolved_info, (void*)info, sizeof(struct pipe_draw_info)); - resolved_draw.start = draws[0].start; - resolved_draw.count = ctx->so_primCounter * ctx->patch_vertices; - resolved_info.max_index = resolved_draw.count - 1; - info = &resolved_info; - indirect = NULL; - draws = &resolved_draw; - } - - if (ctx->vs->pipe.stream_output.num_outputs) { - if (!ctx->vs->soFunc[info->mode]) { - STREAMOUT_COMPILE_STATE state = {0}; - struct pipe_stream_output_info *so = &ctx->vs->pipe.stream_output; - - state.numVertsPerPrim = u_vertices_per_prim(info->mode); - - uint32_t offsets[MAX_SO_STREAMS] = {0}; - uint32_t num = 0; - - for (uint32_t i = 0; i < so->num_outputs; i++) { - assert(so->output[i].stream == 0); // @todo - uint32_t output_buffer = so->output[i].output_buffer; - if (so->output[i].dst_offset != offsets[output_buffer]) { - // hole - need to fill - state.stream.decl[num].bufferIndex = output_buffer; - state.stream.decl[num].hole = true; - state.stream.decl[num].componentMask = - (1 << (so->output[i].dst_offset - offsets[output_buffer])) - - 1; - num++; - offsets[output_buffer] = so->output[i].dst_offset; - } - - unsigned attrib_slot = so->output[i].register_index; - attrib_slot = swr_so_adjust_attrib(attrib_slot, ctx->vs); - - state.stream.decl[num].bufferIndex = output_buffer; - state.stream.decl[num].attribSlot = attrib_slot; - state.stream.decl[num].componentMask = - ((1 << so->output[i].num_components) - 1) - << so->output[i].start_component; - state.stream.decl[num].hole = false; - num++; - - offsets[output_buffer] += so->output[i].num_components; - } - - state.stream.numDecls = num; - - HANDLE hJitMgr = swr_screen(pipe->screen)->hJitMgr; - 
ctx->vs->soFunc[info->mode] = JitCompileStreamout(hJitMgr, state); - debug_printf("so shader %p\n", ctx->vs->soFunc[info->mode]); - assert(ctx->vs->soFunc[info->mode] && "Error: SoShader = NULL"); - } - - ctx->api.pfnSwrSetSoFunc(ctx->swrContext, ctx->vs->soFunc[info->mode], 0); - } - - struct swr_vertex_element_state *velems = ctx->velems; - if (info->primitive_restart) - velems->fsState.cutIndex = info->restart_index; - else - velems->fsState.cutIndex = 0; - velems->fsState.bEnableCutIndex = info->primitive_restart; - velems->fsState.bPartialVertexBuffer = (info->index_bounds_valid && info->min_index > 0); - - swr_jit_fetch_key key; - swr_generate_fetch_key(key, velems); - auto search = velems->map.find(key); - if (search != velems->map.end()) { - velems->fsFunc = search->second; - } else { - HANDLE hJitMgr = swr_screen(ctx->pipe.screen)->hJitMgr; - velems->fsFunc = JitCompileFetch(hJitMgr, velems->fsState); - - debug_printf("fetch shader %p\n", velems->fsFunc); - assert(velems->fsFunc && "Error: FetchShader = NULL"); - - velems->map.insert(std::make_pair(key, velems->fsFunc)); - } - - ctx->api.pfnSwrSetFetchFunc(ctx->swrContext, velems->fsFunc); - - /* Set up frontend state - * XXX setup provokingVertex & topologyProvokingVertex */ - SWR_FRONTEND_STATE feState = {0}; - - // feState.vsVertexSize seeds the PA size that is used as an interface - // between all the shader stages, so it has to be large enough to - // incorporate all interfaces between stages - - // max of frontend shaders num_outputs - feState.vsVertexSize = ctx->vs->info.base.num_outputs; - if (ctx->gs) { - feState.vsVertexSize = std::max(feState.vsVertexSize, (uint32_t)ctx->gs->info.base.num_outputs); - } - if (ctx->tcs) { - feState.vsVertexSize = std::max(feState.vsVertexSize, (uint32_t)ctx->tcs->info.base.num_outputs); - } - if (ctx->tes) { - feState.vsVertexSize = std::max(feState.vsVertexSize, (uint32_t)ctx->tes->info.base.num_outputs); - } - - - if (ctx->vs->info.base.num_outputs) { - // gs 
does not adjust for position in SGV slot at input from vs - if (!ctx->gs && !ctx->tcs && !ctx->tes) - feState.vsVertexSize--; - } - - // other (non-SGV) slots start at VERTEX_ATTRIB_START_SLOT - feState.vsVertexSize += VERTEX_ATTRIB_START_SLOT; - - // The PA in the clipper does not handle BE vertex sizes - // different from FE. Increase vertexsize only for the cases that needed it - - // primid needs a slot - if (ctx->fs->info.base.uses_primid) - feState.vsVertexSize++; - // sprite coord enable - if (ctx->rasterizer->sprite_coord_enable) - feState.vsVertexSize++; - - if (ctx->rasterizer->flatshade_first) { - feState.provokingVertex = {1, 0, 0}; - } else { - feState.provokingVertex = {2, 1, 2}; - } - - enum pipe_prim_type topology; - if (ctx->gs) - topology = (pipe_prim_type)ctx->gs->info.base.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]; - else - topology = info->mode; - - switch (topology) { - case PIPE_PRIM_TRIANGLE_FAN: - feState.topologyProvokingVertex = feState.provokingVertex.triFan; - break; - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLES: - feState.topologyProvokingVertex = feState.provokingVertex.triStripList; - break; - case PIPE_PRIM_QUAD_STRIP: - case PIPE_PRIM_QUADS: - if (ctx->rasterizer->flatshade_first) - feState.topologyProvokingVertex = 0; - else - feState.topologyProvokingVertex = 3; - break; - case PIPE_PRIM_LINES: - case PIPE_PRIM_LINE_LOOP: - case PIPE_PRIM_LINE_STRIP: - feState.topologyProvokingVertex = feState.provokingVertex.lineStripList; - break; - default: - feState.topologyProvokingVertex = 0; - } - - feState.bEnableCutIndex = info->primitive_restart; - ctx->api.pfnSwrSetFrontendState(ctx->swrContext, &feState); - - if (info->index_size) - ctx->api.pfnSwrDrawIndexedInstanced(ctx->swrContext, - swr_convert_prim_topology(info->mode, ctx->patch_vertices), - draws[0].count, - info->instance_count, - draws[0].start, - draws->index_bias, - info->start_instance); - else - ctx->api.pfnSwrDrawInstanced(ctx->swrContext, - 
swr_convert_prim_topology(info->mode, ctx->patch_vertices), - draws[0].count, - info->instance_count, - draws[0].start, - info->start_instance); - - /* On client-buffer draw, we used client buffer directly, without - * copy. Block until draw is finished. - * VMD is an example application that benefits from this. */ - if (ctx->dirty & SWR_BLOCK_CLIENT_DRAW) { - struct swr_screen *screen = swr_screen(pipe->screen); - swr_fence_submit(ctx, screen->flush_fence); - swr_fence_finish(pipe->screen, NULL, screen->flush_fence, 0); - } -} - - -static void -swr_flush(struct pipe_context *pipe, - struct pipe_fence_handle **fence, - unsigned flags) -{ - struct swr_context *ctx = swr_context(pipe); - struct swr_screen *screen = swr_screen(pipe->screen); - - for (int i=0; i < ctx->framebuffer.nr_cbufs; i++) { - struct pipe_surface *cb = ctx->framebuffer.cbufs[i]; - if (cb) { - swr_store_dirty_resource(pipe, cb->texture, SWR_TILE_RESOLVED); - } - } - if (ctx->framebuffer.zsbuf) { - swr_store_dirty_resource(pipe, ctx->framebuffer.zsbuf->texture, - SWR_TILE_RESOLVED); - } - - if (fence) - swr_fence_reference(pipe->screen, fence, screen->flush_fence); -} - -void -swr_finish(struct pipe_context *pipe) -{ - struct pipe_fence_handle *fence = nullptr; - - swr_flush(pipe, &fence, 0); - swr_fence_finish(pipe->screen, NULL, fence, 0); - swr_fence_reference(pipe->screen, &fence, NULL); -} - -/* - * Invalidate tiles so they can be reloaded back when needed - */ -void -swr_invalidate_render_target(struct pipe_context *pipe, - uint32_t attachment, - uint16_t width, uint16_t height) -{ - struct swr_context *ctx = swr_context(pipe); - - /* grab the rect from the passed in arguments */ - swr_update_draw_context(ctx); - SWR_RECT full_rect = - {0, 0, (int32_t)width, (int32_t)height}; - ctx->api.pfnSwrInvalidateTiles(ctx->swrContext, - 1 << attachment, - full_rect); -} - - -/* - * Store SWR HotTiles back to renderTarget surface. 
- */ -void -swr_store_render_target(struct pipe_context *pipe, - uint32_t attachment, - enum SWR_TILE_STATE post_tile_state) -{ - struct swr_context *ctx = swr_context(pipe); - struct swr_draw_context *pDC = &ctx->swrDC; - struct SWR_SURFACE_STATE *renderTarget = &pDC->renderTargets[attachment]; - - /* Only proceed if there's a valid surface to store to */ - if (renderTarget->xpBaseAddress) { - swr_update_draw_context(ctx); - SWR_RECT full_rect = - {0, 0, - (int32_t)u_minify(renderTarget->width, renderTarget->lod), - (int32_t)u_minify(renderTarget->height, renderTarget->lod)}; - ctx->api.pfnSwrStoreTiles(ctx->swrContext, - 1 << attachment, - post_tile_state, - full_rect); - } -} - -void -swr_store_dirty_resource(struct pipe_context *pipe, - struct pipe_resource *resource, - enum SWR_TILE_STATE post_tile_state) -{ - /* Only store resource if it has been written to */ - if (swr_resource(resource)->status & SWR_RESOURCE_WRITE) { - struct swr_context *ctx = swr_context(pipe); - struct swr_screen *screen = swr_screen(pipe->screen); - struct swr_resource *spr = swr_resource(resource); - - swr_draw_context *pDC = &ctx->swrDC; - SWR_SURFACE_STATE *renderTargets = pDC->renderTargets; - for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; i++) - if (renderTargets[i].xpBaseAddress == spr->swr.xpBaseAddress || - (spr->secondary.xpBaseAddress && - renderTargets[i].xpBaseAddress == spr->secondary.xpBaseAddress)) { - swr_store_render_target(pipe, i, post_tile_state); - - /* Mesa thinks depth/stencil are fused, so we'll never get an - * explicit resource for stencil. So, if checking depth, then - * also check for stencil. 
*/ - if (spr->has_stencil && (i == SWR_ATTACHMENT_DEPTH)) { - swr_store_render_target( - pipe, SWR_ATTACHMENT_STENCIL, post_tile_state); - } - - /* This fence signals StoreTiles completion */ - swr_fence_submit(ctx, screen->flush_fence); - - break; - } - } -} - -void -swr_draw_init(struct pipe_context *pipe) -{ - pipe->draw_vbo = swr_draw_vbo; - pipe->flush = swr_flush; -} diff --git a/src/gallium/drivers/swr/swr_fence.cpp b/src/gallium/drivers/swr/swr_fence.cpp deleted file mode 100644 index 4e2b2af874c..00000000000 --- a/src/gallium/drivers/swr/swr_fence.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ***************************************************************************/ - -#include "pipe/p_screen.h" -#include "util/u_memory.h" -#include "util/os_time.h" - -#include "swr_context.h" -#include "swr_screen.h" -#include "swr_fence.h" - -#ifdef __APPLE__ -#include <sched.h> -#endif - -#if defined(PIPE_CC_MSVC) // portable thread yield - #define sched_yield SwitchToThread -#endif - -/* - * Fence callback, called by back-end thread on completion of all rendering up - * to SwrSync call. - */ -static void -swr_fence_cb(uint64_t userData, uint64_t userData2, uint64_t userData3) -{ - struct swr_fence *fence = (struct swr_fence *)userData; - - /* Complete all work attached to the fence */ - swr_fence_do_work(fence); - - /* Correct value is in SwrSync data, and not the fence write field. */ - /* Contexts may not finish in order, but fence value always increases */ - if (fence->read < userData2) - fence->read = userData2; -} - -/* - * Submit an existing fence. - */ -void -swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fh) -{ - struct swr_fence *fence = swr_fence(fh); - - fence->write++; - fence->pending = TRUE; - ctx->api.pfnSwrSync(ctx->swrContext, swr_fence_cb, (uint64_t)fence, fence->write, 0); -} - -/* - * Create a new fence object. - */ -struct pipe_fence_handle * -swr_fence_create() -{ - static int fence_id = 0; - struct swr_fence *fence = CALLOC_STRUCT(swr_fence); - if (!fence) - return NULL; - - pipe_reference_init(&fence->reference, 1); - fence->id = fence_id++; - fence->work.tail = &fence->work.head; - - return (struct pipe_fence_handle *)fence; -} - -/** Destroy a fence. Called when refcount hits zero. 
*/ -static void -swr_fence_destroy(struct swr_fence *fence) -{ - /* Complete any work left if fence was not submitted */ - swr_fence_do_work(fence); - FREE(fence); -} - -/** - * Set ptr = fence, with reference counting - */ -void -swr_fence_reference(struct pipe_screen *screen, - struct pipe_fence_handle **ptr, - struct pipe_fence_handle *f) -{ - struct swr_fence *fence = swr_fence(f); - struct swr_fence *old; - - if (likely(ptr)) { - old = swr_fence(*ptr); - *ptr = f; - } else { - old = NULL; - } - - if (pipe_reference(&old->reference, &fence->reference)) { - swr_fence_finish(screen, NULL, (struct pipe_fence_handle *) old, 0); - swr_fence_destroy(old); - } -} - - -/* - * Wait for the fence to finish. - */ -bool -swr_fence_finish(struct pipe_screen *screen, - struct pipe_context *ctx, - struct pipe_fence_handle *fence_handle, - uint64_t timeout) -{ - while (!swr_is_fence_done(fence_handle)) - sched_yield(); - - swr_fence(fence_handle)->pending = FALSE; - - return TRUE; -} - - -uint64_t -swr_get_timestamp(struct pipe_screen *screen) -{ - return os_time_get_nano(); -} - - -void -swr_fence_init(struct pipe_screen *p_screen) -{ - p_screen->fence_reference = swr_fence_reference; - p_screen->fence_finish = swr_fence_finish; - p_screen->get_timestamp = swr_get_timestamp; - - /* Create persistant StoreTiles "flush" fence, used to signal completion - * of flushing tile state back to resource texture, via StoreTiles. */ - struct swr_screen *screen = swr_screen(p_screen); - screen->flush_fence = swr_fence_create(); -} diff --git a/src/gallium/drivers/swr/swr_fence.h b/src/gallium/drivers/swr/swr_fence.h deleted file mode 100644 index 2f7cd1cf9a6..00000000000 --- a/src/gallium/drivers/swr/swr_fence.h +++ /dev/null @@ -1,89 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN - * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- ***************************************************************************/ - -#ifndef SWR_FENCE_H -#define SWR_FENCE_H - -#include "pipe/p_state.h" -#include "util/u_inlines.h" - -#include "swr_fence_work.h" - -struct pipe_screen; - -struct swr_fence { - struct pipe_reference reference; - - uint64_t read; - uint64_t write; - - unsigned pending; - - unsigned id; /* Just for reference */ - - struct { - uint32_t count; - struct swr_fence_work head; - struct swr_fence_work *tail; - } work; -}; - - -static inline struct swr_fence * -swr_fence(struct pipe_fence_handle *fence) -{ - return (struct swr_fence *)fence; -} - - -static INLINE bool -swr_is_fence_done(struct pipe_fence_handle *fence_handle) -{ - struct swr_fence *fence = swr_fence(fence_handle); - return (fence->read == fence->write); -} - -static INLINE bool -swr_is_fence_pending(struct pipe_fence_handle *fence_handle) -{ - return swr_fence(fence_handle)->pending; -} - - -void swr_fence_init(struct pipe_screen *screen); - -struct pipe_fence_handle *swr_fence_create(); - -void swr_fence_reference(struct pipe_screen *screen, - struct pipe_fence_handle **ptr, - struct pipe_fence_handle *f); - -bool swr_fence_finish(struct pipe_screen *screen, - struct pipe_context *ctx, - struct pipe_fence_handle *fence_handle, - uint64_t timeout); - -void -swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fence); - -uint64_t swr_get_timestamp(struct pipe_screen *screen); - -#endif diff --git a/src/gallium/drivers/swr/swr_fence_work.cpp b/src/gallium/drivers/swr/swr_fence_work.cpp deleted file mode 100644 index 6df55666a36..00000000000 --- a/src/gallium/drivers/swr/swr_fence_work.cpp +++ /dev/null @@ -1,213 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ***************************************************************************/ - -#include "swr_context.h" -#include "swr_fence.h" - -#include "util/u_inlines.h" -#include "util/u_memory.h" - -/* - * Called by swr_fence_cb to complete the work queue - */ -void -swr_fence_do_work(struct swr_fence *fence) -{ - struct swr_fence_work *work, *tmp; - - if (fence->work.head.next) { - work = fence->work.head.next; - /* Immediately clear the head so any new work gets added to a new work - * queue */ - p_atomic_set(&fence->work.head.next, 0); - p_atomic_set(&fence->work.tail, &fence->work.head); - p_atomic_set(&fence->work.count, 0); - - do { - tmp = work->next; - work->callback(work); - FREE(work); - work = tmp; - } while(work); - } -} - - -/* - * Called by one of the specialized work routines below - */ -static inline void -swr_add_fence_work(struct pipe_fence_handle *fh, - struct swr_fence_work *work) -{ - /* If no fence, just do the work now */ - if (!fh) { - work->callback(work); - FREE(work); - return; - } - - struct swr_fence *fence = swr_fence(fh); - p_atomic_set(&fence->work.tail->next, work); - p_atomic_set(&fence->work.tail, work); - p_atomic_inc(&fence->work.count); -} - - -/* - * Generic free/free_aligned, and delete vs/fs - */ -template<bool aligned_free> -static void -swr_free_cb(struct swr_fence_work *work) -{ - if (aligned_free) - AlignedFree(work->free.data); - else - FREE(work->free.data); -} - -static void -swr_delete_vs_cb(struct swr_fence_work *work) -{ - delete work->free.swr_vs; -} - -static void -swr_delete_fs_cb(struct swr_fence_work *work) -{ - delete work->free.swr_fs; -} - -static void -swr_delete_gs_cb(struct swr_fence_work *work) -{ - delete work->free.swr_gs; -} - -static void -swr_delete_tcs_cb(struct swr_fence_work *work) -{ - delete work->free.swr_tcs; -} - -static void -swr_delete_tes_cb(struct swr_fence_work *work) -{ - delete work->free.swr_tes; -} - - -bool -swr_fence_work_free(struct pipe_fence_handle *fence, void *data, - bool 
aligned_free) -{ - struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work); - if (!work) - return false; - if (aligned_free) - work->callback = swr_free_cb<true>; - else - work->callback = swr_free_cb<false>; - work->free.data = data; - - swr_add_fence_work(fence, work); - - return true; -} - -bool -swr_fence_work_delete_vs(struct pipe_fence_handle *fence, - struct swr_vertex_shader *swr_vs) -{ - struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work); - if (!work) - return false; - work->callback = swr_delete_vs_cb; - work->free.swr_vs = swr_vs; - - swr_add_fence_work(fence, work); - - return true; -} - -bool -swr_fence_work_delete_fs(struct pipe_fence_handle *fence, - struct swr_fragment_shader *swr_fs) -{ - struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work); - if (!work) - return false; - work->callback = swr_delete_fs_cb; - work->free.swr_fs = swr_fs; - - swr_add_fence_work(fence, work); - - return true; -} - -bool -swr_fence_work_delete_gs(struct pipe_fence_handle *fence, - struct swr_geometry_shader *swr_gs) -{ - struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work); - if (!work) - return false; - work->callback = swr_delete_gs_cb; - work->free.swr_gs = swr_gs; - - swr_add_fence_work(fence, work); - - return true; -} - -bool -swr_fence_work_delete_tcs(struct pipe_fence_handle *fence, - struct swr_tess_control_shader *swr_tcs) -{ - struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work); - if (!work) - return false; - work->callback = swr_delete_tcs_cb; - work->free.swr_tcs = swr_tcs; - - swr_add_fence_work(fence, work); - - return true; -} - - -bool -swr_fence_work_delete_tes(struct pipe_fence_handle *fence, - struct swr_tess_evaluation_shader *swr_tes) -{ - struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work); - if (!work) - return false; - work->callback = swr_delete_tes_cb; - work->free.swr_tes = swr_tes; - - swr_add_fence_work(fence, work); - - return true; -}
\ No newline at end of file diff --git a/src/gallium/drivers/swr/swr_fence_work.h b/src/gallium/drivers/swr/swr_fence_work.h deleted file mode 100644 index ab411599ca5..00000000000 --- a/src/gallium/drivers/swr/swr_fence_work.h +++ /dev/null @@ -1,56 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN - * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- ***************************************************************************/ - -#ifndef SWR_FENCE_WORK_H -#define SWR_FENCE_WORK_H - -typedef void(*SWR_WORK_CALLBACK_FUNC)(struct swr_fence_work *work); - -struct swr_fence_work { - SWR_WORK_CALLBACK_FUNC callback; - - union { - void *data; - struct swr_vertex_shader *swr_vs; - struct swr_fragment_shader *swr_fs; - struct swr_geometry_shader *swr_gs; - struct swr_tess_control_shader *swr_tcs; - struct swr_tess_evaluation_shader *swr_tes; - } free; - - struct swr_fence_work *next; -}; - -void swr_fence_do_work(struct swr_fence *fence); - -bool swr_fence_work_free(struct pipe_fence_handle *fence, void *data, - bool aligned_free = false); -bool swr_fence_work_delete_vs(struct pipe_fence_handle *fence, - struct swr_vertex_shader *swr_vs); -bool swr_fence_work_delete_fs(struct pipe_fence_handle *fence, - struct swr_fragment_shader *swr_vs); -bool swr_fence_work_delete_gs(struct pipe_fence_handle *fence, - struct swr_geometry_shader *swr_gs); -bool swr_fence_work_delete_tcs(struct pipe_fence_handle *fence, - struct swr_tess_control_shader *swr_tcs); -bool swr_fence_work_delete_tes(struct pipe_fence_handle *fence, - struct swr_tess_evaluation_shader *swr_tes); -#endif diff --git a/src/gallium/drivers/swr/swr_loader.cpp b/src/gallium/drivers/swr/swr_loader.cpp deleted file mode 100644 index 1fb14e636d7..00000000000 --- a/src/gallium/drivers/swr/swr_loader.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2016 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ***************************************************************************/ - -#include "memory/InitMemory.h" -#include "util/u_cpu_detect.h" -#include "util/u_dl.h" -#include "swr_public.h" -#include "swr_screen.h" - -#include <stdio.h> - -// Helper function to resolve the backend filename based on architecture -static bool -swr_initialize_screen_interface(struct swr_screen *screen, const char arch[]) -{ -#ifdef HAVE_SWR_BUILTIN - screen->pLibrary = NULL; - screen->pfnSwrGetInterface = SwrGetInterface; - screen->pfnSwrGetTileInterface = SwrGetTileIterface; - InitTilesTable(); - swr_print_info("(using: builtin).\n"); -#else - char filename[256] = { 0 }; - sprintf(filename, "%sswr%s%s", UTIL_DL_PREFIX, arch, UTIL_DL_EXT); - - screen->pLibrary = util_dl_open(filename); - if (!screen->pLibrary) { - fprintf(stderr, "(skipping: %s).\n", util_dl_error()); - return false; - } - - util_dl_proc pApiProc = util_dl_get_proc_address(screen->pLibrary, - "SwrGetInterface"); - util_dl_proc pTileApiProc = util_dl_get_proc_address(screen->pLibrary, - "SwrGetTileIterface"); - util_dl_proc pInitFunc = util_dl_get_proc_address(screen->pLibrary, - "InitTilesTable"); - if (!pApiProc || !pInitFunc || !pTileApiProc) { - fprintf(stderr, "(skipping: %s).\n", util_dl_error()); - util_dl_close(screen->pLibrary); - screen->pLibrary = NULL; - return false; - } - - screen->pfnSwrGetInterface = (PFNSwrGetInterface)pApiProc; - screen->pfnSwrGetTileInterface = (PFNSwrGetTileInterface)pTileApiProc; - - SWR_ASSERT(screen->pfnSwrGetInterface != nullptr); - SWR_ASSERT(screen->pfnSwrGetTileInterface != nullptr); - SWR_ASSERT(pInitFunc != nullptr); - - pInitFunc(); - - swr_print_info("(using: %s).\n", filename); -#endif - - return true; -} - - -struct pipe_screen * -swr_create_screen(struct sw_winsys *winsys) -{ - struct pipe_screen *p_screen = swr_create_screen_internal(winsys); - if (!p_screen) { - return NULL; - } - - struct swr_screen *screen = swr_screen(p_screen); - screen->is_knl = false; - - 
util_cpu_detect(); - - if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512er) { - swr_print_info("SWR detected KNL instruction support "); -#ifndef HAVE_SWR_KNL - swr_print_info("(skipping: not built).\n"); -#else - if (swr_initialize_screen_interface(screen, "KNL")) { - screen->is_knl = true; - return p_screen; - } -#endif - } - - if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512bw) { - swr_print_info("SWR detected SKX instruction support "); -#ifndef HAVE_SWR_SKX - swr_print_info("(skipping not built).\n"); -#else - if (swr_initialize_screen_interface(screen, "SKX")) - return p_screen; -#endif - } - - if (util_get_cpu_caps()->has_avx2) { - swr_print_info("SWR detected AVX2 instruction support "); -#ifndef HAVE_SWR_AVX2 - swr_print_info("(skipping not built).\n"); -#else - if (swr_initialize_screen_interface(screen, "AVX2")) - return p_screen; -#endif - } - - if (util_get_cpu_caps()->has_avx) { - swr_print_info("SWR detected AVX instruction support "); -#ifndef HAVE_SWR_AVX - swr_print_info("(skipping not built).\n"); -#else - if (swr_initialize_screen_interface(screen, "AVX")) - return p_screen; -#endif - } - - fprintf(stderr, "SWR could not initialize a supported CPU architecture.\n"); - swr_destroy_screen_internal(&screen); - - return NULL; -} - - -#ifdef _WIN32 -// swap function called from libl_gdi.c - -void -swr_gdi_swap(struct pipe_screen *screen, - struct pipe_context *ctx, - struct pipe_resource *res, - void *hDC) -{ - screen->flush_frontbuffer(screen, - ctx, - res, - 0, 0, - hDC, - NULL); -} - -#endif /* _WIN32 */ diff --git a/src/gallium/drivers/swr/swr_memory.h b/src/gallium/drivers/swr/swr_memory.h deleted file mode 100644 index bf6eaa34758..00000000000 --- a/src/gallium/drivers/swr/swr_memory.h +++ /dev/null @@ -1,61 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ***************************************************************************/ - -#pragma once -#include "rasterizer/core/context.h" -INLINE void -swr_LoadHotTile(HANDLE hDC, - HANDLE hWorkerPrivateData, - SWR_FORMAT dstFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - UINT x, UINT y, - uint32_t renderTargetArrayIndex, uint8_t* pDstHotTile) -{ - DRAW_CONTEXT *pDC = (DRAW_CONTEXT*)hDC; - swr_draw_context *pSDC = (swr_draw_context*)GetPrivateState(pDC); - SWR_SURFACE_STATE *pSrcSurface = &pSDC->renderTargets[renderTargetIndex]; - - pSDC->pTileAPI->pfnSwrLoadHotTile(hWorkerPrivateData, pSrcSurface, pDC->pContext->pBucketMgr, dstFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pDstHotTile); -} - -INLINE void -swr_StoreHotTile(HANDLE hDC, - HANDLE hWorkerPrivateData, - SWR_FORMAT srcFormat, - SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - UINT x, UINT y, - uint32_t renderTargetArrayIndex, uint8_t* pSrcHotTile) -{ - DRAW_CONTEXT *pDC = (DRAW_CONTEXT*)hDC; - swr_draw_context *pSDC = (swr_draw_context*)GetPrivateState(pDC); - SWR_SURFACE_STATE *pDstSurface = &pSDC->renderTargets[renderTargetIndex]; - - pSDC->pTileAPI->pfnSwrStoreHotTileToSurface(hWorkerPrivateData, pDstSurface, pDC->pContext->pBucketMgr, srcFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pSrcHotTile); -} - -INLINE gfxptr_t -swr_MakeGfxPtr(HANDLE hPrivateContext, void* sysAddr) -{ - // Fulfill an unused internal interface - return (gfxptr_t)sysAddr; -} diff --git a/src/gallium/drivers/swr/swr_public.h b/src/gallium/drivers/swr/swr_public.h deleted file mode 100644 index 2a7d2984cb3..00000000000 --- a/src/gallium/drivers/swr/swr_public.h +++ /dev/null @@ -1,57 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ***************************************************************************/ - -#ifndef SWR_PUBLIC_H -#define SWR_PUBLIC_H - -struct pipe_screen; -struct pipe_context; -struct sw_displaytarget; -struct sw_winsys; -struct swr_screen; - -#ifdef __cplusplus -extern "C" { -#endif - -// driver entry point -struct pipe_screen *swr_create_screen(struct sw_winsys *winsys); - -// arch-specific dll entry point -struct pipe_screen *swr_create_screen_internal(struct sw_winsys *winsys); - -// cleanup for failed screen creation -void swr_destroy_screen_internal(struct swr_screen **screen); - -#ifdef _WIN32 -void swr_gdi_swap(struct pipe_screen *screen, - struct pipe_context *ctx, - struct pipe_resource *res, - void *hDC); -#endif /* _WIN32 */ - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/gallium/drivers/swr/swr_query.cpp b/src/gallium/drivers/swr/swr_query.cpp deleted file mode 100644 index 005b64fb090..00000000000 --- a/src/gallium/drivers/swr/swr_query.cpp +++ /dev/null @@ -1,272 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ***************************************************************************/ - -#include "pipe/p_defines.h" -#include "util/u_memory.h" -#include "util/os_time.h" -#include "swr_context.h" -#include "swr_fence.h" -#include "swr_query.h" -#include "swr_screen.h" -#include "swr_state.h" -#include "common/os.h" - -static struct swr_query * -swr_query(struct pipe_query *p) -{ - return (struct swr_query *)p; -} - -static struct pipe_query * -swr_create_query(struct pipe_context *pipe, unsigned type, unsigned index) -{ - struct swr_query *pq; - - assert(type < PIPE_QUERY_TYPES); - assert(index < MAX_SO_STREAMS); - - pq = (struct swr_query *) AlignedMalloc(sizeof(struct swr_query), 64); - - if (pq) { - memset(pq, 0, sizeof(*pq)); - pq->type = type; - pq->index = index; - } - - return (struct pipe_query *)pq; -} - - -static void -swr_destroy_query(struct pipe_context *pipe, struct pipe_query *q) -{ - struct swr_query *pq = swr_query(q); - - if (pq->fence) { - if (swr_is_fence_pending(pq->fence)) - swr_fence_finish(pipe->screen, NULL, pq->fence, 0); - swr_fence_reference(pipe->screen, &pq->fence, NULL); - } - - AlignedFree(pq); -} - - -static bool -swr_get_query_result(struct pipe_context *pipe, - struct pipe_query *q, - bool wait, - union pipe_query_result *result) -{ - struct swr_query *pq = swr_query(q); - unsigned index = pq->index; - - if (pq->fence) { - if (!wait && !swr_is_fence_done(pq->fence)) - return false; - - swr_fence_finish(pipe->screen, NULL, pq->fence, 0); - 
swr_fence_reference(pipe->screen, &pq->fence, NULL); - } - - /* All values are reset to 0 at swr_begin_query, except starting timestamp. - * Counters become simply end values. */ - switch (pq->type) { - /* Booleans */ - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - result->b = pq->result.core.DepthPassCount != 0; - break; - case PIPE_QUERY_GPU_FINISHED: - result->b = true; - break; - /* Counters */ - case PIPE_QUERY_OCCLUSION_COUNTER: - result->u64 = pq->result.core.DepthPassCount; - break; - case PIPE_QUERY_TIMESTAMP: - case PIPE_QUERY_TIME_ELAPSED: - result->u64 = pq->result.timestamp_end - pq->result.timestamp_start; - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - result->u64 = pq->result.coreFE.IaPrimitives; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - result->u64 = pq->result.coreFE.SoNumPrimsWritten[index]; - break; - /* Structures */ - case PIPE_QUERY_SO_STATISTICS: { - struct pipe_query_data_so_statistics *so_stats = &result->so_statistics; - so_stats->num_primitives_written = - pq->result.coreFE.SoNumPrimsWritten[index]; - so_stats->primitives_storage_needed = - pq->result.coreFE.SoPrimStorageNeeded[index]; - } break; - case PIPE_QUERY_TIMESTAMP_DISJOINT: - /* os_get_time_nano returns nanoseconds */ - result->timestamp_disjoint.frequency = UINT64_C(1000000000); - result->timestamp_disjoint.disjoint = FALSE; - break; - case PIPE_QUERY_PIPELINE_STATISTICS: { - struct pipe_query_data_pipeline_statistics *p_stats = - &result->pipeline_statistics; - p_stats->ia_vertices = pq->result.coreFE.IaVertices; - p_stats->ia_primitives = pq->result.coreFE.IaPrimitives; - p_stats->vs_invocations = pq->result.coreFE.VsInvocations; - p_stats->gs_invocations = pq->result.coreFE.GsInvocations; - p_stats->gs_primitives = pq->result.coreFE.GsPrimitives; - p_stats->c_invocations = pq->result.coreFE.CPrimitives; - p_stats->c_primitives = pq->result.coreFE.CPrimitives; - p_stats->ps_invocations = pq->result.core.PsInvocations; - 
p_stats->hs_invocations = pq->result.coreFE.HsInvocations; - p_stats->ds_invocations = pq->result.coreFE.DsInvocations; - p_stats->cs_invocations = pq->result.core.CsInvocations; - } break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: { - uint64_t num_primitives_written = - pq->result.coreFE.SoNumPrimsWritten[index]; - uint64_t primitives_storage_needed = - pq->result.coreFE.SoPrimStorageNeeded[index]; - result->b = num_primitives_written > primitives_storage_needed; - } - break; - default: - assert(0 && "Unsupported query"); - break; - } - - return true; -} - -static bool -swr_begin_query(struct pipe_context *pipe, struct pipe_query *q) -{ - struct swr_context *ctx = swr_context(pipe); - struct swr_query *pq = swr_query(q); - - /* Initialize Results */ - memset(&pq->result, 0, sizeof(pq->result)); - switch (pq->type) { - case PIPE_QUERY_GPU_FINISHED: - case PIPE_QUERY_TIMESTAMP: - /* nothing to do, but don't want the default */ - break; - case PIPE_QUERY_TIME_ELAPSED: - pq->result.timestamp_start = swr_get_timestamp(pipe->screen); - break; - default: - /* Core counters required. Update draw context with location to - * store results. */ - swr_update_draw_context(ctx, &pq->result); - - /* Only change stat collection if there are no active queries */ - if (ctx->active_queries == 0) { - ctx->api.pfnSwrEnableStatsFE(ctx->swrContext, TRUE); - ctx->api.pfnSwrEnableStatsBE(ctx->swrContext, TRUE); - } - ctx->active_queries++; - break; - } - - - return true; -} - -static bool -swr_end_query(struct pipe_context *pipe, struct pipe_query *q) -{ - struct swr_context *ctx = swr_context(pipe); - struct swr_query *pq = swr_query(q); - - switch (pq->type) { - case PIPE_QUERY_GPU_FINISHED: - /* nothing to do, but don't want the default */ - break; - case PIPE_QUERY_TIMESTAMP: - case PIPE_QUERY_TIME_ELAPSED: - pq->result.timestamp_end = swr_get_timestamp(pipe->screen); - break; - default: - /* Stats are updated asynchronously, a fence is used to signal - * completion. 
*/ - if (!pq->fence) { - struct swr_screen *screen = swr_screen(pipe->screen); - swr_fence_reference(pipe->screen, &pq->fence, screen->flush_fence); - } - swr_fence_submit(ctx, pq->fence); - - /* Only change stat collection if there are no active queries */ - ctx->active_queries--; - if (ctx->active_queries == 0) { - ctx->api.pfnSwrEnableStatsFE(ctx->swrContext, FALSE); - ctx->api.pfnSwrEnableStatsBE(ctx->swrContext, FALSE); - } - - break; - } - - return true; -} - - -bool -swr_check_render_cond(struct pipe_context *pipe) -{ - struct swr_context *ctx = swr_context(pipe); - bool b, wait; - uint64_t result; - - if (!ctx->render_cond_query) - return true; /* no query predicate, draw normally */ - - wait = (ctx->render_cond_mode == PIPE_RENDER_COND_WAIT - || ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT); - - b = pipe->get_query_result( - pipe, ctx->render_cond_query, wait, (union pipe_query_result *)&result); - if (b) - return ((!result) == ctx->render_cond_cond); - else - return true; -} - - -static void -swr_set_active_query_state(struct pipe_context *pipe, bool enable) -{ -} - -void -swr_query_init(struct pipe_context *pipe) -{ - struct swr_context *ctx = swr_context(pipe); - - pipe->create_query = swr_create_query; - pipe->destroy_query = swr_destroy_query; - pipe->begin_query = swr_begin_query; - pipe->end_query = swr_end_query; - pipe->get_query_result = swr_get_query_result; - pipe->set_active_query_state = swr_set_active_query_state; - - ctx->active_queries = 0; -} diff --git a/src/gallium/drivers/swr/swr_query.h b/src/gallium/drivers/swr/swr_query.h deleted file mode 100644 index d838dc859e2..00000000000 --- a/src/gallium/drivers/swr/swr_query.h +++ /dev/null @@ -1,48 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ***************************************************************************/ - -#ifndef SWR_QUERY_H -#define SWR_QUERY_H - - -#include <limits.h> - -struct swr_query_result { - SWR_STATS core; - SWR_STATS_FE coreFE; - uint64_t timestamp_start; - uint64_t timestamp_end; -}; - -OSALIGNLINE(struct) swr_query { - unsigned type; /* PIPE_QUERY_* */ - unsigned index; - - struct swr_query_result result; - struct pipe_fence_handle *fence; -}; - -extern void swr_query_init(struct pipe_context *pipe); - -extern bool swr_check_render_cond(struct pipe_context *pipe); -#endif diff --git a/src/gallium/drivers/swr/swr_resource.h b/src/gallium/drivers/swr/swr_resource.h deleted file mode 100644 index 2228dff7488..00000000000 --- a/src/gallium/drivers/swr/swr_resource.h +++ /dev/null @@ -1,145 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ***************************************************************************/ - -#ifndef SWR_RESOURCE_H -#define SWR_RESOURCE_H - -#include "memory/SurfaceState.h" -#include "pipe/p_state.h" -#include "api.h" - -struct sw_displaytarget; - -enum swr_resource_status { - SWR_RESOURCE_UNUSED = 0x0, - SWR_RESOURCE_READ = 0x1, - SWR_RESOURCE_WRITE = 0x2, -}; - -struct swr_resource { - struct pipe_resource base; - - bool has_depth; - bool has_stencil; - - SWR_SURFACE_STATE swr; - SWR_SURFACE_STATE secondary; /* for faking depth/stencil merged formats */ - - struct sw_displaytarget *display_target; - - /* If resource is multisample, then this points to a alternate resource - * containing the resolved multisample surface, otherwise null */ - struct pipe_resource *resolve_target; - - size_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; - size_t secondary_mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; - - enum swr_resource_status status; - - /* last pipe that used (validated) this resource */ - struct pipe_context *curr_pipe; -}; - - -static INLINE struct swr_resource * -swr_resource(struct pipe_resource *resource) -{ - return (struct swr_resource *)resource; -} - -static INLINE bool -swr_resource_is_texture(const struct pipe_resource *resource) -{ - switch (resource->target) { - case PIPE_BUFFER: - return false; - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_1D_ARRAY: - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_2D_ARRAY: - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_3D: - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - return true; - default: - assert(0); - return false; - } -} - - -static INLINE uint8_t * -swr_resource_data(struct pipe_resource *resource) -{ - struct swr_resource *swr_r = swr_resource(resource); 
- - assert(!swr_resource_is_texture(resource)); - - return (uint8_t*)(swr_r->swr.xpBaseAddress); -} - - -void swr_invalidate_render_target(struct pipe_context *pipe, - uint32_t attachment, - uint16_t width, uint16_t height); - -void swr_store_render_target(struct pipe_context *pipe, - uint32_t attachment, - enum SWR_TILE_STATE post_tile_state); - -void swr_store_dirty_resource(struct pipe_context *pipe, - struct pipe_resource *resource, - enum SWR_TILE_STATE post_tile_state); - -void swr_update_resource_status(struct pipe_context *, - const struct pipe_draw_info *); - -/* - * Functions to indicate a resource's in-use status. - */ -static INLINE enum -swr_resource_status & operator|=(enum swr_resource_status & a, - enum swr_resource_status b) { - return (enum swr_resource_status &)((int&)a |= (int)b); -} - -static INLINE void -swr_resource_read(struct pipe_resource *resource) -{ - swr_resource(resource)->status |= SWR_RESOURCE_READ; -} - -static INLINE void -swr_resource_write(struct pipe_resource *resource) -{ - swr_resource(resource)->status |= SWR_RESOURCE_WRITE; -} - -static INLINE void -swr_resource_unused(struct pipe_resource *resource) -{ - swr_resource(resource)->status = SWR_RESOURCE_UNUSED; -} - -#endif diff --git a/src/gallium/drivers/swr/swr_scratch.cpp b/src/gallium/drivers/swr/swr_scratch.cpp deleted file mode 100644 index 66f18365cc7..00000000000 --- a/src/gallium/drivers/swr/swr_scratch.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ***************************************************************************/ - -#include "util/u_memory.h" -#include "swr_context.h" -#include "swr_screen.h" -#include "swr_scratch.h" -#include "swr_fence.h" -#include "swr_fence_work.h" -#include "api.h" - -void * -swr_copy_to_scratch_space(struct swr_context *ctx, - struct swr_scratch_space *space, - const void *user_buffer, - unsigned int size) -{ - void *ptr; - assert(space); - assert(size); - - /* Allocate enough so that MAX_DRAWS_IN_FLIGHT sets fit. 
*/ - uint32_t max_size_in_flight = size * ctx->max_draws_in_flight; - - /* Need to grow space */ - if (max_size_in_flight > space->current_size) { - space->current_size = max_size_in_flight; - - if (space->base) { - /* defer delete, use aligned-free, fence finish enforces the defer - * delete will be on the *next* fence */ - struct swr_screen *screen = swr_screen(ctx->pipe.screen); - swr_fence_finish(ctx->pipe.screen, NULL, screen->flush_fence, 0); - swr_fence_work_free(screen->flush_fence, space->base, true); - space->base = NULL; - } - - if (!space->base) { - space->base = (uint8_t *)AlignedMalloc(space->current_size, - sizeof(void *)); - space->head = (void *)space->base; - } - } - - /* Wrap */ - if (((uint8_t *)space->head + size) - >= ((uint8_t *)space->base + space->current_size)) { - space->head = space->base; - } - - ptr = space->head; - space->head = (uint8_t *)space->head + size; - - /* Copy user_buffer to scratch */ - if (user_buffer) - memcpy(ptr, user_buffer, size); - - return ptr; -} - - -void -swr_init_scratch_buffers(struct swr_context *ctx) -{ - struct swr_scratch_buffers *scratch; - - scratch = CALLOC_STRUCT(swr_scratch_buffers); - ctx->scratch = scratch; -} - -void -swr_destroy_scratch_buffers(struct swr_context *ctx) -{ - struct swr_scratch_buffers *scratch = ctx->scratch; - - if (scratch) { - AlignedFree(scratch->vs_constants.base); - AlignedFree(scratch->fs_constants.base); - AlignedFree(scratch->gs_constants.base); - AlignedFree(scratch->tcs_constants.base); - AlignedFree(scratch->tes_constants.base); - AlignedFree(scratch->vertex_buffer.base); - AlignedFree(scratch->index_buffer.base); - FREE(scratch); - } -} diff --git a/src/gallium/drivers/swr/swr_scratch.h b/src/gallium/drivers/swr/swr_scratch.h deleted file mode 100644 index 4d1c82fc6fc..00000000000 --- a/src/gallium/drivers/swr/swr_scratch.h +++ /dev/null @@ -1,66 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel 
Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ***************************************************************************/ - -#ifndef SWR_SCRATCH_H -#define SWR_SCRATCH_H - -struct swr_scratch_space { - void *head; - unsigned int current_size; - /* TODO XXX: Add a fence for wrap condition. */ - - void *base; -}; - -struct swr_scratch_buffers { - struct swr_scratch_space vs_constants; - struct swr_scratch_space fs_constants; - struct swr_scratch_space gs_constants; - struct swr_scratch_space tcs_constants; - struct swr_scratch_space tes_constants; - struct swr_scratch_space vertex_buffer; - struct swr_scratch_space index_buffer; -}; - - -/* - * swr_copy_to_scratch_space - * Copies size bytes of user_buffer into the scratch ring buffer. - * Used to store temporary data such as client arrays and constants. 
- * - * Inputs: - * space ptr to scratch pool (vs_constants, fs_constants) - * user_buffer, data to copy into scratch space - * size to be copied - * Returns: - * pointer to data copied to scratch space. - */ -void *swr_copy_to_scratch_space(struct swr_context *ctx, - struct swr_scratch_space *space, - const void *user_buffer, - unsigned int size); - -void swr_init_scratch_buffers(struct swr_context *ctx); -void swr_destroy_scratch_buffers(struct swr_context *ctx); - -#endif diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp deleted file mode 100644 index 4c274fd86e5..00000000000 --- a/src/gallium/drivers/swr/swr_screen.cpp +++ /dev/null @@ -1,1155 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ***************************************************************************/ - -#include "swr_context.h" -#include "swr_public.h" -#include "swr_screen.h" -#include "swr_resource.h" -#include "swr_fence.h" -#include "gen_knobs.h" - -#include "pipe/p_screen.h" -#include "pipe/p_defines.h" -#include "util/u_memory.h" -#include "util/format/u_format.h" -#include "util/u_inlines.h" -#include "util/u_cpu_detect.h" -#include "util/format/u_format_s3tc.h" -#include "util/u_string.h" -#include "util/u_screen.h" - -#include "frontend/sw_winsys.h" - -#include "jit_api.h" - -#include "memory/TilingFunctions.h" - -#include <stdio.h> -#include <map> - -/* - * Max texture sizes - * XXX Check max texture size values against core and sampler. - */ -#define SWR_MAX_TEXTURE_SIZE (2 * 1024 * 1024 * 1024ULL) /* 2GB */ -/* Not all texture formats can fit into 2GB limit, but we have to - live with that. See lp_limits.h for more details */ -#define SWR_MAX_TEXTURE_2D_SIZE 16384 -#define SWR_MAX_TEXTURE_3D_LEVELS 12 /* 2K x 2K x 2K for now */ -#define SWR_MAX_TEXTURE_CUBE_LEVELS 14 /* 8K x 8K for now */ -#define SWR_MAX_TEXTURE_ARRAY_LAYERS 512 /* 8K x 512 / 8K x 8K x 512 */ - -/* Default max client_copy_limit */ -#define SWR_CLIENT_COPY_LIMIT 8192 - -/* Flag indicates creation of alternate surface, to prevent recursive loop - * in resource creation when msaa_force_enable is set. 
*/ -#define SWR_RESOURCE_FLAG_ALT_SURFACE (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) - - -static const char * -swr_get_name(struct pipe_screen *screen) -{ - static char buf[100]; - snprintf(buf, sizeof(buf), "SWR (LLVM " MESA_LLVM_VERSION_STRING ", %u bits)", - lp_native_vector_width); - return buf; -} - -static const char * -swr_get_vendor(struct pipe_screen *screen) -{ - return "Intel Corporation"; -} - -static bool -swr_is_format_supported(struct pipe_screen *_screen, - enum pipe_format format, - enum pipe_texture_target target, - unsigned sample_count, - unsigned storage_sample_count, - unsigned bind) -{ - struct swr_screen *screen = swr_screen(_screen); - struct sw_winsys *winsys = screen->winsys; - const struct util_format_description *format_desc; - - assert(target == PIPE_BUFFER || target == PIPE_TEXTURE_1D - || target == PIPE_TEXTURE_1D_ARRAY - || target == PIPE_TEXTURE_2D - || target == PIPE_TEXTURE_2D_ARRAY - || target == PIPE_TEXTURE_RECT - || target == PIPE_TEXTURE_3D - || target == PIPE_TEXTURE_CUBE - || target == PIPE_TEXTURE_CUBE_ARRAY); - - if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) - return false; - - format_desc = util_format_description(format); - if (!format_desc) - return false; - - if ((sample_count > screen->msaa_max_count) - || !util_is_power_of_two_or_zero(sample_count)) - return false; - - if (bind & PIPE_BIND_DISPLAY_TARGET) { - if (!winsys->is_displaytarget_format_supported(winsys, bind, format)) - return false; - } - - if (bind & PIPE_BIND_RENDER_TARGET) { - if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) - return false; - - if (mesa_to_swr_format(format) == (SWR_FORMAT)-1) - return false; - - /* - * Although possible, it is unnatural to render into compressed or YUV - * surfaces. So disable these here to avoid going into weird paths - * inside gallium frontends. 
- */ - if (format_desc->block.width != 1 || format_desc->block.height != 1) - return false; - } - - if (bind & PIPE_BIND_DEPTH_STENCIL) { - if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) - return false; - - if (mesa_to_swr_format(format) == (SWR_FORMAT)-1) - return false; - } - - if (bind & PIPE_BIND_VERTEX_BUFFER) { - if (mesa_to_swr_format(format) == (SWR_FORMAT)-1) { - return false; - } - } - - if (format_desc->layout == UTIL_FORMAT_LAYOUT_ASTC || - format_desc->layout == UTIL_FORMAT_LAYOUT_FXT1) - { - return false; - } - - if (format_desc->layout == UTIL_FORMAT_LAYOUT_ETC && - format != PIPE_FORMAT_ETC1_RGB8) { - return false; - } - - if ((bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW)) && - ((bind & PIPE_BIND_DISPLAY_TARGET) == 0)) { - /* Disable all 3-channel formats, where channel size != 32 bits. - * In some cases we run into crashes (in generate_unswizzled_blend()), - * for 3-channel RGB16 variants, there was an apparent LLVM bug. - * In any case, disabling the shallower 3-channel formats avoids a - * number of issues with GL_ARB_copy_image support. 
- */ - if (format_desc->is_array && - format_desc->nr_channels == 3 && - format_desc->block.bits != 96) { - return false; - } - } - - return TRUE; -} - -static int -swr_get_param(struct pipe_screen *screen, enum pipe_cap param) -{ - switch (param) { - /* limits */ - case PIPE_CAP_MAX_RENDER_TARGETS: - return PIPE_MAX_COLOR_BUFS; - case PIPE_CAP_MAX_TEXTURE_2D_SIZE: - return SWR_MAX_TEXTURE_2D_SIZE; - case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: - return SWR_MAX_TEXTURE_3D_LEVELS; - case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: - return SWR_MAX_TEXTURE_CUBE_LEVELS; - case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - return MAX_SO_STREAMS; - case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - return MAX_ATTRIBUTES * 4; - case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: - case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: - return 1024; - case PIPE_CAP_MAX_VERTEX_STREAMS: - return 4; - case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: - return 2048; - case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: - return SWR_MAX_TEXTURE_ARRAY_LAYERS; - case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MIN_TEXEL_OFFSET: - return -8; - case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MAX_TEXEL_OFFSET: - return 7; - case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: - return 4; - case PIPE_CAP_GLSL_FEATURE_LEVEL: - return 330; - case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: - return 140; - case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: - return 16; - case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: - return 64; - case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: - return 65536; - case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: - return 1; - case PIPE_CAP_MAX_VIEWPORTS: - return KNOB_NUM_VIEWPORTS_SCISSORS; - case PIPE_CAP_ENDIANNESS: - return PIPE_ENDIAN_NATIVE; - - /* supported features */ - case PIPE_CAP_NPOT_TEXTURES: - case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: - case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: - case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: - case 
PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: - case PIPE_CAP_VERTEX_SHADER_SATURATE: - case PIPE_CAP_POINT_SPRITE: - case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: - case PIPE_CAP_OCCLUSION_QUERY: - case PIPE_CAP_QUERY_TIME_ELAPSED: - case PIPE_CAP_QUERY_PIPELINE_STATISTICS: - case PIPE_CAP_TEXTURE_MIRROR_CLAMP: - case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE: - case PIPE_CAP_TEXTURE_SWIZZLE: - case PIPE_CAP_BLEND_EQUATION_SEPARATE: - case PIPE_CAP_INDEP_BLEND_ENABLE: - case PIPE_CAP_INDEP_BLEND_FUNC: - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: - case PIPE_CAP_DEPTH_CLIP_DISABLE: - case PIPE_CAP_PRIMITIVE_RESTART: - case PIPE_CAP_PRIMITIVE_RESTART_FIXED_INDEX: - case PIPE_CAP_TGSI_INSTANCEID: - case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: - case PIPE_CAP_START_INSTANCE: - case PIPE_CAP_SEAMLESS_CUBE_MAP: - case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: - case PIPE_CAP_CONDITIONAL_RENDER: - case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: - case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: - case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: - case PIPE_CAP_USER_VERTEX_BUFFERS: - case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: - case PIPE_CAP_QUERY_TIMESTAMP: - case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: - case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: - case PIPE_CAP_DRAW_INDIRECT: - case PIPE_CAP_UMA: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: - case PIPE_CAP_CLIP_HALFZ: - case PIPE_CAP_POLYGON_OFFSET_CLAMP: - case PIPE_CAP_DEPTH_BOUNDS_TEST: - case PIPE_CAP_CLEAR_TEXTURE: - case PIPE_CAP_TEXTURE_FLOAT_LINEAR: - case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: - case PIPE_CAP_CULL_DISTANCE: - case PIPE_CAP_CUBE_MAP_ARRAY: - case PIPE_CAP_DOUBLES: - case PIPE_CAP_TEXTURE_QUERY_LOD: - case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: - case PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE: - case PIPE_CAP_QUERY_SO_OVERFLOW: - case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: - case 
PIPE_CAP_IMAGE_STORE_FORMATTED: - return 1; - - case PIPE_CAP_SHAREABLE_SHADERS: - return 0; - - /* MSAA support - * If user has explicitly set max_sample_count = 1 (via SWR_MSAA_MAX_COUNT) - * then disable all MSAA support and go back to old (FAKE_SW_MSAA) caps. */ - case PIPE_CAP_TEXTURE_MULTISAMPLE: - case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: - return (swr_screen(screen)->msaa_max_count > 1) ? 1 : 0; - case PIPE_CAP_FAKE_SW_MSAA: - return (swr_screen(screen)->msaa_max_count > 1) ? 0 : 1; - - /* fetch jit change for 2-4GB buffers requires alignment */ - case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: - return 1; - - /* unsupported features */ - case PIPE_CAP_TEXTURE_TRANSFER_MODES: - case PIPE_CAP_PCI_GROUP: - case PIPE_CAP_PCI_BUS: - case PIPE_CAP_PCI_DEVICE: - case PIPE_CAP_PCI_FUNCTION: - case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: - return 0; - case PIPE_CAP_MAX_GS_INVOCATIONS: - return 32; - case PIPE_CAP_MAX_SHADER_BUFFER_SIZE: - return 1 << 27; - case PIPE_CAP_MAX_VARYINGS: - return 32; - - case PIPE_CAP_VENDOR_ID: - return 0xFFFFFFFF; - case PIPE_CAP_DEVICE_ID: - return 0xFFFFFFFF; - case PIPE_CAP_ACCELERATED: - return 0; - case PIPE_CAP_VIDEO_MEMORY: { - /* XXX: Do we want to return the full amount of system memory ? 
*/ - uint64_t system_memory; - - if (!os_get_total_physical_memory(&system_memory)) - return 0; - - return (int)(system_memory >> 20); - } - default: - return u_pipe_screen_get_param_defaults(screen, param); - } -} - -static int -swr_get_shader_param(struct pipe_screen *screen, - enum pipe_shader_type shader, - enum pipe_shader_cap param) -{ - if (shader != PIPE_SHADER_VERTEX && - shader != PIPE_SHADER_FRAGMENT && - shader != PIPE_SHADER_GEOMETRY && - shader != PIPE_SHADER_TESS_CTRL && - shader != PIPE_SHADER_TESS_EVAL) - return 0; - - if (param == PIPE_SHADER_CAP_MAX_SHADER_BUFFERS || - param == PIPE_SHADER_CAP_MAX_SHADER_IMAGES) { - return 0; - } - - return gallivm_get_shader_param(param); -} - - -static float -swr_get_paramf(struct pipe_screen *screen, enum pipe_capf param) -{ - switch (param) { - case PIPE_CAPF_MIN_LINE_WIDTH: - case PIPE_CAPF_MIN_LINE_WIDTH_AA: - case PIPE_CAPF_MIN_POINT_SIZE: - case PIPE_CAPF_MIN_POINT_SIZE_AA: - return 1; - case PIPE_CAPF_POINT_SIZE_GRANULARITY: - case PIPE_CAPF_LINE_WIDTH_GRANULARITY: - return 0.1; - case PIPE_CAPF_MAX_LINE_WIDTH: - case PIPE_CAPF_MAX_LINE_WIDTH_AA: - case PIPE_CAPF_MAX_POINT_SIZE: - return 255.0; /* arbitrary */ - case PIPE_CAPF_MAX_POINT_SIZE_AA: - return 0.0; - case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: - return 0.0; - case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: - return 16.0; /* arbitrary */ - case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: - case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: - case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: - return 0.0f; - } - /* should only get here on unhandled cases */ - debug_printf("Unexpected PIPE_CAPF %d query\n", param); - return 0.0; -} - -SWR_FORMAT -mesa_to_swr_format(enum pipe_format format) -{ - static const std::map<pipe_format,SWR_FORMAT> mesa2swr = { - /* depth / stencil */ - {PIPE_FORMAT_Z16_UNORM, R16_UNORM}, // z - {PIPE_FORMAT_Z32_FLOAT, R32_FLOAT}, // z - {PIPE_FORMAT_Z24_UNORM_S8_UINT, R24_UNORM_X8_TYPELESS}, // z - {PIPE_FORMAT_Z24X8_UNORM, 
R24_UNORM_X8_TYPELESS}, // z - {PIPE_FORMAT_Z32_FLOAT_S8X24_UINT, R32_FLOAT_X8X24_TYPELESS}, // z - - /* alpha */ - {PIPE_FORMAT_A8_UNORM, A8_UNORM}, - {PIPE_FORMAT_A16_UNORM, A16_UNORM}, - {PIPE_FORMAT_A16_FLOAT, A16_FLOAT}, - {PIPE_FORMAT_A32_FLOAT, A32_FLOAT}, - - /* odd sizes, bgr */ - {PIPE_FORMAT_B5G6R5_UNORM, B5G6R5_UNORM}, - {PIPE_FORMAT_B5G6R5_SRGB, B5G6R5_UNORM_SRGB}, - {PIPE_FORMAT_B5G5R5A1_UNORM, B5G5R5A1_UNORM}, - {PIPE_FORMAT_B5G5R5X1_UNORM, B5G5R5X1_UNORM}, - {PIPE_FORMAT_B4G4R4A4_UNORM, B4G4R4A4_UNORM}, - {PIPE_FORMAT_B8G8R8A8_UNORM, B8G8R8A8_UNORM}, - {PIPE_FORMAT_B8G8R8A8_SRGB, B8G8R8A8_UNORM_SRGB}, - {PIPE_FORMAT_B8G8R8X8_UNORM, B8G8R8X8_UNORM}, - {PIPE_FORMAT_B8G8R8X8_SRGB, B8G8R8X8_UNORM_SRGB}, - - /* rgb10a2 */ - {PIPE_FORMAT_R10G10B10A2_UNORM, R10G10B10A2_UNORM}, - {PIPE_FORMAT_R10G10B10A2_SNORM, R10G10B10A2_SNORM}, - {PIPE_FORMAT_R10G10B10A2_USCALED, R10G10B10A2_USCALED}, - {PIPE_FORMAT_R10G10B10A2_SSCALED, R10G10B10A2_SSCALED}, - {PIPE_FORMAT_R10G10B10A2_UINT, R10G10B10A2_UINT}, - - /* rgb10x2 */ - {PIPE_FORMAT_R10G10B10X2_USCALED, R10G10B10X2_USCALED}, - - /* bgr10a2 */ - {PIPE_FORMAT_B10G10R10A2_UNORM, B10G10R10A2_UNORM}, - {PIPE_FORMAT_B10G10R10A2_SNORM, B10G10R10A2_SNORM}, - {PIPE_FORMAT_B10G10R10A2_USCALED, B10G10R10A2_USCALED}, - {PIPE_FORMAT_B10G10R10A2_SSCALED, B10G10R10A2_SSCALED}, - {PIPE_FORMAT_B10G10R10A2_UINT, B10G10R10A2_UINT}, - - /* bgr10x2 */ - {PIPE_FORMAT_B10G10R10X2_UNORM, B10G10R10X2_UNORM}, - - /* r11g11b10 */ - {PIPE_FORMAT_R11G11B10_FLOAT, R11G11B10_FLOAT}, - - /* 32 bits per component */ - {PIPE_FORMAT_R32_FLOAT, R32_FLOAT}, - {PIPE_FORMAT_R32G32_FLOAT, R32G32_FLOAT}, - {PIPE_FORMAT_R32G32B32_FLOAT, R32G32B32_FLOAT}, - {PIPE_FORMAT_R32G32B32A32_FLOAT, R32G32B32A32_FLOAT}, - {PIPE_FORMAT_R32G32B32X32_FLOAT, R32G32B32X32_FLOAT}, - - {PIPE_FORMAT_R32_USCALED, R32_USCALED}, - {PIPE_FORMAT_R32G32_USCALED, R32G32_USCALED}, - {PIPE_FORMAT_R32G32B32_USCALED, R32G32B32_USCALED}, - {PIPE_FORMAT_R32G32B32A32_USCALED, 
R32G32B32A32_USCALED}, - - {PIPE_FORMAT_R32_SSCALED, R32_SSCALED}, - {PIPE_FORMAT_R32G32_SSCALED, R32G32_SSCALED}, - {PIPE_FORMAT_R32G32B32_SSCALED, R32G32B32_SSCALED}, - {PIPE_FORMAT_R32G32B32A32_SSCALED, R32G32B32A32_SSCALED}, - - {PIPE_FORMAT_R32_UINT, R32_UINT}, - {PIPE_FORMAT_R32G32_UINT, R32G32_UINT}, - {PIPE_FORMAT_R32G32B32_UINT, R32G32B32_UINT}, - {PIPE_FORMAT_R32G32B32A32_UINT, R32G32B32A32_UINT}, - - {PIPE_FORMAT_R32_SINT, R32_SINT}, - {PIPE_FORMAT_R32G32_SINT, R32G32_SINT}, - {PIPE_FORMAT_R32G32B32_SINT, R32G32B32_SINT}, - {PIPE_FORMAT_R32G32B32A32_SINT, R32G32B32A32_SINT}, - - /* 16 bits per component */ - {PIPE_FORMAT_R16_UNORM, R16_UNORM}, - {PIPE_FORMAT_R16G16_UNORM, R16G16_UNORM}, - {PIPE_FORMAT_R16G16B16_UNORM, R16G16B16_UNORM}, - {PIPE_FORMAT_R16G16B16A16_UNORM, R16G16B16A16_UNORM}, - {PIPE_FORMAT_R16G16B16X16_UNORM, R16G16B16X16_UNORM}, - - {PIPE_FORMAT_R16_USCALED, R16_USCALED}, - {PIPE_FORMAT_R16G16_USCALED, R16G16_USCALED}, - {PIPE_FORMAT_R16G16B16_USCALED, R16G16B16_USCALED}, - {PIPE_FORMAT_R16G16B16A16_USCALED, R16G16B16A16_USCALED}, - - {PIPE_FORMAT_R16_SNORM, R16_SNORM}, - {PIPE_FORMAT_R16G16_SNORM, R16G16_SNORM}, - {PIPE_FORMAT_R16G16B16_SNORM, R16G16B16_SNORM}, - {PIPE_FORMAT_R16G16B16A16_SNORM, R16G16B16A16_SNORM}, - - {PIPE_FORMAT_R16_SSCALED, R16_SSCALED}, - {PIPE_FORMAT_R16G16_SSCALED, R16G16_SSCALED}, - {PIPE_FORMAT_R16G16B16_SSCALED, R16G16B16_SSCALED}, - {PIPE_FORMAT_R16G16B16A16_SSCALED, R16G16B16A16_SSCALED}, - - {PIPE_FORMAT_R16_UINT, R16_UINT}, - {PIPE_FORMAT_R16G16_UINT, R16G16_UINT}, - {PIPE_FORMAT_R16G16B16_UINT, R16G16B16_UINT}, - {PIPE_FORMAT_R16G16B16A16_UINT, R16G16B16A16_UINT}, - - {PIPE_FORMAT_R16_SINT, R16_SINT}, - {PIPE_FORMAT_R16G16_SINT, R16G16_SINT}, - {PIPE_FORMAT_R16G16B16_SINT, R16G16B16_SINT}, - {PIPE_FORMAT_R16G16B16A16_SINT, R16G16B16A16_SINT}, - - {PIPE_FORMAT_R16_FLOAT, R16_FLOAT}, - {PIPE_FORMAT_R16G16_FLOAT, R16G16_FLOAT}, - {PIPE_FORMAT_R16G16B16_FLOAT, R16G16B16_FLOAT}, - 
{PIPE_FORMAT_R16G16B16A16_FLOAT, R16G16B16A16_FLOAT}, - {PIPE_FORMAT_R16G16B16X16_FLOAT, R16G16B16X16_FLOAT}, - - /* 8 bits per component */ - {PIPE_FORMAT_R8_UNORM, R8_UNORM}, - {PIPE_FORMAT_R8G8_UNORM, R8G8_UNORM}, - {PIPE_FORMAT_R8G8B8_UNORM, R8G8B8_UNORM}, - {PIPE_FORMAT_R8G8B8_SRGB, R8G8B8_UNORM_SRGB}, - {PIPE_FORMAT_R8G8B8A8_UNORM, R8G8B8A8_UNORM}, - {PIPE_FORMAT_R8G8B8A8_SRGB, R8G8B8A8_UNORM_SRGB}, - {PIPE_FORMAT_R8G8B8X8_UNORM, R8G8B8X8_UNORM}, - {PIPE_FORMAT_R8G8B8X8_SRGB, R8G8B8X8_UNORM_SRGB}, - - {PIPE_FORMAT_R8_USCALED, R8_USCALED}, - {PIPE_FORMAT_R8G8_USCALED, R8G8_USCALED}, - {PIPE_FORMAT_R8G8B8_USCALED, R8G8B8_USCALED}, - {PIPE_FORMAT_R8G8B8A8_USCALED, R8G8B8A8_USCALED}, - - {PIPE_FORMAT_R8_SNORM, R8_SNORM}, - {PIPE_FORMAT_R8G8_SNORM, R8G8_SNORM}, - {PIPE_FORMAT_R8G8B8_SNORM, R8G8B8_SNORM}, - {PIPE_FORMAT_R8G8B8A8_SNORM, R8G8B8A8_SNORM}, - - {PIPE_FORMAT_R8_SSCALED, R8_SSCALED}, - {PIPE_FORMAT_R8G8_SSCALED, R8G8_SSCALED}, - {PIPE_FORMAT_R8G8B8_SSCALED, R8G8B8_SSCALED}, - {PIPE_FORMAT_R8G8B8A8_SSCALED, R8G8B8A8_SSCALED}, - - {PIPE_FORMAT_R8_UINT, R8_UINT}, - {PIPE_FORMAT_R8G8_UINT, R8G8_UINT}, - {PIPE_FORMAT_R8G8B8_UINT, R8G8B8_UINT}, - {PIPE_FORMAT_R8G8B8A8_UINT, R8G8B8A8_UINT}, - - {PIPE_FORMAT_R8_SINT, R8_SINT}, - {PIPE_FORMAT_R8G8_SINT, R8G8_SINT}, - {PIPE_FORMAT_R8G8B8_SINT, R8G8B8_SINT}, - {PIPE_FORMAT_R8G8B8A8_SINT, R8G8B8A8_SINT}, - - /* These formats are valid for vertex data, but should not be used - * for render targets. - */ - - {PIPE_FORMAT_R32_FIXED, R32_SFIXED}, - {PIPE_FORMAT_R32G32_FIXED, R32G32_SFIXED}, - {PIPE_FORMAT_R32G32B32_FIXED, R32G32B32_SFIXED}, - {PIPE_FORMAT_R32G32B32A32_FIXED, R32G32B32A32_SFIXED}, - - {PIPE_FORMAT_R64_FLOAT, R64_FLOAT}, - {PIPE_FORMAT_R64G64_FLOAT, R64G64_FLOAT}, - {PIPE_FORMAT_R64G64B64_FLOAT, R64G64B64_FLOAT}, - {PIPE_FORMAT_R64G64B64A64_FLOAT, R64G64B64A64_FLOAT}, - - /* These formats have entries in SWR but don't have Load/StoreTile - * implementations. 
That means these aren't renderable, and thus having - * a mapping entry here is detrimental. - */ - /* - - {PIPE_FORMAT_L8_UNORM, L8_UNORM}, - {PIPE_FORMAT_I8_UNORM, I8_UNORM}, - {PIPE_FORMAT_L8A8_UNORM, L8A8_UNORM}, - {PIPE_FORMAT_L16_UNORM, L16_UNORM}, - {PIPE_FORMAT_UYVY, YCRCB_SWAPUVY}, - - {PIPE_FORMAT_L8_SRGB, L8_UNORM_SRGB}, - {PIPE_FORMAT_L8A8_SRGB, L8A8_UNORM_SRGB}, - - {PIPE_FORMAT_DXT1_RGBA, BC1_UNORM}, - {PIPE_FORMAT_DXT3_RGBA, BC2_UNORM}, - {PIPE_FORMAT_DXT5_RGBA, BC3_UNORM}, - - {PIPE_FORMAT_DXT1_SRGBA, BC1_UNORM_SRGB}, - {PIPE_FORMAT_DXT3_SRGBA, BC2_UNORM_SRGB}, - {PIPE_FORMAT_DXT5_SRGBA, BC3_UNORM_SRGB}, - - {PIPE_FORMAT_RGTC1_UNORM, BC4_UNORM}, - {PIPE_FORMAT_RGTC1_SNORM, BC4_SNORM}, - {PIPE_FORMAT_RGTC2_UNORM, BC5_UNORM}, - {PIPE_FORMAT_RGTC2_SNORM, BC5_SNORM}, - - {PIPE_FORMAT_L16A16_UNORM, L16A16_UNORM}, - {PIPE_FORMAT_I16_UNORM, I16_UNORM}, - {PIPE_FORMAT_L16_FLOAT, L16_FLOAT}, - {PIPE_FORMAT_L16A16_FLOAT, L16A16_FLOAT}, - {PIPE_FORMAT_I16_FLOAT, I16_FLOAT}, - {PIPE_FORMAT_L32_FLOAT, L32_FLOAT}, - {PIPE_FORMAT_L32A32_FLOAT, L32A32_FLOAT}, - {PIPE_FORMAT_I32_FLOAT, I32_FLOAT}, - - {PIPE_FORMAT_I8_UINT, I8_UINT}, - {PIPE_FORMAT_L8_UINT, L8_UINT}, - {PIPE_FORMAT_L8A8_UINT, L8A8_UINT}, - - {PIPE_FORMAT_I8_SINT, I8_SINT}, - {PIPE_FORMAT_L8_SINT, L8_SINT}, - {PIPE_FORMAT_L8A8_SINT, L8A8_SINT}, - - */ - }; - - auto it = mesa2swr.find(format); - if (it == mesa2swr.end()) - return (SWR_FORMAT)-1; - else - return it->second; -} - -static bool -swr_displaytarget_layout(struct swr_screen *screen, struct swr_resource *res) -{ - struct sw_winsys *winsys = screen->winsys; - struct sw_displaytarget *dt; - - const unsigned width = align(res->swr.width, res->swr.halign); - const unsigned height = align(res->swr.height, res->swr.valign); - - UINT stride; - dt = winsys->displaytarget_create(winsys, - res->base.bind, - res->base.format, - width, height, - 64, NULL, - &stride); - - if (dt == NULL) - return false; - - void *map = winsys->displaytarget_map(winsys, dt, 
0); - - res->display_target = dt; - res->swr.xpBaseAddress = (gfxptr_t)map; - - /* Clear the display target surface */ - if (map) - memset(map, 0, height * stride); - - winsys->displaytarget_unmap(winsys, dt); - - return true; -} - -static bool -swr_texture_layout(struct swr_screen *screen, - struct swr_resource *res, - bool allocate) -{ - struct pipe_resource *pt = &res->base; - - pipe_format fmt = pt->format; - const struct util_format_description *desc = util_format_description(fmt); - - res->has_depth = util_format_has_depth(desc); - res->has_stencil = util_format_has_stencil(desc); - - if (res->has_stencil && !res->has_depth) - fmt = PIPE_FORMAT_R8_UINT; - - /* We always use the SWR layout. For 2D and 3D textures this looks like: - * - * |<------- pitch ------->| - * +=======================+------- - * |Array 0 | ^ - * | | | - * | Level 0 | | - * | | | - * | | qpitch - * +-----------+-----------+ | - * | | L2L2L2L2 | | - * | Level 1 | L3L3 | | - * | | L4 | v - * +===========+===========+------- - * |Array 1 | - * | | - * | Level 0 | - * | | - * | | - * +-----------+-----------+ - * | | L2L2L2L2 | - * | Level 1 | L3L3 | - * | | L4 | - * +===========+===========+ - * - * The overall width in bytes is known as the pitch, while the overall - * height in rows is the qpitch. Array slices are laid out logically below - * one another, qpitch rows apart. For 3D surfaces, the "level" values are - * just invalid for the higher array numbers (since depth is also - * minified). 1D and 1D array surfaces are stored effectively the same way, - * except that pitch never plays into it. All the levels are logically - * adjacent to each other on the X axis. The qpitch becomes the number of - * elements between array slices, while the pitch is unused. - * - * Each level's sizes are subject to the valign and halign settings of the - * surface. For compressed formats that swr is unaware of, we will use an - * appropriately-sized uncompressed format, and scale the widths/heights. 
- * - * This surface is stored inside res->swr. For depth/stencil textures, - * res->secondary will have an identically-laid-out but R8_UINT-formatted - * stencil tree. In the Z32F_S8 case, the primary surface still has 64-bpp - * texels, to simplify map/unmap logic which copies the stencil values - * in/out. - */ - - res->swr.width = pt->width0; - res->swr.height = pt->height0; - res->swr.type = swr_convert_target_type(pt->target); - res->swr.tileMode = SWR_TILE_NONE; - res->swr.format = mesa_to_swr_format(fmt); - res->swr.numSamples = std::max(1u, pt->nr_samples); - - if (pt->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL)) { - res->swr.halign = KNOB_MACROTILE_X_DIM; - res->swr.valign = KNOB_MACROTILE_Y_DIM; - - /* If SWR_MSAA_FORCE_ENABLE is set, turn on MSAA and override requested - * surface sample count. */ - if (screen->msaa_force_enable) { - res->swr.numSamples = screen->msaa_max_count; - swr_print_info("swr_texture_layout: forcing sample count: %d\n", - res->swr.numSamples); - } - } else { - res->swr.halign = 1; - res->swr.valign = 1; - } - - unsigned halign = res->swr.halign * util_format_get_blockwidth(fmt); - unsigned width = align(pt->width0, halign); - if (pt->target == PIPE_TEXTURE_1D || pt->target == PIPE_TEXTURE_1D_ARRAY) { - for (int level = 1; level <= pt->last_level; level++) - width += align(u_minify(pt->width0, level), halign); - res->swr.pitch = util_format_get_blocksize(fmt); - res->swr.qpitch = util_format_get_nblocksx(fmt, width); - } else { - // The pitch is the overall width of the texture in bytes. Most of the - // time this is the pitch of level 0 since all the other levels fit - // underneath it. However in some degenerate situations, the width of - // level1 + level2 may be larger. In that case, we use those - // widths. This can happen if, e.g. halign is 32, and the width of level - // 0 is 32 or less. In that case, the aligned levels 1 and 2 will also - // be 32 each, adding up to 64. 
- unsigned valign = res->swr.valign * util_format_get_blockheight(fmt); - if (pt->last_level > 1) { - width = std::max<uint32_t>( - width, - align(u_minify(pt->width0, 1), halign) + - align(u_minify(pt->width0, 2), halign)); - } - res->swr.pitch = util_format_get_stride(fmt, width); - - // The qpitch is controlled by either the height of the second LOD, or - // the combination of all the later LODs. - unsigned height = align(pt->height0, valign); - if (pt->last_level == 1) { - height += align(u_minify(pt->height0, 1), valign); - } else if (pt->last_level > 1) { - unsigned level1 = align(u_minify(pt->height0, 1), valign); - unsigned level2 = 0; - for (int level = 2; level <= pt->last_level; level++) { - level2 += align(u_minify(pt->height0, level), valign); - } - height += std::max(level1, level2); - } - res->swr.qpitch = util_format_get_nblocksy(fmt, height); - } - - if (pt->target == PIPE_TEXTURE_3D) - res->swr.depth = pt->depth0; - else - res->swr.depth = pt->array_size; - - // Fix up swr format if necessary so that LOD offset computation works - if (res->swr.format == (SWR_FORMAT)-1) { - switch (util_format_get_blocksize(fmt)) { - default: - unreachable("Unexpected format block size"); - case 1: res->swr.format = R8_UINT; break; - case 2: res->swr.format = R16_UINT; break; - case 4: res->swr.format = R32_UINT; break; - case 8: - if (util_format_is_compressed(fmt)) - res->swr.format = BC4_UNORM; - else - res->swr.format = R32G32_UINT; - break; - case 16: - if (util_format_is_compressed(fmt)) - res->swr.format = BC5_UNORM; - else - res->swr.format = R32G32B32A32_UINT; - break; - } - } - - for (int level = 0; level <= pt->last_level; level++) { - res->mip_offsets[level] = - ComputeSurfaceOffset<false>(0, 0, 0, 0, 0, level, &res->swr); - } - - size_t total_size = (uint64_t)res->swr.depth * res->swr.qpitch * - res->swr.pitch * res->swr.numSamples; - - // Let non-sampled textures (e.g. 
buffer objects) bypass the size limit - if (swr_resource_is_texture(&res->base) && total_size > SWR_MAX_TEXTURE_SIZE) - return false; - - if (allocate) { - res->swr.xpBaseAddress = (gfxptr_t)AlignedMalloc(total_size, 64); - if (!res->swr.xpBaseAddress) - return false; - - if (res->has_depth && res->has_stencil) { - res->secondary = res->swr; - res->secondary.format = R8_UINT; - res->secondary.pitch = res->swr.pitch / util_format_get_blocksize(fmt); - - for (int level = 0; level <= pt->last_level; level++) { - res->secondary_mip_offsets[level] = - ComputeSurfaceOffset<false>(0, 0, 0, 0, 0, level, &res->secondary); - } - - total_size = res->secondary.depth * res->secondary.qpitch * - res->secondary.pitch * res->secondary.numSamples; - - res->secondary.xpBaseAddress = (gfxptr_t) AlignedMalloc(total_size, 64); - if (!res->secondary.xpBaseAddress) { - AlignedFree((void *)res->swr.xpBaseAddress); - return false; - } - } - } - - return true; -} - -static bool -swr_can_create_resource(struct pipe_screen *screen, - const struct pipe_resource *templat) -{ - struct swr_resource res; - memset(&res, 0, sizeof(res)); - res.base = *templat; - return swr_texture_layout(swr_screen(screen), &res, false); -} - -/* Helper function that conditionally creates a single-sample resolve resource - * and attaches it to main multisample resource. */ -static bool -swr_create_resolve_resource(struct pipe_screen *_screen, - struct swr_resource *msaa_res) -{ - struct swr_screen *screen = swr_screen(_screen); - - /* If resource is multisample, create a single-sample resolve resource */ - if (msaa_res->base.nr_samples > 1 || (screen->msaa_force_enable && - !(msaa_res->base.flags & SWR_RESOURCE_FLAG_ALT_SURFACE))) { - - /* Create a single-sample copy of the resource. 
Copy the original - * resource parameters and set flag to prevent recursion when re-calling - * resource_create */ - struct pipe_resource alt_template = msaa_res->base; - alt_template.nr_samples = 0; - alt_template.flags |= SWR_RESOURCE_FLAG_ALT_SURFACE; - - /* Note: Display_target is a special single-sample resource, only the - * display_target has been created already. */ - if (msaa_res->base.bind & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT - | PIPE_BIND_SHARED)) { - /* Allocate the multisample buffers. */ - if (!swr_texture_layout(screen, msaa_res, true)) - return false; - - /* Alt resource will only be bound as PIPE_BIND_RENDER_TARGET - * remove the DISPLAY_TARGET, SCANOUT, and SHARED bindings */ - alt_template.bind = PIPE_BIND_RENDER_TARGET; - } - - /* Allocate single-sample resolve surface */ - struct pipe_resource *alt; - alt = _screen->resource_create(_screen, &alt_template); - if (!alt) - return false; - - /* Attach it to the multisample resource */ - msaa_res->resolve_target = alt; - - /* Hang resolve surface state off the multisample surface state to so - * StoreTiles knows where to resolve the surface. 
*/ - msaa_res->swr.xpAuxBaseAddress = (gfxptr_t)&swr_resource(alt)->swr; - } - - return true; /* success */ -} - -static struct pipe_resource * -swr_resource_create(struct pipe_screen *_screen, - const struct pipe_resource *templat) -{ - struct swr_screen *screen = swr_screen(_screen); - struct swr_resource *res = CALLOC_STRUCT(swr_resource); - if (!res) - return NULL; - - res->base = *templat; - pipe_reference_init(&res->base.reference, 1); - res->base.screen = &screen->base; - - if (swr_resource_is_texture(&res->base)) { - if (res->base.bind & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT - | PIPE_BIND_SHARED)) { - /* displayable surface - * first call swr_texture_layout without allocating to finish - * filling out the SWR_SURFACE_STATE in res */ - swr_texture_layout(screen, res, false); - if (!swr_displaytarget_layout(screen, res)) - goto fail; - } else { - /* texture map */ - if (!swr_texture_layout(screen, res, true)) - goto fail; - } - - /* If resource was multisample, create resolve resource and attach - * it to multisample resource. */ - if (!swr_create_resolve_resource(_screen, res)) - goto fail; - - } else { - /* other data (vertex buffer, const buffer, etc) */ - assert(util_format_get_blocksize(templat->format) == 1); - assert(templat->height0 == 1); - assert(templat->depth0 == 1); - assert(templat->last_level == 0); - - /* Easiest to just call swr_texture_layout, as it sets up - * SWR_SURFACE_STATE in res */ - if (!swr_texture_layout(screen, res, true)) - goto fail; - } - - return &res->base; - -fail: - FREE(res); - return NULL; -} - -static void -swr_resource_destroy(struct pipe_screen *p_screen, struct pipe_resource *pt) -{ - struct swr_screen *screen = swr_screen(p_screen); - struct swr_resource *spr = swr_resource(pt); - - if (spr->display_target) { - /* If resource is display target, winsys manages the buffer and will - * free it on displaytarget_destroy. 
*/ - swr_fence_finish(p_screen, NULL, screen->flush_fence, 0); - - struct sw_winsys *winsys = screen->winsys; - winsys->displaytarget_destroy(winsys, spr->display_target); - - if (spr->swr.numSamples > 1) { - /* Free an attached resolve resource */ - struct swr_resource *alt = swr_resource(spr->resolve_target); - swr_fence_work_free(screen->flush_fence, (void*)(alt->swr.xpBaseAddress), true); - - /* Free multisample buffer */ - swr_fence_work_free(screen->flush_fence, (void*)(spr->swr.xpBaseAddress), true); - } - } else { - /* For regular resources, defer deletion */ - swr_resource_unused(pt); - - if (spr->swr.numSamples > 1) { - /* Free an attached resolve resource */ - struct swr_resource *alt = swr_resource(spr->resolve_target); - swr_fence_work_free(screen->flush_fence, (void*)(alt->swr.xpBaseAddress), true); - } - - swr_fence_work_free(screen->flush_fence, (void*)(spr->swr.xpBaseAddress), true); - swr_fence_work_free(screen->flush_fence, - (void*)(spr->secondary.xpBaseAddress), true); - - /* If work queue grows too large, submit a fence to force queue to - * drain. 
This is mainly to decrease the amount of memory used by the - * piglit streaming-texture-leak test */ - if (screen->pipe && swr_fence(screen->flush_fence)->work.count > 64) - swr_fence_submit(swr_context(screen->pipe), screen->flush_fence); - } - - FREE(spr); -} - - -static void -swr_flush_frontbuffer(struct pipe_screen *p_screen, - struct pipe_context *pipe, - struct pipe_resource *resource, - unsigned level, - unsigned layer, - void *context_private, - struct pipe_box *sub_box) -{ - struct swr_screen *screen = swr_screen(p_screen); - struct sw_winsys *winsys = screen->winsys; - struct swr_resource *spr = swr_resource(resource); - struct swr_context *ctx = swr_context(pipe); - - if (pipe) { - swr_fence_finish(p_screen, NULL, screen->flush_fence, 0); - swr_resource_unused(resource); - ctx->api.pfnSwrEndFrame(ctx->swrContext); - } - - /* Multisample resolved into resolve_target at flush with store_resource */ - if (pipe && spr->swr.numSamples > 1) { - struct pipe_resource *resolve_target = spr->resolve_target; - - /* Once resolved, copy into display target */ - SWR_SURFACE_STATE *resolve = &swr_resource(resolve_target)->swr; - - void *map = winsys->displaytarget_map(winsys, spr->display_target, - PIPE_MAP_WRITE); - memcpy(map, (void*)(resolve->xpBaseAddress), resolve->pitch * resolve->height); - winsys->displaytarget_unmap(winsys, spr->display_target); - } - - debug_assert(spr->display_target); - if (spr->display_target) - winsys->displaytarget_display( - winsys, spr->display_target, context_private, sub_box); -} - - -void -swr_destroy_screen_internal(struct swr_screen **screen) -{ - struct pipe_screen *p_screen = &(*screen)->base; - - swr_fence_finish(p_screen, NULL, (*screen)->flush_fence, 0); - swr_fence_reference(p_screen, &(*screen)->flush_fence, NULL); - - JitDestroyContext((*screen)->hJitMgr); - - if ((*screen)->pLibrary) - util_dl_close((*screen)->pLibrary); - - FREE(*screen); - *screen = NULL; -} - - -static void -swr_destroy_screen(struct pipe_screen 
*p_screen) -{ - struct swr_screen *screen = swr_screen(p_screen); - struct sw_winsys *winsys = screen->winsys; - - swr_print_info("SWR destroy screen!\n"); - - if (winsys->destroy) - winsys->destroy(winsys); - - swr_destroy_screen_internal(&screen); -} - - -static void -swr_validate_env_options(struct swr_screen *screen) -{ - /* The client_copy_limit sets a maximum on the amount of user-buffer memory - * copied to scratch space on a draw. Past this, the draw will access - * user-buffer directly and then block. This is faster than queuing many - * large client draws. */ - screen->client_copy_limit = SWR_CLIENT_COPY_LIMIT; - int client_copy_limit = - debug_get_num_option("SWR_CLIENT_COPY_LIMIT", SWR_CLIENT_COPY_LIMIT); - if (client_copy_limit > 0) - screen->client_copy_limit = client_copy_limit; - - /* XXX msaa under development, disable by default for now */ - screen->msaa_max_count = 1; /* was SWR_MAX_NUM_MULTISAMPLES; */ - - /* validate env override values, within range and power of 2 */ - int msaa_max_count = debug_get_num_option("SWR_MSAA_MAX_COUNT", 1); - if (msaa_max_count != 1) { - if ((msaa_max_count < 1) || (msaa_max_count > SWR_MAX_NUM_MULTISAMPLES) - || !util_is_power_of_two_or_zero(msaa_max_count)) { - fprintf(stderr, "SWR_MSAA_MAX_COUNT invalid: %d\n", msaa_max_count); - fprintf(stderr, "must be power of 2 between 1 and %d" \ - " (or 1 to disable msaa)\n", - SWR_MAX_NUM_MULTISAMPLES); - fprintf(stderr, "(msaa disabled)\n"); - msaa_max_count = 1; - } - - swr_print_info("SWR_MSAA_MAX_COUNT: %d\n", msaa_max_count); - - screen->msaa_max_count = msaa_max_count; - } - - screen->msaa_force_enable = debug_get_bool_option( - "SWR_MSAA_FORCE_ENABLE", false); - if (screen->msaa_force_enable) - swr_print_info("SWR_MSAA_FORCE_ENABLE: true\n"); -} - - -struct pipe_screen * -swr_create_screen_internal(struct sw_winsys *winsys) -{ - struct swr_screen *screen = CALLOC_STRUCT(swr_screen); - - if (!screen) - return NULL; - - if (!lp_build_init()) { - FREE(screen); - 
return NULL; - } - - screen->winsys = winsys; - screen->base.get_name = swr_get_name; - screen->base.get_vendor = swr_get_vendor; - screen->base.is_format_supported = swr_is_format_supported; - screen->base.context_create = swr_create_context; - screen->base.can_create_resource = swr_can_create_resource; - - screen->base.destroy = swr_destroy_screen; - screen->base.get_param = swr_get_param; - screen->base.get_shader_param = swr_get_shader_param; - screen->base.get_paramf = swr_get_paramf; - - screen->base.resource_create = swr_resource_create; - screen->base.resource_destroy = swr_resource_destroy; - - screen->base.flush_frontbuffer = swr_flush_frontbuffer; - - // Pass in "" for architecture for run-time determination - screen->hJitMgr = JitCreateContext(KNOB_SIMD_WIDTH, "", "swr"); - - swr_fence_init(&screen->base); - - swr_validate_env_options(screen); - - return &screen->base; -} diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h deleted file mode 100644 index e66f5443357..00000000000 --- a/src/gallium/drivers/swr/swr_screen.h +++ /dev/null @@ -1,86 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ***************************************************************************/ - -#ifndef SWR_SCREEN_H -#define SWR_SCREEN_H - -#include "swr_resource.h" - -#include "pipe/p_screen.h" -#include "pipe/p_defines.h" -#include "util/u_dl.h" -#include "util/format/u_format.h" -#include "api.h" - -#include "memory/TilingFunctions.h" -#include "memory/InitMemory.h" -#include <stdio.h> -#include <stdarg.h> - -struct sw_winsys; - -struct swr_screen { - struct pipe_screen base; - struct pipe_context *pipe; - - struct pipe_fence_handle *flush_fence; - - struct sw_winsys *winsys; - - /* Configurable environment settings */ - bool msaa_force_enable; - uint8_t msaa_max_count; - uint32_t client_copy_limit; - - HANDLE hJitMgr; - - /* Dynamic backend implementations */ - util_dl_library *pLibrary; - PFNSwrGetInterface pfnSwrGetInterface; - PFNSwrGetTileInterface pfnSwrGetTileInterface; - - /* Do we run on Xeon Phi? */ - bool is_knl; -}; - -static INLINE struct swr_screen * -swr_screen(struct pipe_screen *pipe) -{ - return (struct swr_screen *)pipe; -} - -SWR_FORMAT -mesa_to_swr_format(enum pipe_format format); - -INLINE void swr_print_info(const char *format, ...) 
-{ - static bool print_info = debug_get_bool_option("SWR_PRINT_INFO", false); - if(print_info) { - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - } -} - -#endif diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp deleted file mode 100644 index 315036920fb..00000000000 --- a/src/gallium/drivers/swr/swr_shader.cpp +++ /dev/null @@ -1,3040 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ***************************************************************************/ - -#include <llvm/Config/llvm-config.h> - -#if LLVM_VERSION_MAJOR < 7 -// llvm redefines DEBUG -#pragma push_macro("DEBUG") -#undef DEBUG -#endif - -#include "JitManager.h" -#include "llvm-c/Core.h" -#include "llvm/Support/CBindingWrapping.h" -#include "llvm/IR/LegacyPassManager.h" - -#if LLVM_VERSION_MAJOR < 7 -#pragma pop_macro("DEBUG") -#endif - -#include "state.h" -#include "gen_state_llvm.h" -#include "builder.h" -#include "functionpasses/passes.h" - -#include "tgsi/tgsi_strings.h" -#include "util/format/u_format.h" -#include "util/u_prim.h" -#include "gallivm/lp_bld_init.h" -#include "gallivm/lp_bld_flow.h" -#include "gallivm/lp_bld_struct.h" -#include "gallivm/lp_bld_tgsi.h" -#include "gallivm/lp_bld_const.h" -#include "gallivm/lp_bld_printf.h" -#include "gallivm/lp_bld_logic.h" - -#include "swr_context.h" -#include "gen_surf_state_llvm.h" -#include "gen_swr_context_llvm.h" -#include "swr_resource.h" -#include "swr_state.h" -#include "swr_screen.h" - - -///////////////////////////////////////////////////////////////////////// - -#include <stdio.h> -#include <inttypes.h> - -#include "util/u_debug.h" -#include "util/u_memory.h" -#include "util/u_string.h" - -#include "gallivm/lp_bld_type.h" - -#if defined(DEBUG) && defined(SWR_VERBOSE_SHADER) -constexpr bool verbose_shader = true; -constexpr bool verbose_tcs_shader_in = true; -constexpr bool verbose_tcs_shader_out = true; -constexpr bool verbose_tcs_shader_loop = true; -constexpr bool verbose_vs_shader = true; -#else -constexpr bool verbose_shader = false; -constexpr bool verbose_tcs_shader_in = false; -constexpr bool verbose_tcs_shader_out = false; -constexpr bool verbose_tcs_shader_loop = false; -constexpr bool verbose_vs_shader = false; -#endif - -using namespace SwrJit; - -static unsigned -locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info); - -bool operator==(const swr_jit_fs_key &lhs, const swr_jit_fs_key 
&rhs) -{ - return !memcmp(&lhs, &rhs, sizeof(lhs)); -} - -bool operator==(const swr_jit_vs_key &lhs, const swr_jit_vs_key &rhs) -{ - return !memcmp(&lhs, &rhs, sizeof(lhs)); -} - -bool operator==(const swr_jit_fetch_key &lhs, const swr_jit_fetch_key &rhs) -{ - return !memcmp(&lhs, &rhs, sizeof(lhs)); -} - -bool operator==(const swr_jit_gs_key &lhs, const swr_jit_gs_key &rhs) -{ - return !memcmp(&lhs, &rhs, sizeof(lhs)); -} - -bool operator==(const swr_jit_tcs_key &lhs, const swr_jit_tcs_key &rhs) -{ - return !memcmp(&lhs, &rhs, sizeof(lhs)); -} - -bool operator==(const swr_jit_tes_key &lhs, const swr_jit_tes_key &rhs) -{ - return !memcmp(&lhs, &rhs, sizeof(lhs)); -} - - -static void -swr_generate_sampler_key(const struct lp_tgsi_info &info, - struct swr_context *ctx, - enum pipe_shader_type shader_type, - struct swr_jit_sampler_key &key) -{ - key.nr_samplers = info.base.file_max[TGSI_FILE_SAMPLER] + 1; - - for (unsigned i = 0; i < key.nr_samplers; i++) { - if (info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { - lp_sampler_static_sampler_state( - &key.sampler[i].sampler_state, - ctx->samplers[shader_type][i]); - } - } - - /* - * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes - * are dx10-style? Can't really have mixed opcodes, at least not - * if we want to skip the holes here (without rescanning tgsi). 
- */ - if (info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) { - key.nr_sampler_views = - info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; - for (unsigned i = 0; i < key.nr_sampler_views; i++) { - if (info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1u << (i & 31))) { - const struct pipe_sampler_view *view = - ctx->sampler_views[shader_type][i]; - lp_sampler_static_texture_state( - &key.sampler[i].texture_state, view); - if (view) { - struct swr_resource *swr_res = swr_resource(view->texture); - const struct util_format_description *desc = - util_format_description(view->format); - if (swr_res->has_depth && swr_res->has_stencil && - !util_format_has_depth(desc)) - key.sampler[i].texture_state.format = PIPE_FORMAT_S8_UINT; - } - } - } - } else { - key.nr_sampler_views = key.nr_samplers; - for (unsigned i = 0; i < key.nr_sampler_views; i++) { - if (info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { - const struct pipe_sampler_view *view = - ctx->sampler_views[shader_type][i]; - lp_sampler_static_texture_state( - &key.sampler[i].texture_state, view); - if (view) { - struct swr_resource *swr_res = swr_resource(view->texture); - const struct util_format_description *desc = - util_format_description(view->format); - if (swr_res->has_depth && swr_res->has_stencil && - !util_format_has_depth(desc)) - key.sampler[i].texture_state.format = PIPE_FORMAT_S8_UINT; - } - } - } - } -} - -void -swr_generate_fs_key(struct swr_jit_fs_key &key, - struct swr_context *ctx, - swr_fragment_shader *swr_fs) -{ - memset((void*)&key, 0, sizeof(key)); - - key.nr_cbufs = ctx->framebuffer.nr_cbufs; - key.light_twoside = ctx->rasterizer->light_twoside; - key.sprite_coord_enable = ctx->rasterizer->sprite_coord_enable; - - struct tgsi_shader_info *pPrevShader; - if (ctx->gs) - pPrevShader = &ctx->gs->info.base; - else if (ctx->tes) - pPrevShader = &ctx->tes->info.base; - else - pPrevShader = &ctx->vs->info.base; - - memcpy(&key.vs_output_semantic_name, - &pPrevShader->output_semantic_name, - 
sizeof(key.vs_output_semantic_name)); - memcpy(&key.vs_output_semantic_idx, - &pPrevShader->output_semantic_index, - sizeof(key.vs_output_semantic_idx)); - - swr_generate_sampler_key(swr_fs->info, ctx, PIPE_SHADER_FRAGMENT, key); - - key.poly_stipple_enable = ctx->rasterizer->poly_stipple_enable && - ctx->poly_stipple.prim_is_poly; -} - -void -swr_generate_vs_key(struct swr_jit_vs_key &key, - struct swr_context *ctx, - swr_vertex_shader *swr_vs) -{ - memset((void*)&key, 0, sizeof(key)); - - key.clip_plane_mask = - swr_vs->info.base.clipdist_writemask ? - swr_vs->info.base.clipdist_writemask & ctx->rasterizer->clip_plane_enable : - ctx->rasterizer->clip_plane_enable; - - swr_generate_sampler_key(swr_vs->info, ctx, PIPE_SHADER_VERTEX, key); -} - -void -swr_generate_fetch_key(struct swr_jit_fetch_key &key, - struct swr_vertex_element_state *velems) -{ - memset((void*)&key, 0, sizeof(key)); - - key.fsState = velems->fsState; -} - -void -swr_generate_gs_key(struct swr_jit_gs_key &key, - struct swr_context *ctx, - swr_geometry_shader *swr_gs) -{ - memset((void*)&key, 0, sizeof(key)); - - struct tgsi_shader_info *pPrevShader = nullptr; - - if (ctx->tes) { - pPrevShader = &ctx->tes->info.base; - } else { - pPrevShader = &ctx->vs->info.base; - } - - memcpy(&key.vs_output_semantic_name, - &pPrevShader->output_semantic_name, - sizeof(key.vs_output_semantic_name)); - memcpy(&key.vs_output_semantic_idx, - &pPrevShader->output_semantic_index, - sizeof(key.vs_output_semantic_idx)); - - swr_generate_sampler_key(swr_gs->info, ctx, PIPE_SHADER_GEOMETRY, key); -} - -void -swr_generate_tcs_key(struct swr_jit_tcs_key &key, - struct swr_context *ctx, - swr_tess_control_shader *swr_tcs) -{ - memset((void*)&key, 0, sizeof(key)); - - struct tgsi_shader_info *pPrevShader = &ctx->vs->info.base; - - memcpy(&key.vs_output_semantic_name, - &pPrevShader->output_semantic_name, - sizeof(key.vs_output_semantic_name)); - memcpy(&key.vs_output_semantic_idx, - &pPrevShader->output_semantic_index, - 
sizeof(key.vs_output_semantic_idx)); - - key.clip_plane_mask = - swr_tcs->info.base.clipdist_writemask ? - swr_tcs->info.base.clipdist_writemask & ctx->rasterizer->clip_plane_enable : - ctx->rasterizer->clip_plane_enable; - - swr_generate_sampler_key(swr_tcs->info, ctx, PIPE_SHADER_TESS_CTRL, key); -} - -void -swr_generate_tes_key(struct swr_jit_tes_key &key, - struct swr_context *ctx, - swr_tess_evaluation_shader *swr_tes) -{ - memset((void*)&key, 0, sizeof(key)); - - struct tgsi_shader_info *pPrevShader = nullptr; - - if (ctx->tcs) { - pPrevShader = &ctx->tcs->info.base; - } - else { - pPrevShader = &ctx->vs->info.base; - } - - SWR_ASSERT(pPrevShader != nullptr, "TES: No TCS or VS defined"); - - memcpy(&key.prev_output_semantic_name, - &pPrevShader->output_semantic_name, - sizeof(key.prev_output_semantic_name)); - memcpy(&key.prev_output_semantic_idx, - &pPrevShader->output_semantic_index, - sizeof(key.prev_output_semantic_idx)); - - key.clip_plane_mask = - swr_tes->info.base.clipdist_writemask ? 
- swr_tes->info.base.clipdist_writemask & ctx->rasterizer->clip_plane_enable : - ctx->rasterizer->clip_plane_enable; - - swr_generate_sampler_key(swr_tes->info, ctx, PIPE_SHADER_TESS_EVAL, key); -} - -struct BuilderSWR : public Builder { - BuilderSWR(JitManager *pJitMgr, const char *pName) - : Builder(pJitMgr) - { - pJitMgr->SetupNewModule(); - gallivm = gallivm_create(pName, wrap(&JM()->mContext), NULL); - pJitMgr->mpCurrentModule = unwrap(gallivm->module); - } - - ~BuilderSWR() { - gallivm_free_ir(gallivm); - } - - void WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput, - unsigned slot, unsigned channel); - - struct gallivm_state *gallivm; - PFN_VERTEX_FUNC CompileVS(struct swr_context *ctx, swr_jit_vs_key &key); - PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_fs_key &key); - PFN_GS_FUNC CompileGS(struct swr_context *ctx, swr_jit_gs_key &key); - PFN_TCS_FUNC CompileTCS(struct swr_context *ctx, swr_jit_tcs_key &key); - PFN_TES_FUNC CompileTES(struct swr_context *ctx, swr_jit_tes_key &key); - - // GS-specific emit functions - LLVMValueRef - swr_gs_llvm_fetch_input(const struct lp_build_gs_iface *gs_iface, - struct lp_build_context * bld, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index); - void - swr_gs_llvm_emit_vertex(const struct lp_build_gs_iface *gs_base, - struct lp_build_context * bld, - LLVMValueRef (*outputs)[4], - LLVMValueRef emitted_vertices_vec, - LLVMValueRef stream_id); - - void - swr_gs_llvm_end_primitive(const struct lp_build_gs_iface *gs_base, - struct lp_build_context * bld, - LLVMValueRef total_emitted_vertices_vec_ptr, - LLVMValueRef verts_per_prim_vec, - LLVMValueRef emitted_prims_vec, - LLVMValueRef mask_vec); - - void - swr_gs_llvm_epilogue(const struct lp_build_gs_iface *gs_base, - LLVMValueRef total_emitted_vertices_vec, - LLVMValueRef emitted_prims_vec, unsigned stream); - - // TCS-specific emit functions - void 
swr_tcs_llvm_emit_prologue(struct lp_build_tgsi_soa_context* bld); - void swr_tcs_llvm_emit_epilogue(struct lp_build_tgsi_soa_context* bld); - - LLVMValueRef - swr_tcs_llvm_fetch_input(const struct lp_build_tcs_iface *tcs_iface, - struct lp_build_tgsi_context * bld_base, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index); - - LLVMValueRef - swr_tcs_llvm_fetch_output(const struct lp_build_tcs_iface *tcs_iface, - struct lp_build_tgsi_context * bld_base, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index, - uint32_t name); - - void - swr_tcs_llvm_store_output(const struct lp_build_tcs_iface *tcs_iface, - struct lp_build_tgsi_context * bld_base, - unsigned name, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index, - LLVMValueRef value, - LLVMValueRef mask_vec); - - // Barrier implementation (available only in TCS) - void - swr_tcs_llvm_emit_barrier(const struct lp_build_tcs_iface *tcs_iface, - struct lp_build_tgsi_context *bld_base); - - // TES-specific emit functions - LLVMValueRef - swr_tes_llvm_fetch_vtx_input(const struct lp_build_tes_iface *tes_iface, - struct lp_build_tgsi_context * bld_base, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index); - - LLVMValueRef - swr_tes_llvm_fetch_patch_input(const struct lp_build_tes_iface *tes_iface, - struct lp_build_tgsi_context * bld_base, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index); -}; - -struct swr_gs_llvm_iface { - struct lp_build_gs_iface base; - struct tgsi_shader_info *info; - - BuilderSWR *pBuilder; - - Value *pGsCtx; - SWR_GS_STATE *pGsState; - uint32_t num_outputs; - uint32_t 
num_verts_per_prim; - - Value *pVtxAttribMap; -}; - -struct swr_tcs_llvm_iface { - struct lp_build_tcs_iface base; - struct tgsi_shader_info *info; - - BuilderSWR *pBuilder; - - Value *pTcsCtx; - SWR_TS_STATE *pTsState; - - uint32_t output_vertices; - - LLVMValueRef loop_var; - - Value *pVtxAttribMap; - Value *pVtxOutputAttribMap; - Value *pPatchOutputAttribMap; -}; - -struct swr_tes_llvm_iface { - struct lp_build_tes_iface base; - struct tgsi_shader_info *info; - - BuilderSWR *pBuilder; - - Value *pTesCtx; - SWR_TS_STATE *pTsState; - - uint32_t num_outputs; - - Value *pVtxAttribMap; - Value *pPatchAttribMap; -}; - -// trampoline functions so we can use the builder llvm construction methods -static LLVMValueRef -swr_gs_llvm_fetch_input(const struct lp_build_gs_iface *gs_iface, - struct lp_build_context * bld, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index) -{ - swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_iface; - - return iface->pBuilder->swr_gs_llvm_fetch_input(gs_iface, bld, - is_vindex_indirect, - vertex_index, - is_aindex_indirect, - attrib_index, - swizzle_index); -} - -static void -swr_gs_llvm_emit_vertex(const struct lp_build_gs_iface *gs_base, - struct lp_build_context * bld, - LLVMValueRef (*outputs)[4], - LLVMValueRef emitted_vertices_vec, - LLVMValueRef mask_vec, - LLVMValueRef stream_id) -{ - swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; - - iface->pBuilder->swr_gs_llvm_emit_vertex(gs_base, bld, - outputs, - emitted_vertices_vec, - stream_id); -} - -static void -swr_gs_llvm_end_primitive(const struct lp_build_gs_iface *gs_base, - struct lp_build_context * bld, - LLVMValueRef total_emitted_vertices_vec_ptr, - LLVMValueRef verts_per_prim_vec, - LLVMValueRef emitted_prims_vec, - LLVMValueRef mask_vec, unsigned stream_id) -{ - swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; - - iface->pBuilder->swr_gs_llvm_end_primitive(gs_base, bld, - 
total_emitted_vertices_vec_ptr, - verts_per_prim_vec, - emitted_prims_vec, - mask_vec); -} - -static void -swr_gs_llvm_epilogue(const struct lp_build_gs_iface *gs_base, - LLVMValueRef total_emitted_vertices_vec, - LLVMValueRef emitted_prims_vec, unsigned stream) -{ - swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; - - iface->pBuilder->swr_gs_llvm_epilogue(gs_base, - total_emitted_vertices_vec, - emitted_prims_vec, stream); -} - -static LLVMValueRef -swr_tcs_llvm_fetch_input(const struct lp_build_tcs_iface *tcs_iface, - struct lp_build_context * bld, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - boolean is_sindex_indirect, - LLVMValueRef swizzle_index) -{ - swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; - struct lp_build_tgsi_context *bld_base = (struct lp_build_tgsi_context*)bld; - - return iface->pBuilder->swr_tcs_llvm_fetch_input(tcs_iface, bld_base, - is_vindex_indirect, - vertex_index, - is_aindex_indirect, - attrib_index, - swizzle_index); -} - -static LLVMValueRef -swr_tcs_llvm_fetch_output(const struct lp_build_tcs_iface *tcs_iface, - struct lp_build_context * bld, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - boolean is_sindex_indirect, - LLVMValueRef swizzle_index, - uint32_t name) -{ - swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; - struct lp_build_tgsi_context *bld_base = (struct lp_build_tgsi_context*)bld; - - return iface->pBuilder->swr_tcs_llvm_fetch_output(tcs_iface, bld_base, - is_vindex_indirect, - vertex_index, - is_aindex_indirect, - attrib_index, - swizzle_index, - name); -} - - -static void -swr_tcs_llvm_emit_prologue(struct lp_build_context* bld) -{ - lp_build_tgsi_soa_context* bld_base = (lp_build_tgsi_soa_context*)bld; - swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)bld_base->tcs_iface; - iface->pBuilder->swr_tcs_llvm_emit_prologue(bld_base); -} - -static 
void -swr_tcs_llvm_emit_epilogue(struct lp_build_context* bld) -{ - lp_build_tgsi_soa_context* bld_base = (lp_build_tgsi_soa_context*)bld; - swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)bld_base->tcs_iface; - iface->pBuilder->swr_tcs_llvm_emit_epilogue(bld_base); -} - -static -void swr_tcs_llvm_store_output(const struct lp_build_tcs_iface *tcs_iface, - struct lp_build_context * bld, - unsigned name, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - boolean is_sindex_indirect, - LLVMValueRef swizzle_index, - LLVMValueRef value, - LLVMValueRef mask_vec) -{ - swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; - struct lp_build_tgsi_context *bld_base = (struct lp_build_tgsi_context*)bld; - - iface->pBuilder->swr_tcs_llvm_store_output(tcs_iface, - bld_base, - name, - is_vindex_indirect, - vertex_index, - is_aindex_indirect, - attrib_index, - swizzle_index, - value, - mask_vec); -} - - -static -void swr_tcs_llvm_emit_barrier(struct lp_build_context *bld) -{ - lp_build_tgsi_soa_context* bld_base = (lp_build_tgsi_soa_context*)bld; - swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)bld_base->tcs_iface; - - iface->pBuilder->swr_tcs_llvm_emit_barrier(bld_base->tcs_iface, &bld_base->bld_base); -} - - -static LLVMValueRef -swr_tes_llvm_fetch_vtx_input(const struct lp_build_tes_iface *tes_iface, - struct lp_build_context * bld, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - boolean is_sindex_indirect, - LLVMValueRef swizzle_index) -{ - swr_tes_llvm_iface *iface = (swr_tes_llvm_iface*)tes_iface; - struct lp_build_tgsi_context *bld_base = (struct lp_build_tgsi_context*)bld; - - return iface->pBuilder->swr_tes_llvm_fetch_vtx_input(tes_iface, bld_base, - is_vindex_indirect, - vertex_index, - is_aindex_indirect, - attrib_index, - swizzle_index); -} - -static LLVMValueRef -swr_tes_llvm_fetch_patch_input(const struct 
lp_build_tes_iface *tes_iface, - struct lp_build_context * bld, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index) -{ - swr_tes_llvm_iface *iface = (swr_tes_llvm_iface*)tes_iface; - struct lp_build_tgsi_context *bld_base = (struct lp_build_tgsi_context*)bld; - - return iface->pBuilder->swr_tes_llvm_fetch_patch_input(tes_iface, bld_base, - is_aindex_indirect, - attrib_index, - swizzle_index); -} - -LLVMValueRef -BuilderSWR::swr_gs_llvm_fetch_input(const struct lp_build_gs_iface *gs_iface, - struct lp_build_context * bld, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index) -{ - swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_iface; - Value *vert_index = unwrap(vertex_index); - Value *attr_index = unwrap(attrib_index); - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - if (is_vindex_indirect || is_aindex_indirect) { - int i; - Value *res = unwrap(bld->zero); - struct lp_type type = bld->type; - - for (i = 0; i < type.length; i++) { - Value *vert_chan_index = vert_index; - Value *attr_chan_index = attr_index; - - if (is_vindex_indirect) { - vert_chan_index = VEXTRACT(vert_index, C(i)); - } - if (is_aindex_indirect) { - attr_chan_index = VEXTRACT(attr_index, C(i)); - } - - Value *attrib = - LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_chan_index})); - - Value *pVertex = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pVerts}); - Value *pInputVertStride = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_inputVertStride}); - - Value *pVector = ADD(MUL(vert_chan_index, pInputVertStride), attrib); - Value *pInput = LOAD(GEP(pVertex, {pVector, unwrap(swizzle_index)})); - - Value *value = VEXTRACT(pInput, C(i)); - res = VINSERT(res, value, C(i)); - } - - return wrap(res); - } else { - Value *attrib = LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_index})); - - Value *pVertex = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pVerts}); - Value 
*pInputVertStride = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_inputVertStride}); - - Value *pVector = ADD(MUL(vert_index, pInputVertStride), attrib); - - Value *pInput = LOAD(GEP(pVertex, {pVector, unwrap(swizzle_index)})); - - return wrap(pInput); - } -} - -// GS output stream layout -#define VERTEX_COUNT_SIZE 32 -#define CONTROL_HEADER_SIZE (8*32) - -void -BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_gs_iface *gs_base, - struct lp_build_context * bld, - LLVMValueRef (*outputs)[4], - LLVMValueRef emitted_vertices_vec, - LLVMValueRef stream_id) -{ - swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - const uint32_t headerSize = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE; - const uint32_t attribSize = 4 * sizeof(float); - const uint32_t vertSize = attribSize * SWR_VTX_NUM_SLOTS; - Value *pVertexOffset = MUL(unwrap(emitted_vertices_vec), VIMMED1(vertSize)); - - Value *vMask = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_mask}); - Value *vMask1 = TRUNC(vMask, getVectorType(mInt1Ty, mVWidth)); - - Value *pStack = STACKSAVE(); - Value *pTmpPtr = ALLOCA(mFP32Ty, C(4)); // used for dummy write for lane masking - - for (uint32_t attrib = 0; attrib < iface->num_outputs; ++attrib) { - uint32_t attribSlot = attrib; - uint32_t sgvChannel = 0; - if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) { - attribSlot = VERTEX_SGV_SLOT; - sgvChannel = VERTEX_SGV_POINT_SIZE_COMP; - } else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_LAYER) { - attribSlot = VERTEX_SGV_SLOT; - sgvChannel = VERTEX_SGV_RTAI_COMP; - } else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_VIEWPORT_INDEX) { - attribSlot = VERTEX_SGV_SLOT; - sgvChannel = VERTEX_SGV_VAI_COMP; - } else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_POSITION) { - attribSlot = VERTEX_POSITION_SLOT; - } else { - attribSlot = VERTEX_ATTRIB_START_SLOT + attrib; - if 
(iface->info->writes_position) { - attribSlot--; - } - } - - Value *pOutputOffset = ADD(pVertexOffset, VIMMED1(headerSize + attribSize * attribSlot)); // + sgvChannel ? - - for (uint32_t lane = 0; lane < mVWidth; ++lane) { - Value *pLaneOffset = VEXTRACT(pOutputOffset, C(lane)); - Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); - Value *pStreamOffset = GEP(pStream, pLaneOffset); - pStreamOffset = BITCAST(pStreamOffset, mFP32PtrTy); - - Value *pLaneMask = VEXTRACT(vMask1, C(lane)); - pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr); - - for (uint32_t channel = 0; channel < 4; ++channel) { - Value *vData; - - if (attribSlot == VERTEX_SGV_SLOT) - vData = LOAD(unwrap(outputs[attrib][0])); - else - vData = LOAD(unwrap(outputs[attrib][channel])); - - if (attribSlot != VERTEX_SGV_SLOT || - sgvChannel == channel) { - vData = VEXTRACT(vData, C(lane)); - STORE(vData, pStreamOffset); - } - pStreamOffset = GEP(pStreamOffset, C(1)); - } - } - } - - /* When the output type is not points, the geometry shader may not - * output data to multiple streams. So early exit here. - */ - if(iface->pGsState->outputTopology != TOP_POINT_LIST) { - STACKRESTORE(pStack); - return; - } - - // Info about stream id for each vertex - // is coded in 2 bits (4 vert per byte "box"): - // ----------------- ----------------- ---- - // |d|d|c|c|b|b|a|a| |h|h|g|g|f|f|e|e| |... - // ----------------- ----------------- ---- - - // Calculate where need to put stream id for current vert - // in 1 byte "box". - Value *pShiftControl = MUL(unwrap(emitted_vertices_vec), VIMMED1(2)); - - // Calculate in which box put stream id for current vert. 
- Value *pOffsetControl = LSHR(unwrap(emitted_vertices_vec), VIMMED1(2)); - - // Skip count header - Value *pStreamIdOffset = ADD(pOffsetControl, VIMMED1(VERTEX_COUNT_SIZE)); - - for (uint32_t lane = 0; lane < mVWidth; ++lane) { - Value *pShift = TRUNC(VEXTRACT(pShiftControl, C(lane)), mInt8Ty); - Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); - - Value *pStreamOffset = GEP(pStream, VEXTRACT(pStreamIdOffset, C(lane))); - - // Just make sure that not overflow max - stream id = (0,1,2,3) - Value *vVal = TRUNC(AND(VEXTRACT(unwrap(stream_id), C(0)), C(0x3)), mInt8Ty); - - // Shift it to correct position in byte "box" - vVal = SHL(vVal, pShift); - - // Info about other vertices can be already stored - // so we need to read and add bits from current vert info. - Value *storedValue = LOAD(pStreamOffset); - vVal = OR(storedValue, vVal); - STORE(vVal, pStreamOffset); - } - - STACKRESTORE(pStack); -} - -void -BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_gs_iface *gs_base, - struct lp_build_context * bld, - LLVMValueRef total_emitted_vertices_vec, - LLVMValueRef verts_per_prim_vec, - LLVMValueRef emitted_prims_vec, - LLVMValueRef mask_vec) -{ - swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; - - /* When the output type is points, the geometry shader may output data - * to multiple streams, and end_primitive has no effect. Info about - * stream id for vertices is stored into the same place in memory where - * end primitive info is stored so early exit in this case. 
- */ - if (iface->pGsState->outputTopology == TOP_POINT_LIST) { - return; - } - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask }); - Value *vMask1 = TRUNC(vMask, getVectorType(mInt1Ty, 8)); - - uint32_t vertsPerPrim = iface->num_verts_per_prim; - - Value *vCount = - ADD(MUL(unwrap(emitted_prims_vec), VIMMED1(vertsPerPrim)), - unwrap(verts_per_prim_vec)); - - vCount = unwrap(total_emitted_vertices_vec); - - Value *mask = unwrap(mask_vec); - Value *cmpMask = VMASK(ICMP_NE(unwrap(verts_per_prim_vec), VIMMED1(0))); - mask = AND(mask, cmpMask); - vMask1 = TRUNC(mask, getVectorType(mInt1Ty, 8)); - - vCount = SUB(vCount, VIMMED1(1)); - Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), VIMMED1(VERTEX_COUNT_SIZE)); - Value *vValue = SHL(VIMMED1(1), UREM(vCount, VIMMED1(8))); - - vValue = TRUNC(vValue, getVectorType(mInt8Ty, 8)); - - Value *pStack = STACKSAVE(); - Value *pTmpPtr = ALLOCA(mInt8Ty, C(4)); // used for dummy read/write for lane masking - - for (uint32_t lane = 0; lane < mVWidth; ++lane) { - Value *vLaneOffset = VEXTRACT(vOffset, C(lane)); - Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); - Value *pStreamOffset = GEP(pStream, vLaneOffset); - - Value *pLaneMask = VEXTRACT(vMask1, C(lane)); - pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr); - - Value *vVal = LOAD(pStreamOffset); - vVal = OR(vVal, VEXTRACT(vValue, C(lane))); - STORE(vVal, pStreamOffset); - } - - STACKRESTORE(pStack); -} - -void -BuilderSWR::swr_gs_llvm_epilogue(const struct lp_build_gs_iface *gs_base, - LLVMValueRef total_emitted_vertices_vec, - LLVMValueRef emitted_prims_vec, unsigned stream) -{ - swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - // Store emit count to each output stream in the first DWORD - for (uint32_t lane = 0; lane < mVWidth; ++lane) - { - Value* pStream = LOAD(iface->pGsCtx, 
{0, SWR_GS_CONTEXT_pStreams, lane}); - pStream = BITCAST(pStream, mInt32PtrTy); - Value* pLaneCount = VEXTRACT(unwrap(total_emitted_vertices_vec), C(lane)); - STORE(pLaneCount, pStream); - } -} - -void -BuilderSWR::swr_tcs_llvm_emit_prologue(struct lp_build_tgsi_soa_context* bld) -{ - swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)bld->tcs_iface; - - Value* loop_var = ALLOCA(mSimdInt32Ty); - STORE(VBROADCAST(C(0)), loop_var); - - iface->loop_var = wrap(loop_var); - - lp_exec_bgnloop(&bld->exec_mask, true); - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - bld->system_values.invocation_id = wrap((LOAD(unwrap(iface->loop_var)))); - - if (verbose_tcs_shader_loop) { - lp_build_print_value(gallivm, "Prologue LOOP Iteration BEGIN:", bld->system_values.invocation_id); - } - -} - -void -BuilderSWR::swr_tcs_llvm_emit_epilogue(struct lp_build_tgsi_soa_context* bld) -{ - swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)bld->tcs_iface; - - struct lp_build_context *uint_bld = &bld->bld_base.uint_bld; - - STORE(ADD(LOAD(unwrap(iface->loop_var)), VBROADCAST(C(1))), unwrap(iface->loop_var)); - if (verbose_tcs_shader_loop) { - lp_build_print_value(gallivm, "Epilogue LOOP: ", wrap(LOAD(unwrap(iface->loop_var)))); - } - - LLVMValueRef tmp = lp_build_cmp(uint_bld, PIPE_FUNC_GEQUAL, wrap(LOAD(unwrap(iface->loop_var))), - wrap(VBROADCAST(C(iface->output_vertices)))); - lp_exec_mask_cond_push(&bld->exec_mask, tmp); - lp_exec_break(&bld->exec_mask, &bld->bld_base.pc, false); - lp_exec_mask_cond_pop(&bld->exec_mask); - lp_exec_endloop(bld->bld_base.base.gallivm, &bld->exec_mask); -} - -LLVMValueRef -BuilderSWR::swr_tcs_llvm_fetch_input(const struct lp_build_tcs_iface *tcs_iface, - struct lp_build_tgsi_context * bld_base, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index) -{ - swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; - - Value *vert_index = 
unwrap(vertex_index); - Value *attr_index = unwrap(attrib_index); - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - if (verbose_tcs_shader_in) { - lp_build_printf(gallivm, "[TCS IN][VTX] ======================================\n"); - lp_build_print_value(gallivm, "[TCS IN][VTX] vertex_index: ", vertex_index); - lp_build_print_value(gallivm, "[TCS IN][VTX] attrib_index: ", attrib_index); - lp_build_printf(gallivm, "[TCS IN][VTX] --------------------------------------\n"); - } - - Value *res = unwrap(bld_base->base.zero); - if (is_vindex_indirect || is_aindex_indirect) { - int i; - struct lp_type type = bld_base->base.type; - - for (i = 0; i < type.length; i++) { - Value *vert_chan_index = vert_index; - Value *attr_chan_index = attr_index; - - if (is_vindex_indirect) { - vert_chan_index = VEXTRACT(vert_index, C(i)); - } - if (is_aindex_indirect) { - attr_chan_index = VEXTRACT(attr_index, C(i)); - } - - Value *attrib = - LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_chan_index})); - - Value *pBase = GEP(iface->pTcsCtx, - { C(0), C(SWR_HS_CONTEXT_vert), vert_chan_index, - C(simdvertex_attrib), attrib, unwrap(swizzle_index), C(i) }); - - Value *val = LOAD(pBase); - - if (verbose_tcs_shader_in) { - lp_build_print_value(gallivm, "[TCS IN][VTX] vert_chan_index: ", wrap(vert_chan_index)); - lp_build_print_value(gallivm, "[TCS IN][VTX] attrib_index: ", attrib_index); - lp_build_print_value(gallivm, "[TCS IN][VTX] attr_chan_index: ", wrap(attr_index)); - lp_build_print_value(gallivm, "[TCS IN][VTX] attrib read from map: ", wrap(attrib)); - lp_build_print_value(gallivm, "[TCS IN][VTX] swizzle_index: ", swizzle_index); - lp_build_print_value(gallivm, "[TCS IN][VTX] Loaded: ", wrap(val)); - } - res = VINSERT(res, val, C(i)); - } - } else { - Value *attrib = LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_index})); - - Value *pBase = GEP(iface->pTcsCtx, - { C(0), C(SWR_HS_CONTEXT_vert), vert_index, - C(simdvertex_attrib), attrib, unwrap(swizzle_index) }); - - 
res = LOAD(pBase); - - if (verbose_tcs_shader_in) { - lp_build_print_value(gallivm, "[TCS IN][VTX] attrib_index: ", attrib_index); - lp_build_print_value(gallivm, "[TCS IN][VTX] attr_chan_index: ", wrap(attr_index)); - lp_build_print_value(gallivm, "[TCS IN][VTX] attrib read from map: ", wrap(attrib)); - lp_build_print_value(gallivm, "[TCS IN][VTX] swizzle_index: ", swizzle_index); - lp_build_print_value(gallivm, "[TCS IN][VTX] Loaded: ", wrap(res)); - } - } - if (verbose_tcs_shader_in) { - lp_build_print_value(gallivm, "[TCS IN][VTX] returning: ", wrap(res)); - } - return wrap(res); -} - -LLVMValueRef -BuilderSWR::swr_tcs_llvm_fetch_output(const struct lp_build_tcs_iface *tcs_iface, - struct lp_build_tgsi_context * bld_base, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index, - uint32_t name) -{ - swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; - - Value *vert_index = unwrap(vertex_index); - Value *attr_index = unwrap(attrib_index); - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - if (verbose_tcs_shader_in) { - lp_build_print_value(gallivm, "[TCS INOUT] Vertex index: ", vertex_index); - lp_build_print_value(gallivm, "[TCS INOUT] Attrib index: ", wrap(attr_index)); - lp_build_print_value(gallivm, "[TCS INOUT] Swizzle index: ", swizzle_index); - } - - Value* res = unwrap(bld_base->base.zero); - - for (uint32_t lane = 0; lane < mVWidth; lane++) { - Value* p1 = LOAD(iface->pTcsCtx, {0, SWR_HS_CONTEXT_pCPout}); - Value* pCpOut = GEP(p1, {lane}); - - Value *vert_chan_index = vert_index; - Value *attr_chan_index = attr_index; - - if (is_vindex_indirect) { - vert_chan_index = VEXTRACT(vert_index, C(lane)); - if (verbose_tcs_shader_in) { - lp_build_print_value(gallivm, "[TCS INOUT] Extracted vertex index: ", wrap(vert_chan_index)); - } - } - - if (is_aindex_indirect) { - attr_chan_index = VEXTRACT(attr_index, C(lane)); - if 
(verbose_tcs_shader_in) { - lp_build_print_value(gallivm, "[TCS INOUT] Extracted attrib index: ", wrap(attr_chan_index)); - } - } - - if (name == TGSI_SEMANTIC_TESSOUTER || name == TGSI_SEMANTIC_TESSINNER) { - Value* tessFactors = GEP(pCpOut, {(uint32_t)0, ScalarPatch_tessFactors}); - Value* tessFactorArray = nullptr; - if (name == TGSI_SEMANTIC_TESSOUTER) { - tessFactorArray = GEP(tessFactors, {(uint32_t)0, SWR_TESSELLATION_FACTORS_OuterTessFactors}); - } else { - tessFactorArray = GEP(tessFactors, {(uint32_t)0, SWR_TESSELLATION_FACTORS_InnerTessFactors}); - } - Value* tessFactor = GEP(tessFactorArray, {C(0), unwrap(swizzle_index)}); - res = VINSERT(res, LOAD(tessFactor), C(lane)); - if (verbose_tcs_shader_in) { - lp_build_print_value(gallivm, "[TCS INOUT][FACTOR] lane (patch-id): ", wrap(C(lane))); - lp_build_print_value(gallivm, "[TCS INOUT][FACTOR] loaded value: ", wrap(res)); - } - } else if (name == TGSI_SEMANTIC_PATCH) { - Value* attr_index_from_map = LOAD(GEP(iface->pPatchOutputAttribMap, {C(0), attr_chan_index})); - Value* attr_value = GEP(pCpOut, {C(0), C(ScalarPatch_patchData), C(ScalarCPoint_attrib), attr_index_from_map, unwrap(swizzle_index)}); - res = VINSERT(res, LOAD(attr_value), C(lane)); - if (verbose_tcs_shader_in) { - lp_build_print_value(gallivm, "[TCS INOUT][PATCH] attr index loaded from map: ", wrap(attr_index_from_map)); - lp_build_print_value(gallivm, "[TCS INOUT][PATCH] lane (patch-id): ", wrap(C(lane))); - lp_build_print_value(gallivm, "[TCS INOUT][PATCH] loaded value: ", wrap(res)); - } - } else { - // Generic attribute - Value *attrib = - LOAD(GEP(iface->pVtxOutputAttribMap, {C(0), attr_chan_index})); - if (verbose_tcs_shader_in) { - lp_build_print_value(gallivm, "[TCS INOUT][VTX] Attrib index from map: ", wrap(attrib)); - } - Value* attr_chan = GEP(pCpOut, {C(0), C(ScalarPatch_cp), vert_chan_index, - C(ScalarCPoint_attrib), attrib, unwrap(swizzle_index)}); - - res = VINSERT(res, LOAD(attr_chan), C(lane)); - if (verbose_tcs_shader_in) { 
- lp_build_print_value(gallivm, "[TCS INOUT][VTX] loaded value: ", wrap(res)); - } - } - } - - return wrap(res); -} - -void -BuilderSWR::swr_tcs_llvm_store_output(const struct lp_build_tcs_iface *tcs_iface, - struct lp_build_tgsi_context *bld_base, - unsigned name, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index, - LLVMValueRef value, - LLVMValueRef mask_vec) -{ - swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; - struct lp_build_tgsi_soa_context* bld = (struct lp_build_tgsi_soa_context*)bld_base; - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - if (verbose_tcs_shader_out) { - lp_build_printf(gallivm, "[TCS OUT] =============================================\n"); - } - - if (verbose_tcs_shader_out) { - lp_build_print_value(gallivm, "[TCS OUT] Store mask: ", bld->exec_mask.exec_mask); - lp_build_print_value(gallivm, "[TCS OUT] Store value: ", value); - } - - Value *vert_index = unwrap(vertex_index); - Value *attr_index = unwrap(attrib_index); - - if (verbose_tcs_shader_out) { - lp_build_print_value(gallivm, "[TCS OUT] Vertex index: ", vertex_index); - lp_build_print_value(gallivm, "[TCS OUT] Attrib index: ", wrap(attr_index)); - lp_build_print_value(gallivm, "[TCS OUT] Swizzle index: ", swizzle_index); - } - - if (is_vindex_indirect) { - vert_index = VEXTRACT(vert_index, C(0)); - if (verbose_tcs_shader_out) { - lp_build_print_value(gallivm, "[TCS OUT] Extracted vertex index: ", vertex_index); - } - } - - if (is_aindex_indirect) { - attr_index = VEXTRACT(attr_index, C(0)); - if (verbose_tcs_shader_out) { - lp_build_print_value(gallivm, "[TCS OUT] Extracted attrib index: ", wrap(attr_index)); - } - } - - if (verbose_tcs_shader_out) { - if (bld->exec_mask.has_mask) { - lp_build_print_value(gallivm, "[TCS OUT] Exec mask: ", bld->exec_mask.exec_mask); - } - else { - lp_build_printf(gallivm, "[TCS OUT] has no mask\n"); - } - } - for 
(uint32_t lane = 0; lane < mVWidth; lane++) { - Value* p1 = LOAD(iface->pTcsCtx, {0, SWR_HS_CONTEXT_pCPout}); - Value* pCpOut = GEP(p1, {lane}); - - if (name == TGSI_SEMANTIC_TESSOUTER || name == TGSI_SEMANTIC_TESSINNER) { - Value* tessFactors = GEP(pCpOut, {(uint32_t)0, ScalarPatch_tessFactors}); - Value* tessFactorArray = nullptr; - if (name == TGSI_SEMANTIC_TESSOUTER) { - tessFactorArray = GEP(tessFactors, {(uint32_t)0, SWR_TESSELLATION_FACTORS_OuterTessFactors}); - } else { - tessFactorArray = GEP(tessFactors, {(uint32_t)0, SWR_TESSELLATION_FACTORS_InnerTessFactors}); - } - Value* tessFactor = GEP(tessFactorArray, {C(0), unwrap(swizzle_index)}); - Value* valueToStore = VEXTRACT(unwrap(value), C(lane)); - valueToStore = BITCAST(valueToStore, mFP32Ty); - if (mask_vec) { - Value *originalVal = LOAD(tessFactor); - Value *vMask = TRUNC(VEXTRACT(unwrap(mask_vec), C(lane)), mInt1Ty); - valueToStore = SELECT(vMask, valueToStore, originalVal); - } - STORE(valueToStore, tessFactor); - if (verbose_tcs_shader_out) - { - lp_build_print_value(gallivm, "[TCS OUT][FACTOR] Mask_vec mask: ", mask_vec); - lp_build_print_value(gallivm, "[TCS OUT][FACTOR] Stored value: ", wrap(valueToStore)); - } - } else if (name == TGSI_SEMANTIC_PATCH) { - Value* attrib = LOAD(GEP(iface->pPatchOutputAttribMap, {C(0), attr_index})); - if (verbose_tcs_shader_out) { - lp_build_print_value(gallivm, "[TCS OUT][PATCH] vert_index: ", wrap(vert_index)); - lp_build_print_value(gallivm, "[TCS OUT][PATCH] attr_index: ", wrap(attr_index)); - lp_build_print_value(gallivm, "[TCS OUT][PATCH] vert_index_indirect: ", wrap(C(is_vindex_indirect))); - lp_build_print_value(gallivm, "[TCS OUT][PATCH] attr_index_indirect: ", wrap(C(is_aindex_indirect))); - lp_build_print_value(gallivm, "[TCS OUT][PATCH] attr index loaded from map: ", wrap(attrib)); - } - Value* attr = GEP(pCpOut, {C(0), C(ScalarPatch_patchData), C(ScalarCPoint_attrib), attrib}); - Value* value_to_store = VEXTRACT(unwrap(value), C(lane)); - if 
(verbose_tcs_shader_out) { - lp_build_print_value(gallivm, "[TCS OUT][PATCH] lane (patch-id): ", wrap(C(lane))); - lp_build_print_value(gallivm, "[TCS OUT][PATCH] value to store: ", value); - lp_build_print_value(gallivm, "[TCS OUT][PATCH] per-patch value to store: ", wrap(value_to_store)); - lp_build_print_value(gallivm, "[TCS OUT][PATCH] chan_index: ", swizzle_index); - } - value_to_store = BITCAST(value_to_store, mFP32Ty); - if (mask_vec) { - Value *originalVal = LOADV(attr, {C(0), unwrap(swizzle_index)}); - Value *vMask = TRUNC(VEXTRACT(unwrap(mask_vec), C(lane)), mInt1Ty); - value_to_store = SELECT(vMask, value_to_store, originalVal); - if (verbose_tcs_shader_out) { - lp_build_print_value(gallivm, "[TCS OUT][PATCH] store mask: ", mask_vec); - lp_build_print_value(gallivm, "[TCS OUT][PATCH] loaded original value: ", wrap(originalVal)); - lp_build_print_value(gallivm, "[TCS OUT][PATCH] vMask: ", wrap(vMask)); - lp_build_print_value(gallivm, "[TCS OUT][PATCH] selected value to store: ", wrap(value_to_store)); - } - } - STOREV(value_to_store, attr, {C(0), unwrap(swizzle_index)}); - if (verbose_tcs_shader_out) { - lp_build_print_value(gallivm, "[TCS OUT][PATCH] stored value: ", wrap(value_to_store)); - } - } else { - Value* value_to_store = VEXTRACT(unwrap(value), C(lane)); - Value* attrib = LOAD(GEP(iface->pVtxOutputAttribMap, {C(0), attr_index})); - - if (verbose_tcs_shader_out) { - lp_build_printf(gallivm, "[TCS OUT] Writting attribute\n"); - lp_build_print_value(gallivm, "[TCS OUT][VTX] invocation_id: ", bld->system_values.invocation_id); - lp_build_print_value(gallivm, "[TCS OUT][VTX] attribIndex: ", wrap(attr_index)); - lp_build_print_value(gallivm, "[TCS OUT][VTX] attrib read from map: ", wrap(attrib)); - lp_build_print_value(gallivm, "[TCS OUT][VTX] chan_index: ", swizzle_index); - lp_build_print_value(gallivm, "[TCS OUT][VTX] value: ", value); - lp_build_print_value(gallivm, "[TCS OUT][VTX] value_to_store: ", wrap(value_to_store)); - } - - Value* attr_chan 
= GEP(pCpOut, {C(0), C(ScalarPatch_cp), - VEXTRACT(unwrap(bld->system_values.invocation_id), C(0)), - C(ScalarCPoint_attrib), attrib, unwrap(swizzle_index)}); - - // Mask output values if needed - value_to_store = BITCAST(value_to_store, mFP32Ty); - if (mask_vec) { - Value *originalVal = LOAD(attr_chan); - Value *vMask = TRUNC(VEXTRACT(unwrap(mask_vec), C(lane)), mInt1Ty); - value_to_store = SELECT(vMask, value_to_store, originalVal); - } - STORE(value_to_store, attr_chan); - if (verbose_tcs_shader_out) { - lp_build_print_value(gallivm, "[TCS OUT][VTX] Mask_vec mask: ", mask_vec); - lp_build_print_value(gallivm, "[TCS OUT][VTX] stored: ", wrap(value_to_store)); - } - } - } -} - -void -BuilderSWR::swr_tcs_llvm_emit_barrier(const struct lp_build_tcs_iface *tcs_iface, - struct lp_build_tgsi_context *bld_base) -{ - swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; - struct lp_build_tgsi_soa_context* bld = (struct lp_build_tgsi_soa_context*)bld_base; - - if (verbose_tcs_shader_loop) { - lp_build_print_value(gallivm, "Barrier LOOP: Iteration %d END\n", iface->loop_var); - } - - struct lp_build_context *uint_bld = &bld->bld_base.uint_bld; - - STORE(ADD(LOAD(unwrap(iface->loop_var)), VBROADCAST(C(1))), unwrap(iface->loop_var)); - - LLVMValueRef tmp = lp_build_cmp(uint_bld, PIPE_FUNC_GEQUAL, wrap(LOAD(unwrap(iface->loop_var))), - wrap(VBROADCAST(C(iface->output_vertices)))); - - lp_exec_mask_cond_push(&bld->exec_mask, tmp); - lp_exec_break(&bld->exec_mask, &bld->bld_base.pc, false); - lp_exec_mask_cond_pop(&bld->exec_mask); - lp_exec_endloop(bld->bld_base.base.gallivm, &bld->exec_mask); - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - STORE(VBROADCAST(C(0)), unwrap(iface->loop_var)); - lp_exec_bgnloop(&bld->exec_mask, true); - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - bld->system_values.invocation_id = wrap((LOAD(unwrap(iface->loop_var)))); - - if (verbose_tcs_shader_loop) { - 
lp_build_print_value(gallivm, "Barrier LOOP: Iteration BEGIN: ", iface->loop_var); - lp_build_print_value(gallivm, "Barrier LOOP: InvocationId: \n", bld->system_values.invocation_id); - } -} - - -LLVMValueRef -BuilderSWR::swr_tes_llvm_fetch_patch_input(const struct lp_build_tes_iface *tes_iface, - struct lp_build_tgsi_context * bld_base, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index) -{ - swr_tes_llvm_iface *iface = (swr_tes_llvm_iface*)tes_iface; - Value *attr_index = unwrap(attrib_index); - Value *res = unwrap(bld_base->base.zero); - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - if (verbose_shader) { - lp_build_printf(gallivm, "[TES IN][PATCH] --------------------------------------\n"); - } - - if (is_aindex_indirect) { - int i; - struct lp_type type = bld_base->base.type; - - for (i = 0; i < type.length; i++) { - Value *attr_chan_index = attr_index; - - if (is_aindex_indirect) { - attr_chan_index = VEXTRACT(attr_index, C(i)); - } - - Value *attrib = - LOAD(GEP(iface->pPatchAttribMap, {C(0), attr_chan_index})); - - Value *pCpIn = LOAD(iface->pTesCtx, {0, SWR_DS_CONTEXT_pCpIn}, "pCpIn"); - Value *pPatchData = GEP(pCpIn, {(uint32_t)0, ScalarPatch_patchData}); - Value *pAttr = GEP(pPatchData, {(uint32_t)0, ScalarCPoint_attrib}); - Value *Val = LOADV(pAttr, {C(0), attrib, unwrap(swizzle_index)}); - if (verbose_shader) { - lp_build_print_value(gallivm, "[TES IN][PATCH] attrib_index: ", attrib_index); - lp_build_print_value(gallivm, "[TES IN][PATCH] attr_chan_index: ", wrap(attr_chan_index)); - lp_build_print_value(gallivm, "[TES IN][PATCH] attrib read from map: ", wrap(attrib)); - lp_build_print_value(gallivm, "[TES IN][PATCH] swizzle_index: ", swizzle_index); - lp_build_print_value(gallivm, "[TES IN][PATCH] Loaded: ", wrap(Val)); - } - res = VINSERT(res, Val, C(i)); - } - } else { - Value *attrib = LOAD(GEP(iface->pPatchAttribMap, {C(0), attr_index})); - - Value *pCpIn = LOAD(iface->pTesCtx, 
{(uint32_t)0, SWR_DS_CONTEXT_pCpIn}, "pCpIn"); - Value *pPatchData = GEP(pCpIn, {(uint32_t)0, ScalarPatch_patchData}); - Value *pAttr = GEP(pPatchData, {(uint32_t)0, ScalarCPoint_attrib}); - Value *Val = LOADV(pAttr, {C(0), attrib, unwrap(swizzle_index)}); - if (verbose_shader) { - lp_build_print_value(gallivm, "[TES IN][PATCH] attrib_index: ", attrib_index); - lp_build_print_value(gallivm, "[TES IN][PATCH] attr_chan_index: ", wrap(attr_index)); - lp_build_print_value(gallivm, "[TES IN][PATCH] attrib read from map: ", wrap(attrib)); - lp_build_print_value(gallivm, "[TES IN][PATCH] swizzle_index: ", swizzle_index); - lp_build_print_value(gallivm, "[TES IN][PATCH] Loaded: ", wrap(Val)); - } - res = VBROADCAST(Val); - } - if (verbose_shader) { - lp_build_print_value(gallivm, "[TES IN][PATCH] returning: ", wrap(res)); - } - return wrap(res); -} - - - -LLVMValueRef -BuilderSWR::swr_tes_llvm_fetch_vtx_input(const struct lp_build_tes_iface *tes_iface, - struct lp_build_tgsi_context * bld_base, - boolean is_vindex_indirect, - LLVMValueRef vertex_index, - boolean is_aindex_indirect, - LLVMValueRef attrib_index, - LLVMValueRef swizzle_index) -{ - swr_tes_llvm_iface *iface = (swr_tes_llvm_iface*)tes_iface; - Value *vert_index = unwrap(vertex_index); - Value *attr_index = unwrap(attrib_index); - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - if (verbose_shader) { - lp_build_printf(gallivm, "[TES IN][VTX] --------------------------------------\n"); - } - - Value *res = unwrap(bld_base->base.zero); - if (is_vindex_indirect || is_aindex_indirect) { - int i; - struct lp_type type = bld_base->base.type; - - for (i = 0; i < type.length; i++) { - Value *vert_chan_index = vert_index; - Value *attr_chan_index = attr_index; - - if (is_vindex_indirect) { - vert_chan_index = VEXTRACT(vert_index, C(i)); - } - if (is_aindex_indirect) { - attr_chan_index = VEXTRACT(attr_index, C(i)); - } - - Value *attrib = - LOAD(GEP(iface->pVtxAttribMap, {C(0), 
attr_chan_index})); - - Value *pCpIn = LOAD(iface->pTesCtx, {0, SWR_DS_CONTEXT_pCpIn}, "pCpIn"); - Value *pCp = GEP(pCpIn, {0, ScalarPatch_cp}); - Value *pVertex = GEP(pCp, {(Value*)C(0), vert_chan_index}); - Value *pAttrTab = GEP(pVertex, {uint32_t(0), uint32_t(0)}); - Value *pAttr = GEP(pAttrTab, {(Value*)C(0), attrib}); - Value *Val = LOADV(pAttr, {C(0), unwrap(swizzle_index)}); - if (verbose_shader) { - lp_build_print_value(gallivm, "[TES IN][VTX] attrib_index: ", attrib_index); - lp_build_print_value(gallivm, "[TES IN][VTX] attr_chan_index: ", wrap(attr_index)); - lp_build_print_value(gallivm, "[TES IN][VTX] attrib read from map: ", wrap(attrib)); - lp_build_print_value(gallivm, "[TES IN][VTX] swizzle_index: ", swizzle_index); - lp_build_print_value(gallivm, "[TES IN][VTX] Loaded: ", wrap(Val)); - } - res = VINSERT(res, Val, C(i)); - } - } else { - Value *attrib = LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_index})); - - Value *pCpIn = LOAD(iface->pTesCtx, {0, SWR_DS_CONTEXT_pCpIn}, "pCpIn"); - Value *pCp = GEP(pCpIn, {0, ScalarPatch_cp}); - Value *pVertex = GEP(pCp, {(Value*)C(0), vert_index}); - Value *pAttrTab = GEP(pVertex, {uint32_t(0), uint32_t(0)}); - Value *pAttr = GEP(pAttrTab, {(Value*)C(0), attrib}); - Value *Val = LOADV(pAttr, {C(0), unwrap(swizzle_index)}); - if (verbose_shader) { - lp_build_print_value(gallivm, "[TES IN][VTX] attrib_index: ", attrib_index); - lp_build_print_value(gallivm, "[TES IN][VTX] attr_chan_index: ", wrap(attr_index)); - lp_build_print_value(gallivm, "[TES IN][VTX] attrib read from map: ", wrap(attrib)); - lp_build_print_value(gallivm, "[TES IN][VTX] swizzle_index: ", swizzle_index); - lp_build_print_value(gallivm, "[TES IN][VTX] Loaded: ", wrap(Val)); - } - res = VBROADCAST(Val); - } - if (verbose_shader) { - lp_build_print_value(gallivm, "[TES IN][VTX] returning: ", wrap(res)); - } - return wrap(res); -} - - - - -PFN_GS_FUNC -BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key) -{ - SWR_GS_STATE *pGS = 
&ctx->gs->gsState; - struct tgsi_shader_info *info = &ctx->gs->info.base; - - memset(pGS, 0, sizeof(*pGS)); - - pGS->gsEnable = true; - - pGS->numInputAttribs = (VERTEX_ATTRIB_START_SLOT - VERTEX_POSITION_SLOT) + info->num_inputs; - pGS->outputTopology = - swr_convert_prim_topology(info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM], 0); - - /* It's +1 because emit_vertex in swr is always called exactly one time more - * than max_vertices passed in Geometry Shader. We need to allocate more memory - * to avoid crash/memory overwritten. - */ - pGS->maxNumVerts = info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] + 1; - pGS->instanceCount = info->properties[TGSI_PROPERTY_GS_INVOCATIONS]; - - // If point primitive then assume to use multiple streams - if(pGS->outputTopology == TOP_POINT_LIST) { - pGS->isSingleStream = false; - } else { - pGS->isSingleStream = true; - pGS->singleStreamID = 0; - } - - pGS->vertexAttribOffset = VERTEX_POSITION_SLOT; - pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset; - pGS->outputVertexSize = SWR_VTX_NUM_SLOTS; - pGS->controlDataSize = 8; // GS outputs max of 8 32B units - pGS->controlDataOffset = VERTEX_COUNT_SIZE; - pGS->outputVertexOffset = pGS->controlDataOffset + CONTROL_HEADER_SIZE; - - pGS->allocationSize = - VERTEX_COUNT_SIZE + // vertex count - CONTROL_HEADER_SIZE + // control header - (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex - pGS->maxNumVerts; // num verts - - struct swr_geometry_shader *gs = ctx->gs; - - LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; - LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; - - memset(outputs, 0, sizeof(outputs)); - - AttrBuilder attrBuilder; - attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); - - std::vector<Type *> gsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), - PointerType::get(mInt8Ty, 0), - PointerType::get(Gen_SWR_GS_CONTEXT(JM()), 0)}; - FunctionType *vsFuncType = - 
FunctionType::get(Type::getVoidTy(JM()->mContext), gsArgs, false); - - // create new vertex shader function - auto pFunction = Function::Create(vsFuncType, - GlobalValue::ExternalLinkage, - "GS", - JM()->mpCurrentModule); -#if LLVM_VERSION_MAJOR < 5 - AttributeSet attrSet = AttributeSet::get( - JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); - pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); -#else - pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder); -#endif - - BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); - IRB()->SetInsertPoint(block); - LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); - - auto argitr = pFunction->arg_begin(); - Value *hPrivateData = &*argitr++; - hPrivateData->setName("hPrivateData"); - Value *pWorkerData = &*argitr++; - pWorkerData->setName("pWorkerData"); - Value *pGsCtx = &*argitr++; - pGsCtx->setName("gsCtx"); - - Value *consts_ptr = - GEP(hPrivateData, {C(0), C(swr_draw_context_constantGS)}); - consts_ptr->setName("gs_constants"); - Value *const_sizes_ptr = - GEP(hPrivateData, {0, swr_draw_context_num_constantsGS}); - const_sizes_ptr->setName("num_gs_constants"); - - struct lp_build_sampler_soa *sampler = - swr_sampler_soa_create(key.sampler, PIPE_SHADER_GEOMETRY); - assert(sampler != nullptr); - - struct lp_bld_tgsi_system_values system_values; - memset(&system_values, 0, sizeof(system_values)); - system_values.prim_id = wrap(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_PrimitiveID})); - system_values.invocation_id = wrap(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_InstanceID})); - - std::vector<Constant*> mapConstants; - Value *vtxAttribMap = ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS)); - for (unsigned slot = 0; slot < info->num_inputs; slot++) { - ubyte semantic_name = info->input_semantic_name[slot]; - ubyte semantic_idx = info->input_semantic_index[slot]; - - unsigned vs_slot = locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base); - assert(vs_slot < 
PIPE_MAX_SHADER_OUTPUTS); - - vs_slot += VERTEX_ATTRIB_START_SLOT; - - if (ctx->vs->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION) - vs_slot--; - - if (semantic_name == TGSI_SEMANTIC_POSITION) - vs_slot = VERTEX_POSITION_SLOT; - - STORE(C(vs_slot), vtxAttribMap, {0, slot}); - mapConstants.push_back(C(vs_slot)); - } - - struct lp_build_mask_context mask; - Value *mask_val = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_mask}, "gsMask"); - lp_build_mask_begin(&mask, gallivm, - lp_type_float_vec(32, 32 * 8), wrap(mask_val)); - - // zero out cut buffer so we can load/modify/store bits - for (uint32_t lane = 0; lane < mVWidth; ++lane) - { - Value* pStream = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); -#if LLVM_VERSION_MAJOR >= 10 - MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, MaybeAlign(sizeof(float) * KNOB_SIMD_WIDTH)); -#else - MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, sizeof(float) * KNOB_SIMD_WIDTH); -#endif - } - - struct swr_gs_llvm_iface gs_iface; - gs_iface.base.fetch_input = ::swr_gs_llvm_fetch_input; - gs_iface.base.emit_vertex = ::swr_gs_llvm_emit_vertex; - gs_iface.base.end_primitive = ::swr_gs_llvm_end_primitive; - gs_iface.base.gs_epilogue = ::swr_gs_llvm_epilogue; - gs_iface.pBuilder = this; - gs_iface.pGsCtx = pGsCtx; - gs_iface.pGsState = pGS; - gs_iface.num_outputs = gs->info.base.num_outputs; - gs_iface.num_verts_per_prim = - u_vertices_per_prim((pipe_prim_type)info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]); - gs_iface.info = info; - gs_iface.pVtxAttribMap = vtxAttribMap; - - struct lp_build_tgsi_params params; - memset(¶ms, 0, sizeof(params)); - params.type = lp_type_float_vec(32, 32 * 8); - params.mask = & mask; - params.consts_ptr = wrap(consts_ptr); - params.const_sizes_ptr = wrap(const_sizes_ptr); - params.system_values = &system_values; - params.inputs = inputs; - params.context_ptr = wrap(hPrivateData); - params.sampler = sampler; - params.info = &gs->info.base; - params.gs_iface = 
&gs_iface.base; - - lp_build_tgsi_soa(gallivm, - gs->pipe.tokens, - ¶ms, - outputs); - - lp_build_mask_end(&mask); - - sampler->destroy(sampler); - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - RET_VOID(); - - gallivm_verify_function(gallivm, wrap(pFunction)); - gallivm_compile_module(gallivm); - - PFN_GS_FUNC pFunc = - (PFN_GS_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); - - debug_printf("geom shader %p\n", pFunc); - assert(pFunc && "Error: GeomShader = NULL"); - - JM()->mIsModuleFinalized = true; - - return pFunc; -} - -PFN_TES_FUNC -BuilderSWR::CompileTES(struct swr_context *ctx, swr_jit_tes_key &key) -{ - SWR_TS_STATE *pTS = &ctx->tsState; - struct tgsi_shader_info *info = &ctx->tes->info.base; - - // tessellation is enabled if TES is present - // clear tessellation state here then - memset(pTS, 0, sizeof(*pTS)); - - pTS->tsEnable = true; - - unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE]; - unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING]; - bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW]; - bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE]; - SWR_TS_DOMAIN type = SWR_TS_ISOLINE; - SWR_TS_PARTITIONING partitioning = SWR_TS_EVEN_FRACTIONAL; - SWR_TS_OUTPUT_TOPOLOGY topology = SWR_TS_OUTPUT_POINT; - PRIMITIVE_TOPOLOGY postDSTopology = TOP_POINT_LIST; - - // TESS_TODO: move this to helper functions to improve readability - switch (tes_prim_mode) { - case PIPE_PRIM_LINES: - type = SWR_TS_ISOLINE; - postDSTopology = TOP_LINE_LIST; - break; - case PIPE_PRIM_TRIANGLES: - type = SWR_TS_TRI; - postDSTopology = TOP_TRIANGLE_LIST; - break; - case PIPE_PRIM_QUADS: - type = SWR_TS_QUAD; - // See OpenGL spec - quads are tessellated into triangles - postDSTopology = TOP_TRIANGLE_LIST; - break; - default: - assert(0); - } - - switch (tes_spacing) { - case PIPE_TESS_SPACING_FRACTIONAL_ODD: - partitioning = SWR_TS_ODD_FRACTIONAL; - break; - case 
PIPE_TESS_SPACING_FRACTIONAL_EVEN: - partitioning = SWR_TS_EVEN_FRACTIONAL; - break; - case PIPE_TESS_SPACING_EQUAL: - partitioning = SWR_TS_INTEGER; - break; - default: - assert(0); - } - - if (tes_point_mode) { - topology = SWR_TS_OUTPUT_POINT; - postDSTopology = TOP_POINT_LIST; - } - else if (tes_prim_mode == PIPE_PRIM_LINES) { - topology = SWR_TS_OUTPUT_LINE; - } - else if (tes_vertex_order_cw) { - topology = SWR_TS_OUTPUT_TRI_CW; - } - else { - topology = SWR_TS_OUTPUT_TRI_CCW; - } - - pTS->domain = type; - pTS->tsOutputTopology = topology; - pTS->partitioning = partitioning; - pTS->numDsOutputAttribs = info->num_outputs; - pTS->postDSTopology = postDSTopology; - - pTS->dsAllocationSize = SWR_VTX_NUM_SLOTS * MAX_NUM_VERTS_PER_PRIM; - pTS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; - pTS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT; - pTS->dsOutVtxAttribOffset = VERTEX_ATTRIB_START_SLOT; - - struct swr_tess_evaluation_shader *tes = ctx->tes; - - LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; - LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; - - memset(outputs, 0, sizeof(outputs)); - - AttrBuilder attrBuilder; - attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); - - std::vector<Type *> tesArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), - PointerType::get(mInt8Ty, 0), - PointerType::get(Gen_SWR_DS_CONTEXT(JM()), 0)}; - FunctionType *tesFuncType = - FunctionType::get(Type::getVoidTy(JM()->mContext), tesArgs, false); - - // create new vertex shader function - auto pFunction = Function::Create(tesFuncType, - GlobalValue::ExternalLinkage, - "TES", - JM()->mpCurrentModule); - -#if LLVM_VERSION_MAJOR < 5 - AttributeSet attrSet = AttributeSet::get( - JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); - pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); -#else - pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder); -#endif - - BasicBlock *block = 
BasicBlock::Create(JM()->mContext, "entry", pFunction); - IRB()->SetInsertPoint(block); - LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); - - auto argitr = pFunction->arg_begin(); - Value *hPrivateData = &*argitr++; - hPrivateData->setName("hPrivateData"); - Value *pWorkerData = &*argitr++; - pWorkerData->setName("pWorkerData"); - Value *pTesCtx = &*argitr++; - pTesCtx->setName("tesCtx"); - - Value *consts_ptr = - GEP(hPrivateData, {C(0), C(swr_draw_context_constantTES)}); - consts_ptr->setName("tes_constants"); - Value *const_sizes_ptr = - GEP(hPrivateData, {0, swr_draw_context_num_constantsTES}); - const_sizes_ptr->setName("num_tes_constants"); - - struct lp_build_sampler_soa *sampler = - swr_sampler_soa_create(key.sampler, PIPE_SHADER_TESS_EVAL); - assert(sampler != nullptr); - - struct lp_bld_tgsi_system_values system_values; - memset(&system_values, 0, sizeof(system_values)); - - // Load and calculate system values - // Tessellation coordinates (gl_TessCoord) - Value *vecOffset = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_vectorOffset}, "vecOffset"); - Value *vecStride = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_vectorStride}, "vecStride"); - Value *vecIndex = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_vectorOffset}); - - Value* tess_coord = ALLOCA(ArrayType::get(mSimdFP32Ty, 3)); - - Value *tessCoordU = LOADV(LOAD(pTesCtx, {0, SWR_DS_CONTEXT_pDomainU}), {vecIndex}, "tessCoordU"); - STORE(tessCoordU, tess_coord, {0, 0}); - Value *tessCoordV = LOADV(LOAD(pTesCtx, {0, SWR_DS_CONTEXT_pDomainV}), {vecIndex}, "tessCoordV"); - STORE(tessCoordV, tess_coord, {0, 1}); - Value *tessCoordW = FSUB(FSUB(VIMMED1(1.0f), tessCoordU), tessCoordV, "tessCoordW"); - STORE(tessCoordW, tess_coord, {0, 2}); - system_values.tess_coord = wrap(tess_coord); - - // Primitive ID - system_values.prim_id = wrap(VBROADCAST(LOAD(pTesCtx, {0, SWR_DS_CONTEXT_PrimitiveID}), "PrimitiveID")); - - // Tessellation factors - Value* pPatch = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_pCpIn}); - Value* pTessFactors = GEP(pPatch, 
{C(0), C(ScalarPatch_tessFactors)}); - - assert(SWR_NUM_OUTER_TESS_FACTORS == 4); - Value* sys_value_outer_factors = UndefValue::get(getVectorType(mFP32Ty, 4)); - for (unsigned i = 0; i < SWR_NUM_OUTER_TESS_FACTORS; i++) { - Value* v = LOAD(pTessFactors, {0, SWR_TESSELLATION_FACTORS_OuterTessFactors, i}); - sys_value_outer_factors = VINSERT(sys_value_outer_factors, v, i, "gl_TessLevelOuter"); - } - system_values.tess_outer = wrap(sys_value_outer_factors); - - assert(SWR_NUM_INNER_TESS_FACTORS == 2); - Value* sys_value_inner_factors = UndefValue::get(getVectorType(mFP32Ty, 4)); - for (unsigned i = 0; i < SWR_NUM_INNER_TESS_FACTORS; i++) { - Value* v = LOAD(pTessFactors, {0, SWR_TESSELLATION_FACTORS_InnerTessFactors, i}); - sys_value_inner_factors = VINSERT(sys_value_inner_factors, v, i, "gl_TessLevelInner"); - } - system_values.tess_inner = wrap(sys_value_inner_factors); - - if (verbose_shader) - { - lp_build_print_value(gallivm, "tess_coord = ", system_values.tess_coord); - } - - struct tgsi_shader_info *pPrevShader = nullptr; - - if (ctx->tcs) { - pPrevShader = &ctx->tcs->info.base; - } - else { - pPrevShader = &ctx->vs->info.base; - } - - // Figure out how many per-patch attributes we have - unsigned perPatchAttrs = 0; - unsigned genericAttrs = 0; - unsigned tessLevelAttrs = 0; - unsigned sgvAttrs = 0; - for (unsigned slot = 0; slot < pPrevShader->num_outputs; slot++) { - switch (pPrevShader->output_semantic_name[slot]) { - case TGSI_SEMANTIC_PATCH: - perPatchAttrs++; - break; - case TGSI_SEMANTIC_GENERIC: - genericAttrs++; - break; - case TGSI_SEMANTIC_TESSINNER: - case TGSI_SEMANTIC_TESSOUTER: - tessLevelAttrs++; - break; - case TGSI_SEMANTIC_POSITION: - case TGSI_SEMANTIC_CLIPDIST: - case TGSI_SEMANTIC_PSIZE: - sgvAttrs++; - break; - default: - assert(!"Unknown semantic input in TES"); - } - } - - std::vector<Constant *> mapConstants; - Value *vtxAttribMap = ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS)); - Value *patchAttribMap = 
ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS)); - for (unsigned slot = 0; slot < info->num_inputs; slot++) { - ubyte semantic_name = info->input_semantic_name[slot]; - ubyte semantic_idx = info->input_semantic_index[slot]; - - // Where in TCS output is my attribute? - // TESS_TODO: revisit after implement pass-through TCS - unsigned tcs_slot = locate_linkage(semantic_name, semantic_idx, pPrevShader); - assert(tcs_slot < PIPE_MAX_SHADER_OUTPUTS); - - // Skip tessellation levels - these go to the tessellator, not TES - switch (semantic_name) { - case TGSI_SEMANTIC_GENERIC: - tcs_slot = tcs_slot + VERTEX_ATTRIB_START_SLOT - sgvAttrs - tessLevelAttrs; - break; - case TGSI_SEMANTIC_PATCH: - tcs_slot = semantic_idx; - break; - case TGSI_SEMANTIC_POSITION: - tcs_slot = VERTEX_POSITION_SLOT; - break; - case TGSI_SEMANTIC_CLIPDIST: - case TGSI_SEMANTIC_PSIZE: - break; - default: - assert(!"Unexpected semantic found while building TES input map"); - } - if (semantic_name == TGSI_SEMANTIC_PATCH) { - STORE(C(tcs_slot), patchAttribMap, {0, slot}); - } else { - STORE(C(tcs_slot), vtxAttribMap, {0, slot}); - } - mapConstants.push_back(C(tcs_slot)); - } - - // Build execution mask - struct lp_build_mask_context mask; - Value *mask_val = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_mask}, "tesMask"); - - if (verbose_shader) - lp_build_print_value(gallivm, "TES execution mask: ", wrap(mask_val)); - - lp_build_mask_begin(&mask, gallivm, - lp_type_float_vec(32, 32 * 8), wrap(mask_val)); - - struct swr_tes_llvm_iface tes_iface; - - tes_iface.base.fetch_vertex_input = ::swr_tes_llvm_fetch_vtx_input; - tes_iface.base.fetch_patch_input = ::swr_tes_llvm_fetch_patch_input; - - tes_iface.pBuilder = this; - tes_iface.pTesCtx = pTesCtx; - tes_iface.pTsState = pTS; - tes_iface.num_outputs = tes->info.base.num_outputs; - tes_iface.info = info; - tes_iface.pVtxAttribMap = vtxAttribMap; - tes_iface.pPatchAttribMap = patchAttribMap; - - struct lp_build_tgsi_params params; - memset(¶ms, 0, 
sizeof(params)); - params.type = lp_type_float_vec(32, 32 * 8); - params.mask = & mask; - params.consts_ptr = wrap(consts_ptr); - params.const_sizes_ptr = wrap(const_sizes_ptr); - params.system_values = &system_values; - params.inputs = inputs; - params.context_ptr = wrap(hPrivateData); - params.sampler = sampler; - params.info = &tes->info.base; - params.tes_iface = &tes_iface.base; - - // Build LLVM IR - lp_build_tgsi_soa(gallivm, - tes->pipe.tokens, - ¶ms, - outputs); - - lp_build_mask_end(&mask); - - sampler->destroy(sampler); - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - // Write output attributes - Value *dclOut = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_pOutputData}, "dclOut"); - - for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) { - for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { - if (!outputs[attrib][channel]) - continue; - - Value *val = LOAD(unwrap(outputs[attrib][channel]));; - Value *attribOffset = - LOAD(pTesCtx, {0, SWR_DS_CONTEXT_outVertexAttribOffset}); - - // Assume we write possition - Value* outputSlot = C(VERTEX_POSITION_SLOT); - if (tes->info.base.output_semantic_name[attrib] != TGSI_SEMANTIC_POSITION) { - // No, it's a generic attribute, not a position - let's calculate output slot - uint32_t outSlot = attrib; - if (tes->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION) { - // this shader will write position, so in shader's term - // output starts at attrib 1, but we will handle that separately, - // so let's fix the outSlot - outSlot--; - } - outputSlot = ADD(attribOffset, C(outSlot)); - } - - Value *attribVecIndex = - ADD(MUL(vecStride, MUL(outputSlot, C(4))), vecOffset); - - uint32_t outputComponent = 0; - uint32_t curComp = outputComponent + channel; - auto outValIndex = ADD(attribVecIndex, MUL(vecStride, C(curComp))); - STOREV(val, dclOut, {outValIndex}); - - if (verbose_shader) { - lp_build_printf(gallivm, - "TES output [%d][%d]", - C(attrib), - C(channel)); - 
lp_build_print_value(gallivm, " = ", wrap(val)); - } - } - } - - RET_VOID(); - - JM()->DumpToFile(pFunction, "src"); - gallivm_verify_function(gallivm, wrap(pFunction)); - - gallivm_compile_module(gallivm); - JM()->DumpToFile(pFunction, "optimized"); - - PFN_TES_FUNC pFunc = - (PFN_TES_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); - - debug_printf("tess evaluation shader %p\n", pFunc); - assert(pFunc && "Error: TessEvaluationShader = NULL"); - - JM()->DumpAsm(pFunction, "asm"); - - JM()->mIsModuleFinalized = true; - - return pFunc; -} - -PFN_TCS_FUNC -BuilderSWR::CompileTCS(struct swr_context *ctx, swr_jit_tcs_key &key) -{ - SWR_TS_STATE *pTS = &ctx->tsState; - struct tgsi_shader_info *info = &ctx->tcs->info.base; - - pTS->numHsInputAttribs = info->num_inputs; - pTS->numHsOutputAttribs = info->num_outputs; - - pTS->hsAllocationSize = sizeof(ScalarPatch); - - pTS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; - pTS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT; - - struct swr_tess_control_shader *tcs = ctx->tcs; - - LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; - LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; - - memset(outputs, 0, sizeof(outputs)); - - AttrBuilder attrBuilder; - attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); - - std::vector<Type *> tcsArgs{ - PointerType::get(Gen_swr_draw_context(JM()), 0), - PointerType::get(mInt8Ty, 0), - PointerType::get(Gen_SWR_HS_CONTEXT(JM()), 0)}; - FunctionType *tcsFuncType = - FunctionType::get(Type::getVoidTy(JM()->mContext), tcsArgs, false); - - // create new vertex shader function - auto pFunction = Function::Create(tcsFuncType, - GlobalValue::ExternalLinkage, - "TCS", - JM()->mpCurrentModule); - -#if LLVM_VERSION_MAJOR < 5 - AttributeSet attrSet = AttributeSet::get( - JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); - pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); -#else - 
pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder); -#endif - - BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); - IRB()->SetInsertPoint(block); - LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); - - auto argitr = pFunction->arg_begin(); - Value *hPrivateData = &*argitr++; - hPrivateData->setName("hPrivateData"); - Value *pWorkerData = &*argitr++; - pWorkerData->setName("pWorkerData"); - Value *pTcsCtx = &*argitr++; - pTcsCtx->setName("tcsCtx"); - - Value *consts_ptr = - GEP(hPrivateData, {C(0), C(swr_draw_context_constantTCS)}); - consts_ptr->setName("tcs_constants"); - Value *const_sizes_ptr = - GEP(hPrivateData, {0, swr_draw_context_num_constantsTCS}); - const_sizes_ptr->setName("num_tcs_constants"); - - struct lp_build_sampler_soa *sampler = - swr_sampler_soa_create(key.sampler, PIPE_SHADER_TESS_CTRL); - assert(sampler != nullptr); - - struct lp_bld_tgsi_system_values system_values; - memset(&system_values, 0, sizeof(system_values)); - - system_values.prim_id = - wrap(LOAD(pTcsCtx, {0, SWR_HS_CONTEXT_PrimitiveID})); - - system_values.invocation_id = wrap(VBROADCAST(C(0))); - system_values.vertices_in = wrap(C(tcs->vertices_per_patch)); - - if (verbose_shader) { - lp_build_print_value(gallivm, "TCS::prim_id = ", system_values.prim_id); - lp_build_print_value(gallivm, "TCS::invocation_id = ", system_values.invocation_id); - lp_build_print_value(gallivm, "TCS::vertices_in = ", system_values.vertices_in); - } - - std::vector<Constant *> mapConstants; - Value *vtxAttribMap = - ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS)); - - for (unsigned slot = 0; slot < info->num_inputs; slot++) { - ubyte semantic_name = info->input_semantic_name[slot]; - ubyte semantic_idx = info->input_semantic_index[slot]; - - unsigned vs_slot = - locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base); - assert(vs_slot < PIPE_MAX_SHADER_OUTPUTS); - - vs_slot += VERTEX_ATTRIB_START_SLOT; - - if 
(ctx->vs->info.base.output_semantic_name[0] - == TGSI_SEMANTIC_POSITION) - vs_slot--; - - if (semantic_name == TGSI_SEMANTIC_POSITION) - vs_slot = VERTEX_POSITION_SLOT; - - STORE(C(vs_slot), vtxAttribMap, {0, slot}); - mapConstants.push_back(C(vs_slot)); - } - - // Prepare map of output attributes. Needed when shader instance wants - // to read own output or output of other instance, which is allowed in TCS - Value *vtxOutputAttribMap = - ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS)); - // Map for per-patch attributes - Value *patchOutputAttribMap = - ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS)); - for (unsigned slot = 0; slot < info->num_outputs; slot++) { - ubyte name = info->output_semantic_name[slot]; - int32_t idx = info->output_semantic_index[slot]; - if (name == TGSI_SEMANTIC_PATCH) { - STORE(C(idx), patchOutputAttribMap, {0, slot}); - } else { - int32_t target_slot = slot; - if (name == TGSI_SEMANTIC_GENERIC) { - target_slot += VERTEX_ATTRIB_START_SLOT; - } - // Now normalize target slot - for (ubyte as = 0; as < slot; as++) { - ubyte name = info->output_semantic_name[as]; - switch (name) { - case TGSI_SEMANTIC_TESSOUTER: - case TGSI_SEMANTIC_TESSINNER: - case TGSI_SEMANTIC_PATCH: - case TGSI_SEMANTIC_POSITION: - target_slot--; - } - } - if (name == TGSI_SEMANTIC_POSITION) { - target_slot = VERTEX_POSITION_SLOT; - } - STORE(C(target_slot), vtxOutputAttribMap, {0, slot}); - mapConstants.push_back(C(target_slot)); - } - } - - struct lp_build_mask_context mask; - Value *mask_val = LOAD(pTcsCtx, {0, SWR_HS_CONTEXT_mask}, "tcsMask"); - lp_build_mask_begin( - &mask, gallivm, lp_type_float_vec(32, 32 * 8), wrap(mask_val)); - - struct swr_tcs_llvm_iface tcs_iface; - - tcs_iface.base.emit_store_output = ::swr_tcs_llvm_store_output; - tcs_iface.base.emit_fetch_input = ::swr_tcs_llvm_fetch_input; - tcs_iface.base.emit_fetch_output = ::swr_tcs_llvm_fetch_output; - tcs_iface.base.emit_barrier = ::swr_tcs_llvm_emit_barrier; - 
tcs_iface.base.emit_prologue = ::swr_tcs_llvm_emit_prologue; - tcs_iface.base.emit_epilogue = ::swr_tcs_llvm_emit_epilogue; - - tcs_iface.pBuilder = this; - tcs_iface.pTcsCtx = pTcsCtx; - tcs_iface.pTsState = pTS; - tcs_iface.output_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; - tcs_iface.info = info; - tcs_iface.pVtxAttribMap = vtxAttribMap; - tcs_iface.pVtxOutputAttribMap = vtxOutputAttribMap; - tcs_iface.pPatchOutputAttribMap = patchOutputAttribMap; - - struct lp_build_tgsi_params params; - memset(&params, 0, sizeof(params)); - params.type = lp_type_float_vec(32, 32 * 8); - params.mask = &mask; - params.consts_ptr = wrap(consts_ptr); - params.const_sizes_ptr = wrap(const_sizes_ptr); - params.system_values = &system_values; - params.inputs = inputs; - params.context_ptr = wrap(hPrivateData); - params.sampler = sampler; - params.info = &tcs->info.base; - params.tcs_iface = &tcs_iface.base; - - lp_build_tgsi_soa(gallivm, tcs->pipe.tokens, &params, outputs); - - lp_build_mask_end(&mask); - - sampler->destroy(sampler); - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - RET_VOID(); - - JM()->DumpToFile(pFunction, "src"); - gallivm_verify_function(gallivm, wrap(pFunction)); - gallivm_compile_module(gallivm); - JM()->DumpToFile(pFunction, "optimized"); - - PFN_TCS_FUNC pFunc = - (PFN_TCS_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); - - debug_printf("tess control shader %p\n", pFunc); - assert(pFunc && "Error: TessControlShader = NULL"); - JM()->DumpAsm(pFunction, "asm"); - - JM()->mIsModuleFinalized = true; - - return pFunc; -} - - -PFN_GS_FUNC -swr_compile_gs(struct swr_context *ctx, swr_jit_gs_key &key) -{ - BuilderSWR builder( - reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr), - "GS"); - PFN_GS_FUNC func = builder.CompileGS(ctx, key); - - ctx->gs->map.insert(std::make_pair(key, std::unique_ptr<VariantGS>(new VariantGS(builder.gallivm, func)))); - return func; -} - -PFN_TCS_FUNC -swr_compile_tcs(struct
swr_context *ctx, swr_jit_tcs_key &key) -{ - BuilderSWR builder( - reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr), - "TCS"); - PFN_TCS_FUNC func = builder.CompileTCS(ctx, key); - - ctx->tcs->map.insert( - std::make_pair(key, std::unique_ptr<VariantTCS>(new VariantTCS(builder.gallivm, func)))); - - return func; -} - -PFN_TES_FUNC -swr_compile_tes(struct swr_context *ctx, swr_jit_tes_key &key) -{ - BuilderSWR builder( - reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr), - "TES"); - PFN_TES_FUNC func = builder.CompileTES(ctx, key); - - ctx->tes->map.insert( - std::make_pair(key, std::unique_ptr<VariantTES>(new VariantTES(builder.gallivm, func)))); - - return func; -} - -void -BuilderSWR::WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput, unsigned slot, unsigned channel) -{ -#if USE_SIMD16_FRONTEND && !USE_SIMD16_VS - // interleave the simdvertex components into the dest simd16vertex - // slot16offset = slot8offset * 2 - // comp16offset = comp8offset * 2 + alternateOffset - - Value *offset = LOAD(pVsContext, { 0, SWR_VS_CONTEXT_AlternateOffset }); - Value *pOut = GEP(pVtxOutput, { C(0), C(0), C(slot * 2), offset } ); - STORE(pVal, pOut, {channel * 2}); -#else - Value *pOut = GEP(pVtxOutput, {0, 0, slot}); - STORE(pVal, pOut, {0, channel}); - if (verbose_vs_shader) { - lp_build_printf(gallivm, "VS: Storing on slot %d, channel %d: ", C(slot), C(channel)); - lp_build_print_value(gallivm, "", wrap(pVal)); - } -#endif -} - -PFN_VERTEX_FUNC -BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) -{ - struct swr_vertex_shader *swr_vs = ctx->vs; - - LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; - LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; - - memset(outputs, 0, sizeof(outputs)); - - AttrBuilder attrBuilder; - attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); - - std::vector<Type *> vsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), - 
PointerType::get(mInt8Ty, 0), - PointerType::get(Gen_SWR_VS_CONTEXT(JM()), 0)}; - FunctionType *vsFuncType = - FunctionType::get(Type::getVoidTy(JM()->mContext), vsArgs, false); - - // create new vertex shader function - auto pFunction = Function::Create(vsFuncType, - GlobalValue::ExternalLinkage, - "VS", - JM()->mpCurrentModule); -#if LLVM_VERSION_MAJOR < 5 - AttributeSet attrSet = AttributeSet::get( - JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); - pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); -#else - pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder); -#endif - - BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); - IRB()->SetInsertPoint(block); - LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); - - auto argitr = pFunction->arg_begin(); - Value *hPrivateData = &*argitr++; - hPrivateData->setName("hPrivateData"); - Value *pWorkerData = &*argitr++; - pWorkerData->setName("pWorkerData"); - Value *pVsCtx = &*argitr++; - pVsCtx->setName("vsCtx"); - - Value *consts_ptr = GEP(hPrivateData, {C(0), C(swr_draw_context_constantVS)}); - - consts_ptr->setName("vs_constants"); - Value *const_sizes_ptr = - GEP(hPrivateData, {0, swr_draw_context_num_constantsVS}); - const_sizes_ptr->setName("num_vs_constants"); - - Value *vtxInput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVin}); -#if USE_SIMD16_VS - vtxInput = BITCAST(vtxInput, PointerType::get(Gen_simd16vertex(JM()), 0)); -#endif - - for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { - const unsigned mask = swr_vs->info.base.input_usage_mask[attrib]; - for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { - if (mask & (1 << channel)) { - inputs[attrib][channel] = - wrap(LOAD(vtxInput, {0, 0, attrib, channel})); - } - } - } - - struct lp_build_sampler_soa *sampler = - swr_sampler_soa_create(key.sampler, PIPE_SHADER_VERTEX); - assert(sampler != nullptr); - - struct lp_bld_tgsi_system_values system_values; - 
memset(&system_values, 0, sizeof(system_values)); - system_values.instance_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_InstanceID})); - -#if USE_SIMD16_VS - system_values.vertex_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_VertexID16})); -#else - system_values.vertex_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_VertexID})); -#endif - -#if USE_SIMD16_VS - uint32_t vectorWidth = mVWidth16; -#else - uint32_t vectorWidth = mVWidth; -#endif - - struct lp_build_tgsi_params params; - memset(&params, 0, sizeof(params)); - params.type = lp_type_float_vec(32, 32 * vectorWidth); - params.consts_ptr = wrap(consts_ptr); - params.const_sizes_ptr = wrap(const_sizes_ptr); - params.system_values = &system_values; - params.inputs = inputs; - params.context_ptr = wrap(hPrivateData); - params.sampler = sampler; - params.info = &swr_vs->info.base; - - lp_build_tgsi_soa(gallivm, - swr_vs->pipe.tokens, - &params, - outputs); - - sampler->destroy(sampler); - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - Value *vtxOutput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVout}); -#if USE_SIMD16_VS - vtxOutput = BITCAST(vtxOutput, PointerType::get(Gen_simd16vertex(JM()), 0)); -#endif - - for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { - for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) { - if (!outputs[attrib][channel]) - continue; - - Value *val; - uint32_t outSlot; - - if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) { - if (channel != VERTEX_SGV_POINT_SIZE_COMP) - continue; - val = LOAD(unwrap(outputs[attrib][0])); - outSlot = VERTEX_SGV_SLOT; - } else if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_POSITION) { - val = LOAD(unwrap(outputs[attrib][channel])); - outSlot = VERTEX_POSITION_SLOT; - } else { - val = LOAD(unwrap(outputs[attrib][channel])); - outSlot = VERTEX_ATTRIB_START_SLOT + attrib; - if (swr_vs->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION) - outSlot--; - } - - WriteVS(val,
pVsCtx, vtxOutput, outSlot, channel); - } - } - - if (ctx->rasterizer->clip_plane_enable || - swr_vs->info.base.culldist_writemask) { - unsigned clip_mask = ctx->rasterizer->clip_plane_enable; - - unsigned cv = 0; - if (swr_vs->info.base.writes_clipvertex) { - cv = locate_linkage(TGSI_SEMANTIC_CLIPVERTEX, 0, - &swr_vs->info.base); - } else { - for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { - if (swr_vs->info.base.output_semantic_name[i] == TGSI_SEMANTIC_POSITION && - swr_vs->info.base.output_semantic_index[i] == 0) { - cv = i; - break; - } - } - } - assert(cv < PIPE_MAX_SHADER_OUTPUTS); - LLVMValueRef cx = LLVMBuildLoad(gallivm->builder, outputs[cv][0], ""); - LLVMValueRef cy = LLVMBuildLoad(gallivm->builder, outputs[cv][1], ""); - LLVMValueRef cz = LLVMBuildLoad(gallivm->builder, outputs[cv][2], ""); - LLVMValueRef cw = LLVMBuildLoad(gallivm->builder, outputs[cv][3], ""); - - tgsi_shader_info *pLastFE = &ctx->vs->info.base; - - if (ctx->gs) { - pLastFE = &ctx->gs->info.base; - } - else if (ctx->tes) { - pLastFE = &ctx->tes->info.base; - } - else if (ctx->tcs) { - pLastFE = &ctx->tcs->info.base; - } - - for (unsigned val = 0; val < PIPE_MAX_CLIP_PLANES; val++) { - // clip distance overrides user clip planes - if ((pLastFE->clipdist_writemask & clip_mask & (1 << val)) || - ((pLastFE->culldist_writemask << pLastFE->num_written_clipdistance) & (1 << val))) { - unsigned cv = locate_linkage(TGSI_SEMANTIC_CLIPDIST, val < 4 ? 
0 : 1, pLastFE); - assert(cv < PIPE_MAX_SHADER_OUTPUTS); - if (val < 4) { - LLVMValueRef dist = LLVMBuildLoad(gallivm->builder, outputs[cv][val], ""); - WriteVS(unwrap(dist), pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_LO_SLOT, val); - } else { - LLVMValueRef dist = LLVMBuildLoad(gallivm->builder, outputs[cv][val - 4], ""); - WriteVS(unwrap(dist), pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4); - } - continue; - } - - if (!(clip_mask & (1 << val))) - continue; - - Value *px = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 0})); - Value *py = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 1})); - Value *pz = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 2})); - Value *pw = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 3})); -#if USE_SIMD16_VS - Value *bpx = VBROADCAST_16(px); - Value *bpy = VBROADCAST_16(py); - Value *bpz = VBROADCAST_16(pz); - Value *bpw = VBROADCAST_16(pw); -#else - Value *bpx = VBROADCAST(px); - Value *bpy = VBROADCAST(py); - Value *bpz = VBROADCAST(pz); - Value *bpw = VBROADCAST(pw); -#endif - Value *dist = FADD(FMUL(unwrap(cx), bpx), - FADD(FMUL(unwrap(cy), bpy), - FADD(FMUL(unwrap(cz), bpz), - FMUL(unwrap(cw), bpw)))); - - if (val < 4) - WriteVS(dist, pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_LO_SLOT, val); - else - WriteVS(dist, pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4); - } - } - - RET_VOID(); - - JM()->DumpToFile(pFunction, "vs_function1"); - gallivm_verify_function(gallivm, wrap(pFunction)); - gallivm_compile_module(gallivm); - JM()->DumpToFile(pFunction, "vs_function2"); - - // lp_debug_dump_value(func); - - PFN_VERTEX_FUNC pFunc = - (PFN_VERTEX_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); - - JM()->DumpAsm(pFunction, "vs_function_asm"); - debug_printf("vert shader %p\n", pFunc); - assert(pFunc && "Error: VertShader = NULL"); - - JM()->mIsModuleFinalized = true; - - return pFunc; -} - -PFN_VERTEX_FUNC -swr_compile_vs(struct swr_context 
*ctx, swr_jit_vs_key &key) -{ - if (!ctx->vs->pipe.tokens) - return NULL; - - BuilderSWR builder( - reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr), - "VS"); - PFN_VERTEX_FUNC func = builder.CompileVS(ctx, key); - - ctx->vs->map.insert(std::make_pair(key, std::unique_ptr<VariantVS>(new VariantVS(builder.gallivm, func)))); - return func; -} - -unsigned -swr_so_adjust_attrib(unsigned in_attrib, - swr_vertex_shader *swr_vs) -{ - ubyte semantic_name; - unsigned attrib; - - attrib = in_attrib + VERTEX_ATTRIB_START_SLOT; - - if (swr_vs) { - semantic_name = swr_vs->info.base.output_semantic_name[in_attrib]; - if (semantic_name == TGSI_SEMANTIC_POSITION) { - attrib = VERTEX_POSITION_SLOT; - } else if (semantic_name == TGSI_SEMANTIC_PSIZE) { - attrib = VERTEX_SGV_SLOT; - } else if (semantic_name == TGSI_SEMANTIC_LAYER) { - attrib = VERTEX_SGV_SLOT; - } else { - if (swr_vs->info.base.writes_position) { - attrib--; - } - } - } - - return attrib; -} - -static unsigned -locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info) -{ - for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { - if ((info->output_semantic_name[i] == name) - && (info->output_semantic_index[i] == index)) { - return i; - } - } - - return 0xFFFFFFFF; -} - -PFN_PIXEL_KERNEL -BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key) -{ - struct swr_fragment_shader *swr_fs = ctx->fs; - - struct tgsi_shader_info *pPrevShader; - if (ctx->gs) - pPrevShader = &ctx->gs->info.base; - else if (ctx->tes) - pPrevShader = &ctx->tes->info.base; - else - pPrevShader = &ctx->vs->info.base; - - LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; - LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; - - memset(inputs, 0, sizeof(inputs)); - memset(outputs, 0, sizeof(outputs)); - - struct lp_build_sampler_soa *sampler = NULL; - - AttrBuilder attrBuilder; - attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); - - std::vector<Type *> 
fsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), - PointerType::get(mInt8Ty, 0), - PointerType::get(Gen_SWR_PS_CONTEXT(JM()), 0)}; - FunctionType *funcType = - FunctionType::get(Type::getVoidTy(JM()->mContext), fsArgs, false); - - auto pFunction = Function::Create(funcType, - GlobalValue::ExternalLinkage, - "FS", - JM()->mpCurrentModule); -#if LLVM_VERSION_MAJOR < 5 - AttributeSet attrSet = AttributeSet::get( - JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); - pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); -#else - pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder); -#endif - - BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); - IRB()->SetInsertPoint(block); - LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); - - auto args = pFunction->arg_begin(); - Value *hPrivateData = &*args++; - hPrivateData->setName("hPrivateData"); - Value *pWorkerData = &*args++; - pWorkerData->setName("pWorkerData"); - Value *pPS = &*args++; - pPS->setName("psCtx"); - - Value *consts_ptr = GEP(hPrivateData, {0, swr_draw_context_constantFS}); - consts_ptr->setName("fs_constants"); - Value *const_sizes_ptr = - GEP(hPrivateData, {0, swr_draw_context_num_constantsFS}); - const_sizes_ptr->setName("num_fs_constants"); - - // load *pAttribs, *pPerspAttribs - Value *pRawAttribs = LOAD(pPS, {0, SWR_PS_CONTEXT_pAttribs}, "pRawAttribs"); - Value *pPerspAttribs = - LOAD(pPS, {0, SWR_PS_CONTEXT_pPerspAttribs}, "pPerspAttribs"); - - swr_fs->constantMask = 0; - swr_fs->flatConstantMask = 0; - swr_fs->pointSpriteMask = 0; - - for (int attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { - const unsigned mask = swr_fs->info.base.input_usage_mask[attrib]; - const unsigned interpMode = swr_fs->info.base.input_interpolate[attrib]; - const unsigned interpLoc = swr_fs->info.base.input_interpolate_loc[attrib]; - - if (!mask) - continue; - - // load i,j - Value *vi = nullptr, *vj = nullptr; - switch (interpLoc) { - case 
TGSI_INTERPOLATE_LOC_CENTER: - vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI, PixelPositions_center}, "i"); - vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ, PixelPositions_center}, "j"); - break; - case TGSI_INTERPOLATE_LOC_CENTROID: - vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI, PixelPositions_centroid}, "i"); - vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ, PixelPositions_centroid}, "j"); - break; - case TGSI_INTERPOLATE_LOC_SAMPLE: - vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI, PixelPositions_sample}, "i"); - vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ, PixelPositions_sample}, "j"); - break; - } - - // load/compute w - Value *vw = nullptr, *pAttribs; - if (interpMode == TGSI_INTERPOLATE_PERSPECTIVE || - interpMode == TGSI_INTERPOLATE_COLOR) { - pAttribs = pPerspAttribs; - switch (interpLoc) { - case TGSI_INTERPOLATE_LOC_CENTER: - vw = VRCP(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_center})); - break; - case TGSI_INTERPOLATE_LOC_CENTROID: - vw = VRCP(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_centroid})); - break; - case TGSI_INTERPOLATE_LOC_SAMPLE: - vw = VRCP(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_sample})); - break; - } - } else { - pAttribs = pRawAttribs; - vw = VIMMED1(1.f); - } - - vw->setName("w"); - - ubyte semantic_name = swr_fs->info.base.input_semantic_name[attrib]; - ubyte semantic_idx = swr_fs->info.base.input_semantic_index[attrib]; - - if (semantic_name == TGSI_SEMANTIC_FACE) { - Value *ff = - UI_TO_FP(LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), mFP32Ty); - ff = FSUB(FMUL(ff, C(2.0f)), C(1.0f)); - ff = VECTOR_SPLAT(JM()->mVWidth, ff, "vFrontFace"); - - inputs[attrib][0] = wrap(ff); - inputs[attrib][1] = wrap(VIMMED1(0.0f)); - inputs[attrib][2] = wrap(VIMMED1(0.0f)); - inputs[attrib][3] = wrap(VIMMED1(1.0f)); - continue; - } else if (semantic_name == TGSI_SEMANTIC_POSITION) { // gl_FragCoord - if (swr_fs->info.base.properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] == - TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER) { - inputs[attrib][0] = wrap(LOAD(pPS, {0, 
SWR_PS_CONTEXT_vX, PixelPositions_center}, "vX")); - inputs[attrib][1] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vY, PixelPositions_center}, "vY")); - } else { - inputs[attrib][0] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vX, PixelPositions_UL}, "vX")); - inputs[attrib][1] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vY, PixelPositions_UL}, "vY")); - } - inputs[attrib][2] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vZ}, "vZ")); - inputs[attrib][3] = - wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_center}, "vOneOverW")); - continue; - } else if (semantic_name == TGSI_SEMANTIC_LAYER) { // gl_Layer - Value *ff = LOAD(pPS, {0, SWR_PS_CONTEXT_renderTargetArrayIndex}); - ff = VECTOR_SPLAT(JM()->mVWidth, ff, "vRenderTargetArrayIndex"); - inputs[attrib][0] = wrap(ff); - inputs[attrib][1] = wrap(VIMMED1(0.0f)); - inputs[attrib][2] = wrap(VIMMED1(0.0f)); - inputs[attrib][3] = wrap(VIMMED1(0.0f)); - continue; - } else if (semantic_name == TGSI_SEMANTIC_VIEWPORT_INDEX) { // gl_ViewportIndex - Value *ff = LOAD(pPS, {0, SWR_PS_CONTEXT_viewportIndex}); - ff = VECTOR_SPLAT(JM()->mVWidth, ff, "vViewportIndex"); - inputs[attrib][0] = wrap(ff); - inputs[attrib][1] = wrap(VIMMED1(0.0f)); - inputs[attrib][2] = wrap(VIMMED1(0.0f)); - inputs[attrib][3] = wrap(VIMMED1(0.0f)); - continue; - } - unsigned linkedAttrib = - locate_linkage(semantic_name, semantic_idx, pPrevShader) - 1; - - uint32_t extraAttribs = 0; - if (semantic_name == TGSI_SEMANTIC_PRIMID && !ctx->gs) { - /* non-gs generated primID - need to grab from swizzleMap override */ - linkedAttrib = pPrevShader->num_outputs - 1; - swr_fs->constantMask |= 1 << linkedAttrib; - extraAttribs++; - } else if (semantic_name == TGSI_SEMANTIC_GENERIC && - key.sprite_coord_enable & (1 << semantic_idx)) { - /* we add an extra attrib to the backendState in swr_update_derived. 
*/ - linkedAttrib = pPrevShader->num_outputs + extraAttribs - 1; - swr_fs->pointSpriteMask |= (1 << linkedAttrib); - extraAttribs++; - } else if (linkedAttrib + 1 == 0xFFFFFFFF) { - inputs[attrib][0] = wrap(VIMMED1(0.0f)); - inputs[attrib][1] = wrap(VIMMED1(0.0f)); - inputs[attrib][2] = wrap(VIMMED1(0.0f)); - inputs[attrib][3] = wrap(VIMMED1(1.0f)); - /* If we're reading in color and 2-sided lighting is enabled, we have - * to keep going. - */ - if (semantic_name != TGSI_SEMANTIC_COLOR || !key.light_twoside) - continue; - } else { - if (interpMode == TGSI_INTERPOLATE_CONSTANT) { - swr_fs->constantMask |= 1 << linkedAttrib; - } else if (interpMode == TGSI_INTERPOLATE_COLOR) { - swr_fs->flatConstantMask |= 1 << linkedAttrib; - } - } - - unsigned bcolorAttrib = 0xFFFFFFFF; - Value *offset = NULL; - if (semantic_name == TGSI_SEMANTIC_COLOR && key.light_twoside) { - bcolorAttrib = locate_linkage( - TGSI_SEMANTIC_BCOLOR, semantic_idx, pPrevShader); - /* Neither front nor back colors were available. Nothing to load. */ - if (bcolorAttrib == 0xFFFFFFFF && linkedAttrib == 0xFFFFFFFF) - continue; - /* If there is no front color, just always use the back color. 
*/ - if (linkedAttrib + 1 == 0xFFFFFFFF) - linkedAttrib = bcolorAttrib; - - if (bcolorAttrib != 0xFFFFFFFF) { - bcolorAttrib -= 1; - if (interpMode == TGSI_INTERPOLATE_CONSTANT) { - swr_fs->constantMask |= 1 << bcolorAttrib; - } else if (interpMode == TGSI_INTERPOLATE_COLOR) { - swr_fs->flatConstantMask |= 1 << bcolorAttrib; - } - - unsigned diff = 12 * (bcolorAttrib - linkedAttrib); - - if (diff) { - Value *back = - XOR(C(1), LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), "backFace"); - - offset = MUL(back, C(diff)); - offset->setName("offset"); - } - } - } - - for (int channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { - if (mask & (1 << channel)) { - Value *indexA = C(linkedAttrib * 12 + channel); - Value *indexB = C(linkedAttrib * 12 + channel + 4); - Value *indexC = C(linkedAttrib * 12 + channel + 8); - - if (offset) { - indexA = ADD(indexA, offset); - indexB = ADD(indexB, offset); - indexC = ADD(indexC, offset); - } - - Value *va = VBROADCAST(LOAD(GEP(pAttribs, indexA))); - Value *vb = VBROADCAST(LOAD(GEP(pAttribs, indexB))); - Value *vc = VBROADCAST(LOAD(GEP(pAttribs, indexC))); - - if (interpMode == TGSI_INTERPOLATE_CONSTANT) { - inputs[attrib][channel] = wrap(va); - } else { - Value *vk = FSUB(FSUB(VIMMED1(1.0f), vi), vj); - - vc = FMUL(vk, vc); - - Value *interp = FMUL(va, vi); - Value *interp1 = FMUL(vb, vj); - interp = FADD(interp, interp1); - interp = FADD(interp, vc); - if (interpMode == TGSI_INTERPOLATE_PERSPECTIVE || - interpMode == TGSI_INTERPOLATE_COLOR) - interp = FMUL(interp, vw); - inputs[attrib][channel] = wrap(interp); - } - } - } - } - - sampler = swr_sampler_soa_create(key.sampler, PIPE_SHADER_FRAGMENT); - assert(sampler != nullptr); - - struct lp_bld_tgsi_system_values system_values; - memset(&system_values, 0, sizeof(system_values)); - - struct lp_build_mask_context mask; - bool uses_mask = false; - - if (swr_fs->info.base.uses_kill || - key.poly_stipple_enable) { - Value *vActiveMask = NULL; - if (swr_fs->info.base.uses_kill) { - 
vActiveMask = LOAD(pPS, {0, SWR_PS_CONTEXT_activeMask}, "activeMask"); - } - if (key.poly_stipple_enable) { - // first get fragment xy coords and clip to stipple bounds - Value *vXf = LOAD(pPS, {0, SWR_PS_CONTEXT_vX, PixelPositions_UL}); - Value *vYf = LOAD(pPS, {0, SWR_PS_CONTEXT_vY, PixelPositions_UL}); - Value *vXu = FP_TO_UI(vXf, mSimdInt32Ty); - Value *vYu = FP_TO_UI(vYf, mSimdInt32Ty); - - // stipple pattern is 32x32, which means that one line of stipple - // is stored in one word: - // vXstipple is bit offset inside 32-bit stipple word - // vYstipple is word index is stipple array - Value *vXstipple = AND(vXu, VIMMED1(0x1f)); // & (32-1) - Value *vYstipple = AND(vYu, VIMMED1(0x1f)); // & (32-1) - - // grab stipple pattern base address - Value *stipplePtr = GEP(hPrivateData, {0, swr_draw_context_polyStipple, 0}); - stipplePtr = BITCAST(stipplePtr, mInt8PtrTy); - - // peform a gather to grab stipple words for each lane - Value *vStipple = GATHERDD(VUNDEF_I(), stipplePtr, vYstipple, - VIMMED1(0xffffffff), 4); - - // create a mask with one bit corresponding to the x stipple - // and AND it with the pattern, to see if we have a bit - Value *vBitMask = LSHR(VIMMED1(0x80000000), vXstipple); - Value *vStippleMask = AND(vStipple, vBitMask); - vStippleMask = ICMP_NE(vStippleMask, VIMMED1(0)); - vStippleMask = VMASK(vStippleMask); - - if (swr_fs->info.base.uses_kill) { - vActiveMask = AND(vActiveMask, vStippleMask); - } else { - vActiveMask = vStippleMask; - } - } - lp_build_mask_begin( - &mask, gallivm, lp_type_float_vec(32, 32 * 8), wrap(vActiveMask)); - uses_mask = true; - } - - struct lp_build_tgsi_params params; - memset(&params, 0, sizeof(params)); - params.type = lp_type_float_vec(32, 32 * 8); - params.mask = uses_mask ?
&mask : NULL; - params.consts_ptr = wrap(consts_ptr); - params.const_sizes_ptr = wrap(const_sizes_ptr); - params.system_values = &system_values; - params.inputs = inputs; - params.context_ptr = wrap(hPrivateData); - params.sampler = sampler; - params.info = &swr_fs->info.base; - - lp_build_tgsi_soa(gallivm, - swr_fs->pipe.tokens, - &params, - outputs); - - sampler->destroy(sampler); - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - for (uint32_t attrib = 0; attrib < swr_fs->info.base.num_outputs; - attrib++) { - switch (swr_fs->info.base.output_semantic_name[attrib]) { - case TGSI_SEMANTIC_POSITION: { - // write z - LLVMValueRef outZ = - LLVMBuildLoad(gallivm->builder, outputs[attrib][2], ""); - STORE(unwrap(outZ), pPS, {0, SWR_PS_CONTEXT_vZ}); - break; - } - case TGSI_SEMANTIC_COLOR: { - for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { - if (!outputs[attrib][channel]) - continue; - - LLVMValueRef out = - LLVMBuildLoad(gallivm->builder, outputs[attrib][channel], ""); - if (swr_fs->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && - swr_fs->info.base.output_semantic_index[attrib] == 0) { - for (uint32_t rt = 0; rt < key.nr_cbufs; rt++) { - STORE(unwrap(out), - pPS, - {0, SWR_PS_CONTEXT_shaded, rt, channel}); - } - } else { - STORE(unwrap(out), - pPS, - {0, - SWR_PS_CONTEXT_shaded, - swr_fs->info.base.output_semantic_index[attrib], - channel}); - } - } - break; - } - default: { - fprintf(stderr, - "unknown output from FS %s[%d]\n", - tgsi_semantic_names[swr_fs->info.base - .output_semantic_name[attrib]], - swr_fs->info.base.output_semantic_index[attrib]); - break; - } - } - } - - LLVMValueRef mask_result = 0; - if (uses_mask) { - mask_result = lp_build_mask_end(&mask); - } - - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - - if (uses_mask) { - STORE(unwrap(mask_result), pPS, {0, SWR_PS_CONTEXT_activeMask}); - } - - RET_VOID(); - - gallivm_verify_function(gallivm, wrap(pFunction)); - -
gallivm_compile_module(gallivm); - - // after the gallivm passes, we have to lower the core's intrinsics - llvm::legacy::FunctionPassManager lowerPass(JM()->mpCurrentModule); - lowerPass.add(createLowerX86Pass(this)); - lowerPass.run(*pFunction); - - PFN_PIXEL_KERNEL kernel = - (PFN_PIXEL_KERNEL)gallivm_jit_function(gallivm, wrap(pFunction)); - debug_printf("frag shader %p\n", kernel); - assert(kernel && "Error: FragShader = NULL"); - - JM()->mIsModuleFinalized = true; - - return kernel; -} - -PFN_PIXEL_KERNEL -swr_compile_fs(struct swr_context *ctx, swr_jit_fs_key &key) -{ - if (!ctx->fs->pipe.tokens) - return NULL; - - BuilderSWR builder( - reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr), - "FS"); - PFN_PIXEL_KERNEL func = builder.CompileFS(ctx, key); - - ctx->fs->map.insert(std::make_pair(key, std::unique_ptr<VariantFS>(new VariantFS(builder.gallivm, func)))); - return func; -} diff --git a/src/gallium/drivers/swr/swr_shader.h b/src/gallium/drivers/swr/swr_shader.h deleted file mode 100644 index cabe915f312..00000000000 --- a/src/gallium/drivers/swr/swr_shader.h +++ /dev/null @@ -1,175 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - ***************************************************************************/ - -#pragma once - -struct swr_vertex_shader; -struct swr_fragment_shader; -struct swr_geometry_shader; -struct swr_tess_control_shader; -struct swr_tess_evaluation_shader; - -struct swr_jit_fs_key; -struct swr_jit_vs_key; -struct swr_jit_gs_key; -struct swr_jit_tcs_key; -struct swr_jit_tes_key; - -using PFN_TCS_FUNC = PFN_HS_FUNC; -using PFN_TES_FUNC = PFN_DS_FUNC; - -unsigned swr_so_adjust_attrib(unsigned in_attrib, - swr_vertex_shader *swr_vs); - -PFN_VERTEX_FUNC -swr_compile_vs(struct swr_context *ctx, swr_jit_vs_key &key); - -PFN_PIXEL_KERNEL -swr_compile_fs(struct swr_context *ctx, swr_jit_fs_key &key); - -PFN_GS_FUNC -swr_compile_gs(struct swr_context *ctx, swr_jit_gs_key &key); - -PFN_TCS_FUNC -swr_compile_tcs(struct swr_context *ctx, swr_jit_tcs_key &key); - -PFN_TES_FUNC -swr_compile_tes(struct swr_context *ctx, swr_jit_tes_key &key); - -void swr_generate_fs_key(struct swr_jit_fs_key &key, - struct swr_context *ctx, - swr_fragment_shader *swr_fs); - -void swr_generate_vs_key(struct swr_jit_vs_key &key, - struct swr_context *ctx, - swr_vertex_shader *swr_vs); - -void swr_generate_fetch_key(struct swr_jit_fetch_key &key, - struct swr_vertex_element_state *velems); - -void swr_generate_gs_key(struct swr_jit_gs_key &key, - struct swr_context *ctx, - swr_geometry_shader *swr_gs); - -void swr_generate_tcs_key(struct swr_jit_tcs_key &key, - struct swr_context *ctx, - swr_tess_control_shader *swr_tcs); 
- -void swr_generate_tes_key(struct swr_jit_tes_key &key, - struct swr_context *ctx, - swr_tess_evaluation_shader *swr_tes); - -struct swr_jit_sampler_key { - unsigned nr_samplers; - unsigned nr_sampler_views; - struct swr_sampler_static_state sampler[PIPE_MAX_SHADER_SAMPLER_VIEWS]; -}; - -struct swr_jit_fs_key : swr_jit_sampler_key { - unsigned nr_cbufs; - unsigned light_twoside; - unsigned sprite_coord_enable; - ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; - ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS]; - bool poly_stipple_enable; -}; - -struct swr_jit_vs_key : swr_jit_sampler_key { - unsigned clip_plane_mask; // from rasterizer state & vs_info -}; - -struct swr_jit_fetch_key { - FETCH_COMPILE_STATE fsState; -}; - -struct swr_jit_gs_key : swr_jit_sampler_key { - ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; - ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS]; -}; - -// TESS_TODO: revisit this - we probably need to use -// primitive modes, number of vertices emitted, etc. 
-struct swr_jit_tcs_key : swr_jit_sampler_key { - ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; - ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS]; - unsigned clip_plane_mask; // from rasterizer state & tcs_info -}; - -// TESS_TODO: revisit this -struct swr_jit_tes_key : swr_jit_sampler_key { - ubyte prev_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; - ubyte prev_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS]; - unsigned clip_plane_mask; // from rasterizer state & tes_info -}; - -namespace std -{ -template <> struct hash<swr_jit_fs_key> { - std::size_t operator()(const swr_jit_fs_key &k) const - { - return util_hash_crc32(&k, sizeof(k)); - } -}; - -template <> struct hash<swr_jit_vs_key> { - std::size_t operator()(const swr_jit_vs_key &k) const - { - return util_hash_crc32(&k, sizeof(k)); - } -}; - -template <> struct hash<swr_jit_fetch_key> { - std::size_t operator()(const swr_jit_fetch_key &k) const - { - return util_hash_crc32(&k, sizeof(k)); - } -}; - -template <> struct hash<swr_jit_gs_key> { - std::size_t operator()(const swr_jit_gs_key &k) const - { - return util_hash_crc32(&k, sizeof(k)); - } -}; - -template <> struct hash<swr_jit_tcs_key> { - std::size_t operator()(const swr_jit_tcs_key &k) const - { - return util_hash_crc32(&k, sizeof(k)); - } -}; - -template <> struct hash<swr_jit_tes_key> { - std::size_t operator()(const swr_jit_tes_key &k) const - { - return util_hash_crc32(&k, sizeof(k)); - } -}; -}; - -bool operator==(const swr_jit_fs_key &lhs, const swr_jit_fs_key &rhs); -bool operator==(const swr_jit_vs_key &lhs, const swr_jit_vs_key &rhs); -bool operator==(const swr_jit_fetch_key &lhs, const swr_jit_fetch_key &rhs); -bool operator==(const swr_jit_gs_key &lhs, const swr_jit_gs_key &rhs); -bool operator==(const swr_jit_tcs_key &lhs, const swr_jit_tcs_key &rhs); -bool operator==(const swr_jit_tes_key &lhs, const swr_jit_tes_key &rhs); diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp deleted file 
mode 100644 index 5f1464e6d0e..00000000000 --- a/src/gallium/drivers/swr/swr_state.cpp +++ /dev/null @@ -1,2243 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ***************************************************************************/ - -#include <llvm/Config/llvm-config.h> - -#if LLVM_VERSION_MAJOR < 7 -// llvm redefines DEBUG -#pragma push_macro("DEBUG") -#undef DEBUG -#endif - -#include <rasterizer/core/state.h> -#include "JitManager.h" - -#if LLVM_VERSION_MAJOR < 7 -#pragma pop_macro("DEBUG") -#endif - -#include "common/os.h" -#include "jit_api.h" -#include "gen_state_llvm.h" -#include "core/multisample.h" -#include "core/state_funcs.h" - -#include "gallivm/lp_bld_tgsi.h" -#include "util/format/u_format.h" - -#include "util/u_memory.h" -#include "util/u_inlines.h" -#include "util/u_helpers.h" -#include "util/u_framebuffer.h" -#include "util/u_viewport.h" -#include "util/u_prim.h" - -#include "swr_state.h" -#include "swr_context.h" -#include "gen_surf_state_llvm.h" -#include "gen_swr_context_llvm.h" -#include "swr_screen.h" -#include "swr_resource.h" -#include "swr_tex_sample.h" -#include "swr_scratch.h" -#include "swr_shader.h" -#include "swr_fence.h" - -/* These should be pulled out into separate files as necessary - * Just initializing everything here to get going. 
*/ - -static void * -swr_create_blend_state(struct pipe_context *pipe, - const struct pipe_blend_state *blend) -{ - struct swr_blend_state *state = CALLOC_STRUCT(swr_blend_state); - assert(state != nullptr); - - memcpy(&state->pipe, blend, sizeof(*blend)); - - struct pipe_blend_state *pipe_blend = &state->pipe; - - for (int target = 0; - target < std::min(SWR_NUM_RENDERTARGETS, PIPE_MAX_COLOR_BUFS); - target++) { - - struct pipe_rt_blend_state *rt_blend = &pipe_blend->rt[target]; - SWR_RENDER_TARGET_BLEND_STATE &blendState = - state->blendState.renderTarget[target]; - RENDER_TARGET_BLEND_COMPILE_STATE &compileState = - state->compileState[target]; - - if (target != 0 && !pipe_blend->independent_blend_enable) { - memcpy(&compileState, - &state->compileState[0], - sizeof(RENDER_TARGET_BLEND_COMPILE_STATE)); - continue; - } - - compileState.blendEnable = rt_blend->blend_enable; - if (compileState.blendEnable) { - compileState.sourceAlphaBlendFactor = - swr_convert_blend_factor(rt_blend->alpha_src_factor); - compileState.destAlphaBlendFactor = - swr_convert_blend_factor(rt_blend->alpha_dst_factor); - compileState.sourceBlendFactor = - swr_convert_blend_factor(rt_blend->rgb_src_factor); - compileState.destBlendFactor = - swr_convert_blend_factor(rt_blend->rgb_dst_factor); - - compileState.colorBlendFunc = - swr_convert_blend_func(rt_blend->rgb_func); - compileState.alphaBlendFunc = - swr_convert_blend_func(rt_blend->alpha_func); - } - compileState.logicOpEnable = state->pipe.logicop_enable; - if (compileState.logicOpEnable) { - compileState.logicOpFunc = - swr_convert_logic_op(state->pipe.logicop_func); - } - - blendState.writeDisableRed = - (rt_blend->colormask & PIPE_MASK_R) ? 0 : 1; - blendState.writeDisableGreen = - (rt_blend->colormask & PIPE_MASK_G) ? 0 : 1; - blendState.writeDisableBlue = - (rt_blend->colormask & PIPE_MASK_B) ? 0 : 1; - blendState.writeDisableAlpha = - (rt_blend->colormask & PIPE_MASK_A) ? 
0 : 1; - - if (rt_blend->colormask == 0) - compileState.blendEnable = false; - } - - return state; -} - -static void -swr_bind_blend_state(struct pipe_context *pipe, void *blend) -{ - struct swr_context *ctx = swr_context(pipe); - - if (ctx->blend == blend) - return; - - ctx->blend = (swr_blend_state *)blend; - - ctx->dirty |= SWR_NEW_BLEND; -} - -static void -swr_delete_blend_state(struct pipe_context *pipe, void *blend) -{ - FREE(blend); -} - -static void -swr_set_blend_color(struct pipe_context *pipe, - const struct pipe_blend_color *color) -{ - struct swr_context *ctx = swr_context(pipe); - - ctx->blend_color = *color; - - ctx->dirty |= SWR_NEW_BLEND; -} - -static void -swr_set_stencil_ref(struct pipe_context *pipe, - const struct pipe_stencil_ref ref) -{ - struct swr_context *ctx = swr_context(pipe); - - ctx->stencil_ref = ref; - - ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA; -} - -static void * -swr_create_depth_stencil_state( - struct pipe_context *pipe, - const struct pipe_depth_stencil_alpha_state *depth_stencil) -{ - struct pipe_depth_stencil_alpha_state *state; - - state = (pipe_depth_stencil_alpha_state *)mem_dup(depth_stencil, - sizeof *depth_stencil); - - return state; -} - -static void -swr_bind_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil) -{ - struct swr_context *ctx = swr_context(pipe); - - if (ctx->depth_stencil == (pipe_depth_stencil_alpha_state *)depth_stencil) - return; - - ctx->depth_stencil = (pipe_depth_stencil_alpha_state *)depth_stencil; - - ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA; -} - -static void -swr_delete_depth_stencil_state(struct pipe_context *pipe, void *depth) -{ - FREE(depth); -} - - -static void * -swr_create_rasterizer_state(struct pipe_context *pipe, - const struct pipe_rasterizer_state *rast) -{ - struct pipe_rasterizer_state *state; - state = (pipe_rasterizer_state *)mem_dup(rast, sizeof *rast); - - return state; -} - -static void -swr_bind_rasterizer_state(struct pipe_context *pipe, void *handle) -{ 
- struct swr_context *ctx = swr_context(pipe); - const struct pipe_rasterizer_state *rasterizer = - (const struct pipe_rasterizer_state *)handle; - - if (ctx->rasterizer == (pipe_rasterizer_state *)rasterizer) - return; - - ctx->rasterizer = (pipe_rasterizer_state *)rasterizer; - - ctx->dirty |= SWR_NEW_RASTERIZER; -} - -static void -swr_delete_rasterizer_state(struct pipe_context *pipe, void *rasterizer) -{ - FREE(rasterizer); -} - - -static void * -swr_create_sampler_state(struct pipe_context *pipe, - const struct pipe_sampler_state *sampler) -{ - struct pipe_sampler_state *state = - (pipe_sampler_state *)mem_dup(sampler, sizeof *sampler); - - return state; -} - -static void -swr_bind_sampler_states(struct pipe_context *pipe, - enum pipe_shader_type shader, - unsigned start, - unsigned num, - void **samplers) -{ - struct swr_context *ctx = swr_context(pipe); - unsigned i; - - assert(shader < PIPE_SHADER_TYPES); - assert(start + num <= ARRAY_SIZE(ctx->samplers[shader])); - - /* set the new samplers */ - ctx->num_samplers[shader] = num; - for (i = 0; i < num; i++) { - ctx->samplers[shader][start + i] = (pipe_sampler_state *)samplers[i]; - } - - ctx->dirty |= SWR_NEW_SAMPLER; -} - -static void -swr_delete_sampler_state(struct pipe_context *pipe, void *sampler) -{ - FREE(sampler); -} - - -static struct pipe_sampler_view * -swr_create_sampler_view(struct pipe_context *pipe, - struct pipe_resource *texture, - const struct pipe_sampler_view *templ) -{ - struct pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view); - - if (view) { - *view = *templ; - view->reference.count = 1; - view->texture = NULL; - pipe_resource_reference(&view->texture, texture); - view->context = pipe; - } - - return view; -} - -static void -swr_set_sampler_views(struct pipe_context *pipe, - enum pipe_shader_type shader, - unsigned start, - unsigned num, - unsigned unbind_num_trailing_slots, - bool take_ownership, - struct pipe_sampler_view **views) -{ - struct swr_context *ctx = 
swr_context(pipe); - uint i; - - assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS); - - assert(shader < PIPE_SHADER_TYPES); - assert(start + num <= ARRAY_SIZE(ctx->sampler_views[shader])); - - /* set the new sampler views */ - ctx->num_sampler_views[shader] = num; - for (i = 0; i < num; i++) { - if (take_ownership) { - pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i], - NULL); - ctx->sampler_views[shader][start + i] = views[i]; - } else { - pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i], - views[i]); - } - } - for (; i < num + unbind_num_trailing_slots; i++) { - pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i], - NULL); - } - - ctx->dirty |= SWR_NEW_SAMPLER_VIEW; -} - -static void -swr_sampler_view_destroy(struct pipe_context *pipe, - struct pipe_sampler_view *view) -{ - pipe_resource_reference(&view->texture, NULL); - FREE(view); -} - -static void * -swr_create_vs_state(struct pipe_context *pipe, - const struct pipe_shader_state *vs) -{ - struct swr_vertex_shader *swr_vs = new swr_vertex_shader; - if (!swr_vs) - return NULL; - - swr_vs->pipe.tokens = tgsi_dup_tokens(vs->tokens); - swr_vs->pipe.stream_output = vs->stream_output; - - lp_build_tgsi_info(vs->tokens, &swr_vs->info); - - swr_vs->soState = {0}; - - if (swr_vs->pipe.stream_output.num_outputs) { - pipe_stream_output_info *stream_output = &swr_vs->pipe.stream_output; - - swr_vs->soState.soEnable = true; - // soState.rasterizerDisable set on state dirty - // soState.streamToRasterizer not used - - for (uint32_t i = 0; i < stream_output->num_outputs; i++) { - unsigned attrib_slot = stream_output->output[i].register_index; - attrib_slot = swr_so_adjust_attrib(attrib_slot, swr_vs); - swr_vs->soState.streamMasks[stream_output->output[i].stream] |= - (1 << attrib_slot); - } - for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) { - swr_vs->soState.streamNumEntries[i] = - _mm_popcnt_u32(swr_vs->soState.streamMasks[i]); - } - } - - return swr_vs; -} - -static void 
-swr_bind_vs_state(struct pipe_context *pipe, void *vs) -{ - struct swr_context *ctx = swr_context(pipe); - - if (ctx->vs == vs) - return; - - ctx->vs = (swr_vertex_shader *)vs; - ctx->dirty |= SWR_NEW_VS; -} - -static void -swr_delete_vs_state(struct pipe_context *pipe, void *vs) -{ - struct swr_vertex_shader *swr_vs = (swr_vertex_shader *)vs; - FREE((void *)swr_vs->pipe.tokens); - struct swr_screen *screen = swr_screen(pipe->screen); - - /* Defer deletion of vs state */ - swr_fence_work_delete_vs(screen->flush_fence, swr_vs); -} - -static void * -swr_create_fs_state(struct pipe_context *pipe, - const struct pipe_shader_state *fs) -{ - struct swr_fragment_shader *swr_fs = new swr_fragment_shader; - if (!swr_fs) - return NULL; - - swr_fs->pipe.tokens = tgsi_dup_tokens(fs->tokens); - - lp_build_tgsi_info(fs->tokens, &swr_fs->info); - - return swr_fs; -} - - -static void -swr_bind_fs_state(struct pipe_context *pipe, void *fs) -{ - struct swr_context *ctx = swr_context(pipe); - - if (ctx->fs == fs) - return; - - ctx->fs = (swr_fragment_shader *)fs; - ctx->dirty |= SWR_NEW_FS; -} - -static void -swr_delete_fs_state(struct pipe_context *pipe, void *fs) -{ - struct swr_fragment_shader *swr_fs = (swr_fragment_shader *)fs; - FREE((void *)swr_fs->pipe.tokens); - struct swr_screen *screen = swr_screen(pipe->screen); - - /* Defer deleton of fs state */ - swr_fence_work_delete_fs(screen->flush_fence, swr_fs); -} - -static void * -swr_create_gs_state(struct pipe_context *pipe, - const struct pipe_shader_state *gs) -{ - struct swr_geometry_shader *swr_gs = new swr_geometry_shader; - if (!swr_gs) - return NULL; - - swr_gs->pipe.tokens = tgsi_dup_tokens(gs->tokens); - lp_build_tgsi_info(gs->tokens, &swr_gs->info); - return swr_gs; -} - -static void -swr_bind_gs_state(struct pipe_context *pipe, void *gs) -{ - struct swr_context *ctx = swr_context(pipe); - - if (ctx->gs == gs) - return; - - ctx->gs = (swr_geometry_shader *)gs; - ctx->dirty |= SWR_NEW_GS; -} - -static void 
-swr_delete_gs_state(struct pipe_context *pipe, void *gs) -{ - struct swr_geometry_shader *swr_gs = (swr_geometry_shader *)gs; - FREE((void *)swr_gs->pipe.tokens); - struct swr_screen *screen = swr_screen(pipe->screen); - - /* Defer deleton of fs state */ - swr_fence_work_delete_gs(screen->flush_fence, swr_gs); -} - -static void * -swr_create_tcs_state(struct pipe_context *pipe, - const struct pipe_shader_state *tcs) -{ - struct swr_tess_control_shader *swr_tcs = new swr_tess_control_shader; - if (!swr_tcs) - return NULL; - - swr_tcs->pipe.tokens = tgsi_dup_tokens(tcs->tokens); - lp_build_tgsi_info(tcs->tokens, &swr_tcs->info); - return swr_tcs; -} - -static void -swr_bind_tcs_state(struct pipe_context *pipe, void *tcs) -{ - struct swr_context *ctx = swr_context(pipe); - - if (ctx->tcs == tcs) - return; - - ctx->tcs = (swr_tess_control_shader *)tcs; - ctx->dirty |= SWR_NEW_TCS; - ctx->dirty |= SWR_NEW_TS; -} - -static void -swr_delete_tcs_state(struct pipe_context *pipe, void *tcs) -{ - struct swr_tess_control_shader *swr_tcs = (swr_tess_control_shader *)tcs; - FREE((void *)swr_tcs->pipe.tokens); - struct swr_screen *screen = swr_screen(pipe->screen); - - /* Defer deleton of tcs state */ - swr_fence_work_delete_tcs(screen->flush_fence, swr_tcs); -} - -static void * -swr_create_tes_state(struct pipe_context *pipe, - const struct pipe_shader_state *tes) -{ - struct swr_tess_evaluation_shader *swr_tes = new swr_tess_evaluation_shader; - if (!swr_tes) - return NULL; - - swr_tes->pipe.tokens = tgsi_dup_tokens(tes->tokens); - lp_build_tgsi_info(tes->tokens, &swr_tes->info); - return swr_tes; -} - -static void -swr_bind_tes_state(struct pipe_context *pipe, void *tes) -{ - struct swr_context *ctx = swr_context(pipe); - - if (ctx->tes == tes) - return; - - // Save current tessellator state first - if (ctx->tes != nullptr) { - ctx->tes->ts_state = ctx->tsState; - } - - ctx->tes = (swr_tess_evaluation_shader *)tes; - - ctx->dirty |= SWR_NEW_TES; - ctx->dirty |= SWR_NEW_TS; -} 
- -static void -swr_delete_tes_state(struct pipe_context *pipe, void *tes) -{ - struct swr_tess_evaluation_shader *swr_tes = (swr_tess_evaluation_shader *)tes; - FREE((void *)swr_tes->pipe.tokens); - struct swr_screen *screen = swr_screen(pipe->screen); - - /* Defer deleton of tes state */ - swr_fence_work_delete_tes(screen->flush_fence, swr_tes); -} - -static void -swr_set_constant_buffer(struct pipe_context *pipe, - enum pipe_shader_type shader, - uint index, bool take_ownership, - const struct pipe_constant_buffer *cb) -{ - struct swr_context *ctx = swr_context(pipe); - struct pipe_resource *constants = cb ? cb->buffer : NULL; - - assert(shader < PIPE_SHADER_TYPES); - assert(index < ARRAY_SIZE(ctx->constants[shader])); - - /* note: reference counting */ - util_copy_constant_buffer(&ctx->constants[shader][index], cb, take_ownership); - - if (shader == PIPE_SHADER_VERTEX) { - ctx->dirty |= SWR_NEW_VSCONSTANTS; - } else if (shader == PIPE_SHADER_FRAGMENT) { - ctx->dirty |= SWR_NEW_FSCONSTANTS; - } else if (shader == PIPE_SHADER_GEOMETRY) { - ctx->dirty |= SWR_NEW_GSCONSTANTS; - } else if (shader == PIPE_SHADER_TESS_CTRL) { - ctx->dirty |= SWR_NEW_TCSCONSTANTS; - } else if (shader == PIPE_SHADER_TESS_EVAL) { - ctx->dirty |= SWR_NEW_TESCONSTANTS; - } - if (cb && cb->user_buffer) { - pipe_resource_reference(&constants, NULL); - } -} - - -static void * -swr_create_vertex_elements_state(struct pipe_context *pipe, - unsigned num_elements, - const struct pipe_vertex_element *attribs) -{ - struct swr_vertex_element_state *velems; - assert(num_elements <= PIPE_MAX_ATTRIBS); - velems = new swr_vertex_element_state; - if (velems) { - memset((void*)&velems->fsState, 0, sizeof(velems->fsState)); - velems->fsState.bVertexIDOffsetEnable = true; - velems->fsState.numAttribs = num_elements; - for (unsigned i = 0; i < num_elements; i++) { - // XXX: we should do this keyed on the VS usage info - - const struct util_format_description *desc = - util_format_description((enum 
pipe_format)attribs[i].src_format); - - velems->fsState.layout[i].AlignedByteOffset = attribs[i].src_offset; - velems->fsState.layout[i].Format = - mesa_to_swr_format((enum pipe_format)attribs[i].src_format); - velems->fsState.layout[i].StreamIndex = - attribs[i].vertex_buffer_index; - velems->fsState.layout[i].InstanceEnable = - attribs[i].instance_divisor != 0; - velems->fsState.layout[i].ComponentControl0 = - desc->channel[0].type != UTIL_FORMAT_TYPE_VOID - ? ComponentControl::StoreSrc - : ComponentControl::Store0; - velems->fsState.layout[i].ComponentControl1 = - desc->channel[1].type != UTIL_FORMAT_TYPE_VOID - ? ComponentControl::StoreSrc - : ComponentControl::Store0; - velems->fsState.layout[i].ComponentControl2 = - desc->channel[2].type != UTIL_FORMAT_TYPE_VOID - ? ComponentControl::StoreSrc - : ComponentControl::Store0; - velems->fsState.layout[i].ComponentControl3 = - desc->channel[3].type != UTIL_FORMAT_TYPE_VOID - ? ComponentControl::StoreSrc - : ComponentControl::Store1Fp; - velems->fsState.layout[i].ComponentPacking = ComponentEnable::XYZW; - velems->fsState.layout[i].InstanceAdvancementState = - attribs[i].instance_divisor; - - /* Calculate the pitch of each stream */ - const SWR_FORMAT_INFO &swr_desc = GetFormatInfo( - mesa_to_swr_format((enum pipe_format)attribs[i].src_format)); - velems->stream_pitch[attribs[i].vertex_buffer_index] += swr_desc.Bpp; - - if (attribs[i].instance_divisor != 0) { - velems->instanced_bufs |= 1U << attribs[i].vertex_buffer_index; - uint32_t *min_instance_div = - &velems->min_instance_div[attribs[i].vertex_buffer_index]; - if (!*min_instance_div || - attribs[i].instance_divisor < *min_instance_div) - *min_instance_div = attribs[i].instance_divisor; - } - } - } - - return velems; -} - -static void -swr_bind_vertex_elements_state(struct pipe_context *pipe, void *velems) -{ - struct swr_context *ctx = swr_context(pipe); - struct swr_vertex_element_state *swr_velems = - (struct swr_vertex_element_state *)velems; - - 
ctx->velems = swr_velems; - ctx->dirty |= SWR_NEW_VERTEX; -} - -static void -swr_delete_vertex_elements_state(struct pipe_context *pipe, void *velems) -{ - struct swr_vertex_element_state *swr_velems = - (struct swr_vertex_element_state *) velems; - /* XXX Need to destroy fetch shader? */ - delete swr_velems; -} - - -static void -swr_set_vertex_buffers(struct pipe_context *pipe, - unsigned start_slot, - unsigned num_elements, - unsigned unbind_num_trailing_slots, - bool take_ownership, - const struct pipe_vertex_buffer *buffers) -{ - struct swr_context *ctx = swr_context(pipe); - - assert(num_elements <= PIPE_MAX_ATTRIBS); - - util_set_vertex_buffers_count(ctx->vertex_buffer, - &ctx->num_vertex_buffers, - buffers, - start_slot, - num_elements, - unbind_num_trailing_slots, - take_ownership); - - ctx->dirty |= SWR_NEW_VERTEX; -} - - -static void -swr_set_polygon_stipple(struct pipe_context *pipe, - const struct pipe_poly_stipple *stipple) -{ - struct swr_context *ctx = swr_context(pipe); - - ctx->poly_stipple.pipe = *stipple; /* struct copy */ - ctx->dirty |= SWR_NEW_STIPPLE; -} - -static void -swr_set_clip_state(struct pipe_context *pipe, - const struct pipe_clip_state *clip) -{ - struct swr_context *ctx = swr_context(pipe); - - ctx->clip = *clip; - /* XXX Unimplemented, but prevents crash */ - - ctx->dirty |= SWR_NEW_CLIP; -} - - -static void -swr_set_scissor_states(struct pipe_context *pipe, - unsigned start_slot, - unsigned num_scissors, - const struct pipe_scissor_state *scissors) -{ - struct swr_context *ctx = swr_context(pipe); - - memcpy(ctx->scissors + start_slot, scissors, - sizeof(struct pipe_scissor_state) * num_scissors); - - for (unsigned i = 0; i < num_scissors; i++) { - auto idx = start_slot + i; - ctx->swr_scissors[idx].xmin = scissors[idx].minx; - ctx->swr_scissors[idx].xmax = scissors[idx].maxx; - ctx->swr_scissors[idx].ymin = scissors[idx].miny; - ctx->swr_scissors[idx].ymax = scissors[idx].maxy; - } - ctx->dirty |= SWR_NEW_SCISSOR; -} - -static 
void -swr_set_viewport_states(struct pipe_context *pipe, - unsigned start_slot, - unsigned num_viewports, - const struct pipe_viewport_state *vpt) -{ - struct swr_context *ctx = swr_context(pipe); - - memcpy(ctx->viewports + start_slot, vpt, sizeof(struct pipe_viewport_state) * num_viewports); - ctx->dirty |= SWR_NEW_VIEWPORT; -} - - -static void -swr_set_framebuffer_state(struct pipe_context *pipe, - const struct pipe_framebuffer_state *fb) -{ - struct swr_context *ctx = swr_context(pipe); - - bool changed = !util_framebuffer_state_equal(&ctx->framebuffer, fb); - - assert(fb->width <= KNOB_GUARDBAND_WIDTH); - assert(fb->height <= KNOB_GUARDBAND_HEIGHT); - - if (changed) { - util_copy_framebuffer_state(&ctx->framebuffer, fb); - - /* 0 and 1 both indicate no msaa. Core doesn't understand 0 samples */ - ctx->framebuffer.samples = std::max((ubyte)1, ctx->framebuffer.samples); - - ctx->dirty |= SWR_NEW_FRAMEBUFFER; - } -} - - -static void -swr_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask) -{ - struct swr_context *ctx = swr_context(pipe); - - if (sample_mask != ctx->sample_mask) { - ctx->sample_mask = sample_mask; - ctx->dirty |= SWR_NEW_RASTERIZER; - } -} - -/* - * MSAA fixed sample position table - * used by update_derived and get_sample_position - * (integer locations on a 16x16 grid) - */ -static const uint8_t swr_sample_positions[][2] = -{ /* 1x*/ { 8, 8}, - /* 2x*/ {12,12},{ 4, 4}, - /* 4x*/ { 6, 2},{14, 6},{ 2,10},{10,14}, - /* 8x*/ { 9, 5},{ 7,11},{13, 9},{ 5, 3}, - { 3,13},{ 1, 7},{11,15},{15, 1}, - /*16x*/ { 9, 9},{ 7, 5},{ 5,10},{12, 7}, - { 3, 6},{10,13},{13,11},{11, 3}, - { 6,14},{ 8, 1},{ 4, 2},{ 2,12}, - { 0, 8},{15, 4},{14,15},{ 1, 0} }; - -static void -swr_get_sample_position(struct pipe_context *pipe, - unsigned sample_count, unsigned sample_index, - float *out_value) -{ - /* validate sample_count */ - sample_count = GetNumSamples(GetSampleCount(sample_count)); - - const uint8_t *sample = swr_sample_positions[sample_count-1 + 
sample_index]; - out_value[0] = sample[0] / 16.0f; - out_value[1] = sample[1] / 16.0f; -} - - -/* - * Update resource in-use status - * All resources bound to color or depth targets marked as WRITE resources. - * VBO Vertex/index buffers and texture views marked as READ resources. - */ -void -swr_update_resource_status(struct pipe_context *pipe, - const struct pipe_draw_info *p_draw_info) -{ - struct swr_context *ctx = swr_context(pipe); - struct pipe_framebuffer_state *fb = &ctx->framebuffer; - - /* colorbuffer targets */ - if (fb->nr_cbufs) - for (uint32_t i = 0; i < fb->nr_cbufs; ++i) - if (fb->cbufs[i]) - swr_resource_write(fb->cbufs[i]->texture); - - /* depth/stencil target */ - if (fb->zsbuf) - swr_resource_write(fb->zsbuf->texture); - - /* VBO vertex buffers */ - for (uint32_t i = 0; i < ctx->num_vertex_buffers; i++) { - struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[i]; - if (!vb->is_user_buffer && vb->buffer.resource) - swr_resource_read(vb->buffer.resource); - } - - /* VBO index buffer */ - if (p_draw_info && p_draw_info->index_size) { - if (!p_draw_info->has_user_indices) - swr_resource_read(p_draw_info->index.resource); - } - - /* transform feedback buffers */ - for (uint32_t i = 0; i < ctx->num_so_targets; i++) { - struct pipe_stream_output_target *target = ctx->so_targets[i]; - if (target && target->buffer) - swr_resource_write(target->buffer); - } - - /* texture sampler views */ - for (uint32_t j : {PIPE_SHADER_VERTEX, PIPE_SHADER_FRAGMENT}) { - for (uint32_t i = 0; i < ctx->num_sampler_views[j]; i++) { - struct pipe_sampler_view *view = ctx->sampler_views[j][i]; - if (view) - swr_resource_read(view->texture); - } - } - - /* constant buffers */ - for (uint32_t j : {PIPE_SHADER_VERTEX, PIPE_SHADER_FRAGMENT}) { - for (uint32_t i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { - struct pipe_constant_buffer *cb = &ctx->constants[j][i]; - if (cb->buffer) - swr_resource_read(cb->buffer); - } - } -} - -static void -swr_update_texture_state(struct 
swr_context *ctx, - enum pipe_shader_type shader_type, - unsigned num_sampler_views, - swr_jit_texture *textures) -{ - for (unsigned i = 0; i < num_sampler_views; i++) { - struct pipe_sampler_view *view = - ctx->sampler_views[shader_type][i]; - struct swr_jit_texture *jit_tex = &textures[i]; - - memset(jit_tex, 0, sizeof(*jit_tex)); - if (view) { - struct pipe_resource *res = view->texture; - struct swr_resource *swr_res = swr_resource(res); - SWR_SURFACE_STATE *swr = &swr_res->swr; - size_t *mip_offsets = swr_res->mip_offsets; - if (swr_res->has_depth && swr_res->has_stencil && - !util_format_has_depth(util_format_description(view->format))) { - swr = &swr_res->secondary; - mip_offsets = swr_res->secondary_mip_offsets; - } - - jit_tex->width = res->width0; - jit_tex->height = res->height0; - jit_tex->base_ptr = (uint8_t*)swr->xpBaseAddress; - jit_tex->num_samples = swr->numSamples; - jit_tex->sample_stride = 0; - if (view->target != PIPE_BUFFER) { - jit_tex->first_level = view->u.tex.first_level; - jit_tex->last_level = view->u.tex.last_level; - if (view->target == PIPE_TEXTURE_3D) - jit_tex->depth = res->depth0; - else - jit_tex->depth = - view->u.tex.last_layer - view->u.tex.first_layer + 1; - jit_tex->base_ptr += view->u.tex.first_layer * - swr->qpitch * swr->pitch; - } else { - unsigned view_blocksize = util_format_get_blocksize(view->format); - jit_tex->base_ptr += view->u.buf.offset; - jit_tex->width = view->u.buf.size / view_blocksize; - jit_tex->depth = 1; - } - - for (unsigned level = jit_tex->first_level; - level <= jit_tex->last_level; - level++) { - jit_tex->row_stride[level] = swr->pitch; - jit_tex->img_stride[level] = swr->qpitch * swr->pitch; - jit_tex->mip_offsets[level] = mip_offsets[level]; - } - } - } -} - -static void -swr_update_sampler_state(struct swr_context *ctx, - enum pipe_shader_type shader_type, - unsigned num_samplers, - swr_jit_sampler *samplers) -{ - for (unsigned i = 0; i < num_samplers; i++) { - const struct pipe_sampler_state 
*sampler = - ctx->samplers[shader_type][i]; - - if (sampler) { - samplers[i].min_lod = sampler->min_lod; - samplers[i].max_lod = sampler->max_lod; - samplers[i].lod_bias = sampler->lod_bias; - COPY_4V(samplers[i].border_color, sampler->border_color.f); - } - } -} - -static void -swr_update_constants(struct swr_context *ctx, enum pipe_shader_type shaderType) -{ - swr_draw_context *pDC = &ctx->swrDC; - - const float **constant; - uint32_t *num_constants; - struct swr_scratch_space *scratch; - - switch (shaderType) { - case PIPE_SHADER_VERTEX: - constant = pDC->constantVS; - num_constants = pDC->num_constantsVS; - scratch = &ctx->scratch->vs_constants; - break; - case PIPE_SHADER_FRAGMENT: - constant = pDC->constantFS; - num_constants = pDC->num_constantsFS; - scratch = &ctx->scratch->fs_constants; - break; - case PIPE_SHADER_GEOMETRY: - constant = pDC->constantGS; - num_constants = pDC->num_constantsGS; - scratch = &ctx->scratch->gs_constants; - break; - case PIPE_SHADER_TESS_CTRL: - constant = pDC->constantTCS; - num_constants = pDC->num_constantsTCS; - scratch = &ctx->scratch->tcs_constants; - break; - case PIPE_SHADER_TESS_EVAL: - constant = pDC->constantTES; - num_constants = pDC->num_constantsTES; - scratch = &ctx->scratch->tes_constants; - break; - default: - assert(0 && "Unsupported shader type constants"); - return; - } - - for (UINT i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { - const pipe_constant_buffer *cb = &ctx->constants[shaderType][i]; - num_constants[i] = cb->buffer_size; - if (cb->buffer) { - constant[i] = - (const float *)(swr_resource_data(cb->buffer) + - cb->buffer_offset); - } else { - /* Need to copy these constants to scratch space */ - if (cb->user_buffer && cb->buffer_size) { - const void *ptr = - ((const uint8_t *)cb->user_buffer + cb->buffer_offset); - uint32_t size = AlignUp(cb->buffer_size, 4); - ptr = swr_copy_to_scratch_space(ctx, scratch, ptr, size); - constant[i] = (const float *)ptr; - } - } - } -} - -static bool 
-swr_change_rt(struct swr_context *ctx, - unsigned attachment, - const struct pipe_surface *sf) -{ - swr_draw_context *pDC = &ctx->swrDC; - struct SWR_SURFACE_STATE *rt = &pDC->renderTargets[attachment]; - - /* Do nothing if the render target hasn't changed */ - if ((!sf || !sf->texture) && (void*)(rt->xpBaseAddress) == nullptr) - return false; - - /* Deal with disabling RT up front */ - if (!sf || !sf->texture) { - /* If detaching attachment, mark tiles as RESOLVED so core - * won't try to load from non-existent target. */ - swr_store_render_target(&ctx->pipe, attachment, SWR_TILE_RESOLVED); - *rt = {0}; - return true; - } - - const struct swr_resource *swr = swr_resource(sf->texture); - const SWR_SURFACE_STATE *swr_surface = &swr->swr; - SWR_FORMAT fmt = mesa_to_swr_format(sf->format); - - if (attachment == SWR_ATTACHMENT_STENCIL && swr->secondary.xpBaseAddress) { - swr_surface = &swr->secondary; - fmt = swr_surface->format; - } - - if (rt->xpBaseAddress == swr_surface->xpBaseAddress && - rt->format == fmt && - rt->lod == sf->u.tex.level && - rt->arrayIndex == sf->u.tex.first_layer) - return false; - - bool need_fence = false; - - /* StoreTile for changed target */ - if (rt->xpBaseAddress) { - /* If changing attachment to a new target, mark tiles as - * INVALID so they are reloaded from surface. */ - swr_store_render_target(&ctx->pipe, attachment, SWR_TILE_INVALID); - need_fence = true; - } else { - /* if no previous attachment, invalidate tiles that may be marked - * RESOLVED because of an old attachment */ - swr_invalidate_render_target(&ctx->pipe, attachment, sf->width, sf->height); - /* no need to set fence here */ - } - - /* Make new attachment */ - *rt = *swr_surface; - rt->format = fmt; - rt->lod = sf->u.tex.level; - rt->arrayIndex = sf->u.tex.first_layer; - - return need_fence; -} - -/* - * for cases where resources are shared between contexts, invalidate - * this ctx's resource. so it can be fetched fresh. 
Old ctx's resource - * is already stored during a flush - */ -static inline void -swr_invalidate_buffers_after_ctx_change(struct pipe_context *pipe) -{ - struct swr_context *ctx = swr_context(pipe); - - for (uint32_t i = 0; i < ctx->framebuffer.nr_cbufs; i++) { - struct pipe_surface *cb = ctx->framebuffer.cbufs[i]; - if (cb) { - struct swr_resource *res = swr_resource(cb->texture); - if (res->curr_pipe != pipe) { - /* if curr_pipe is NULL (first use), status should not be WRITE */ - assert(res->curr_pipe || !(res->status & SWR_RESOURCE_WRITE)); - if (res->status & SWR_RESOURCE_WRITE) { - swr_invalidate_render_target(pipe, i, cb->width, cb->height); - } - } - res->curr_pipe = pipe; - } - } - if (ctx->framebuffer.zsbuf) { - struct pipe_surface *zb = ctx->framebuffer.zsbuf; - if (zb) { - struct swr_resource *res = swr_resource(zb->texture); - if (res->curr_pipe != pipe) { - /* if curr_pipe is NULL (first use), status should not be WRITE */ - assert(res->curr_pipe || !(res->status & SWR_RESOURCE_WRITE)); - if (res->status & SWR_RESOURCE_WRITE) { - swr_invalidate_render_target(pipe, SWR_ATTACHMENT_DEPTH, zb->width, zb->height); - swr_invalidate_render_target(pipe, SWR_ATTACHMENT_STENCIL, zb->width, zb->height); - } - } - res->curr_pipe = pipe; - } - } -} - -static inline void -swr_user_vbuf_range(const struct pipe_draw_info *info, - const struct swr_vertex_element_state *velems, - const struct pipe_vertex_buffer *vb, - uint32_t i, - uint32_t *totelems, - uint32_t *base, - uint32_t *size, - int index_bias) -{ - /* FIXME: The size is too large - we don't access the full extra stride. 
*/ - unsigned elems; - unsigned elem_pitch = vb->stride + velems->stream_pitch[i]; - if (velems->instanced_bufs & (1U << i)) { - elems = info->instance_count / velems->min_instance_div[i] + 1; - *totelems = info->start_instance + elems; - *base = info->start_instance * vb->stride; - *size = elems * elem_pitch; - } else if (vb->stride) { - elems = info->max_index - info->min_index + 1; - *totelems = (info->max_index + (info->index_size ? index_bias : 0)) + 1; - *base = (info->min_index + (info->index_size ? index_bias : 0)) * vb->stride; - *size = elems * elem_pitch; - } else { - *totelems = 1; - *base = 0; - *size = velems->stream_pitch[i]; - } -} - -static void -swr_update_poly_stipple(struct swr_context *ctx) -{ - struct swr_draw_context *pDC = &ctx->swrDC; - - assert(sizeof(ctx->poly_stipple.pipe.stipple) == sizeof(pDC->polyStipple)); - memcpy(pDC->polyStipple, - ctx->poly_stipple.pipe.stipple, - sizeof(ctx->poly_stipple.pipe.stipple)); -} - - -static struct tgsi_shader_info * -swr_get_last_fe(const struct swr_context *ctx) -{ - tgsi_shader_info *pLastFE = &ctx->vs->info.base; - - if (ctx->gs) { - pLastFE = &ctx->gs->info.base; - } - else if (ctx->tes) { - pLastFE = &ctx->tes->info.base; - } - else if (ctx->tcs) { - pLastFE = &ctx->tcs->info.base; - } - return pLastFE; -} - - -void -swr_update_derived(struct pipe_context *pipe, - const struct pipe_draw_info *p_draw_info, - const struct pipe_draw_start_count_bias *draw) -{ - struct swr_context *ctx = swr_context(pipe); - struct swr_screen *screen = swr_screen(pipe->screen); - - /* When called from swr_clear (p_draw_info = null), set any null - * state-objects to the dummy state objects to prevent nullptr dereference - * in validation below. - * - * Important that this remains static for zero initialization. These - * aren't meant to be proper state objects, just empty structs. They will - * not be written to. 
- * - * Shaders can't be part of the union since they contain std::unordered_map - */ - static struct { - union { - struct pipe_rasterizer_state rasterizer; - struct pipe_depth_stencil_alpha_state depth_stencil; - struct swr_blend_state blend; - } state; - struct swr_vertex_shader vs; - struct swr_fragment_shader fs; - } swr_dummy; - - if (!p_draw_info) { - if (!ctx->rasterizer) - ctx->rasterizer = &swr_dummy.state.rasterizer; - if (!ctx->depth_stencil) - ctx->depth_stencil = &swr_dummy.state.depth_stencil; - if (!ctx->blend) - ctx->blend = &swr_dummy.state.blend; - if (!ctx->vs) - ctx->vs = &swr_dummy.vs; - if (!ctx->fs) - ctx->fs = &swr_dummy.fs; - } - - /* Update screen->pipe to current pipe context. */ - screen->pipe = pipe; - - /* Any state that requires dirty flags to be re-triggered sets this mask */ - /* For example, user_buffer vertex and index buffers. */ - unsigned post_update_dirty_flags = 0; - - /* bring resources that changed context up-to-date */ - swr_invalidate_buffers_after_ctx_change(pipe); - - /* Render Targets */ - if (ctx->dirty & SWR_NEW_FRAMEBUFFER) { - struct pipe_framebuffer_state *fb = &ctx->framebuffer; - const struct util_format_description *desc = NULL; - bool need_fence = false; - - /* colorbuffer targets */ - if (fb->nr_cbufs) { - for (unsigned i = 0; i < fb->nr_cbufs; ++i) - need_fence |= swr_change_rt( - ctx, SWR_ATTACHMENT_COLOR0 + i, fb->cbufs[i]); - } - for (unsigned i = fb->nr_cbufs; i < SWR_NUM_RENDERTARGETS; ++i) - need_fence |= swr_change_rt(ctx, SWR_ATTACHMENT_COLOR0 + i, NULL); - - /* depth/stencil target */ - if (fb->zsbuf) - desc = util_format_description(fb->zsbuf->format); - if (fb->zsbuf && util_format_has_depth(desc)) - need_fence |= swr_change_rt(ctx, SWR_ATTACHMENT_DEPTH, fb->zsbuf); - else - need_fence |= swr_change_rt(ctx, SWR_ATTACHMENT_DEPTH, NULL); - - if (fb->zsbuf && util_format_has_stencil(desc)) - need_fence |= swr_change_rt(ctx, SWR_ATTACHMENT_STENCIL, fb->zsbuf); - else - need_fence |= swr_change_rt(ctx, 
SWR_ATTACHMENT_STENCIL, NULL); - - /* This fence ensures any attachment changes are resolved before the - * next draw */ - if (need_fence) - swr_fence_submit(ctx, screen->flush_fence); - } - - /* Raster state */ - if (ctx->dirty & (SWR_NEW_RASTERIZER | - SWR_NEW_VS | // clipping - SWR_NEW_TES | - SWR_NEW_TCS | - SWR_NEW_FRAMEBUFFER)) { - pipe_rasterizer_state *rasterizer = ctx->rasterizer; - pipe_framebuffer_state *fb = &ctx->framebuffer; - - SWR_RASTSTATE *rastState = &ctx->derived.rastState; - rastState->cullMode = swr_convert_cull_mode(rasterizer->cull_face); - rastState->frontWinding = rasterizer->front_ccw - ? SWR_FRONTWINDING_CCW - : SWR_FRONTWINDING_CW; - rastState->scissorEnable = rasterizer->scissor; - rastState->pointSize = rasterizer->point_size > 0.0f - ? rasterizer->point_size - : 1.0f; - rastState->lineWidth = rasterizer->line_width > 0.0f - ? rasterizer->line_width - : 1.0f; - - rastState->pointParam = rasterizer->point_size_per_vertex; - - rastState->pointSpriteEnable = rasterizer->sprite_coord_enable; - rastState->pointSpriteTopOrigin = - rasterizer->sprite_coord_mode == PIPE_SPRITE_COORD_UPPER_LEFT; - - /* If SWR_MSAA_FORCE_ENABLE is set, turn msaa on */ - if (screen->msaa_force_enable && !rasterizer->multisample) { - /* Force enable and use the value the surface was created with */ - rasterizer->multisample = true; - fb->samples = swr_resource(fb->cbufs[0]->texture)->swr.numSamples; - fprintf(stderr,"msaa force enable: %d samples\n", fb->samples); - } - - rastState->sampleCount = GetSampleCount(fb->samples); - rastState->forcedSampleCount = false; - rastState->bIsCenterPattern = !rasterizer->multisample; - rastState->pixelLocation = SWR_PIXEL_LOCATION_CENTER; - - /* Only initialize sample positions if msaa is enabled */ - if (rasterizer->multisample) { - for (uint32_t i = 0; i < fb->samples; i++) { - const uint8_t *sample = swr_sample_positions[fb->samples-1 + i]; - rastState->samplePositions.SetXi(i, sample[0] << 4); - 
rastState->samplePositions.SetYi(i, sample[1] << 4); - rastState->samplePositions.SetX (i, sample[0] / 16.0f); - rastState->samplePositions.SetY (i, sample[1] / 16.0f); - } - rastState->samplePositions.PrecalcSampleData(fb->samples); - } - - bool do_offset = false; - switch (rasterizer->fill_front) { - case PIPE_POLYGON_MODE_FILL: - do_offset = rasterizer->offset_tri; - break; - case PIPE_POLYGON_MODE_LINE: - do_offset = rasterizer->offset_line; - break; - case PIPE_POLYGON_MODE_POINT: - do_offset = rasterizer->offset_point; - break; - } - - if (do_offset) { - rastState->depthBias = rasterizer->offset_units; - rastState->slopeScaledDepthBias = rasterizer->offset_scale; - rastState->depthBiasClamp = rasterizer->offset_clamp; - } else { - rastState->depthBias = 0; - rastState->slopeScaledDepthBias = 0; - rastState->depthBiasClamp = 0; - } - - /* translate polygon mode, at least for the front==back case */ - rastState->fillMode = swr_convert_fill_mode(rasterizer->fill_front); - - struct pipe_surface *zb = fb->zsbuf; - if (zb && swr_resource(zb->texture)->has_depth) - rastState->depthFormat = swr_resource(zb->texture)->swr.format; - - rastState->depthClipEnable = rasterizer->depth_clip_near; - rastState->clipEnable = rasterizer->depth_clip_near | rasterizer->depth_clip_far; - rastState->clipHalfZ = rasterizer->clip_halfz; - - ctx->api.pfnSwrSetRastState(ctx->swrContext, rastState); - } - - /* Viewport */ - if (ctx->dirty & (SWR_NEW_VIEWPORT | SWR_NEW_FRAMEBUFFER - | SWR_NEW_RASTERIZER)) { - pipe_viewport_state *state = &ctx->viewports[0]; - pipe_framebuffer_state *fb = &ctx->framebuffer; - pipe_rasterizer_state *rasterizer = ctx->rasterizer; - - SWR_VIEWPORT *vp = &ctx->derived.vp[0]; - SWR_VIEWPORT_MATRICES *vpm = &ctx->derived.vpm; - - for (unsigned i = 0; i < KNOB_NUM_VIEWPORTS_SCISSORS; i++) { - vp->x = state->translate[0] - state->scale[0]; - vp->width = 2 * state->scale[0]; - vp->y = state->translate[1] - fabs(state->scale[1]); - vp->height = 2 * 
fabs(state->scale[1]); - util_viewport_zmin_zmax(state, rasterizer->clip_halfz, - &vp->minZ, &vp->maxZ); - - if (rasterizer->depth_clip_near) { - vp->minZ = 0.0f; - } - - if (rasterizer->depth_clip_far) { - vp->maxZ = 1.0f; - } - - vpm->m00[i] = state->scale[0]; - vpm->m11[i] = state->scale[1]; - vpm->m22[i] = state->scale[2]; - vpm->m30[i] = state->translate[0]; - vpm->m31[i] = state->translate[1]; - vpm->m32[i] = state->translate[2]; - - /* Now that the matrix is calculated, clip the view coords to screen - * size. OpenGL allows for -ve x,y in the viewport. */ - if (vp->x < 0.0f) { - vp->width += vp->x; - vp->x = 0.0f; - } - if (vp->y < 0.0f) { - vp->height += vp->y; - vp->y = 0.0f; - } - vp->width = std::min(vp->width, (float) fb->width - vp->x); - vp->height = std::min(vp->height, (float) fb->height - vp->y); - - vp++; - state++; - } - ctx->api.pfnSwrSetViewports(ctx->swrContext, KNOB_NUM_VIEWPORTS_SCISSORS, - &ctx->derived.vp[0], &ctx->derived.vpm); - } - - /* When called from swr_clear (p_draw_info = null), render targets, - * rasterState and viewports (dependent on render targets) are the only - * necessary validation. Defer remaining validation by setting - * post_update_dirty_flags and clear all dirty flags. 
BackendState is - * still unconditionally validated below */ - if (!p_draw_info) { - post_update_dirty_flags = ctx->dirty & ~(SWR_NEW_FRAMEBUFFER | - SWR_NEW_RASTERIZER | - SWR_NEW_VIEWPORT); - ctx->dirty = 0; - } - - /* Scissor */ - if (ctx->dirty & SWR_NEW_SCISSOR) { - ctx->api.pfnSwrSetScissorRects(ctx->swrContext, KNOB_NUM_VIEWPORTS_SCISSORS, ctx->swr_scissors); - } - - /* Set vertex & index buffers */ - if (ctx->dirty & SWR_NEW_VERTEX) { - const struct pipe_draw_info &info = *p_draw_info; - - /* vertex buffers */ - SWR_VERTEX_BUFFER_STATE swrVertexBuffers[PIPE_MAX_ATTRIBS]; - for (UINT i = 0; i < ctx->num_vertex_buffers; i++) { - uint32_t size = 0, pitch = 0, elems = 0, partial_inbounds = 0; - uint32_t min_vertex_index = 0; - const uint8_t *p_data; - struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[i]; - - pitch = vb->stride; - if (vb->is_user_buffer) { - /* Client buffer - * client memory is one-time use, re-trigger SWR_NEW_VERTEX to - * revalidate on each draw */ - post_update_dirty_flags |= SWR_NEW_VERTEX; - - uint32_t base; - swr_user_vbuf_range(&info, ctx->velems, vb, i, &elems, &base, &size, draw->index_bias); - partial_inbounds = 0; - min_vertex_index = info.min_index + (info.index_size ? draw->index_bias : 0); - - size = AlignUp(size, 4); - /* If size of client memory copy is too large, don't copy. The - * draw will access user-buffer directly and then block. This is - * faster than queuing many large client draws. 
*/ - if (size >= screen->client_copy_limit) { - post_update_dirty_flags |= SWR_BLOCK_CLIENT_DRAW; - p_data = (const uint8_t *) vb->buffer.user; - } else { - /* Copy only needed vertices to scratch space */ - const void *ptr = (const uint8_t *) vb->buffer.user + base; - ptr = (uint8_t *)swr_copy_to_scratch_space( - ctx, &ctx->scratch->vertex_buffer, ptr, size); - p_data = (const uint8_t *)ptr - base; - } - } else if (vb->buffer.resource) { - /* VBO */ - if (!pitch) { - /* If pitch=0 (ie vb->stride), buffer contains a single - * constant attribute. Use the stream_pitch which was - * calculated during creation of vertex_elements_state for the - * size of the attribute. */ - size = ctx->velems->stream_pitch[i]; - elems = 1; - partial_inbounds = 0; - min_vertex_index = 0; - } else { - /* size is based on buffer->width0 rather than info.max_index - * to prevent having to validate VBO on each draw. */ - size = vb->buffer.resource->width0; - elems = size / pitch; - partial_inbounds = size % pitch; - min_vertex_index = 0; - } - - p_data = swr_resource_data(vb->buffer.resource) + vb->buffer_offset; - } else - p_data = NULL; - - swrVertexBuffers[i] = {0}; - swrVertexBuffers[i].index = i; - swrVertexBuffers[i].pitch = pitch; - swrVertexBuffers[i].xpData = (gfxptr_t) p_data; - swrVertexBuffers[i].size = size; - swrVertexBuffers[i].minVertex = min_vertex_index; - swrVertexBuffers[i].maxVertex = elems; - swrVertexBuffers[i].partialInboundsSize = partial_inbounds; - } - - ctx->api.pfnSwrSetVertexBuffers( - ctx->swrContext, ctx->num_vertex_buffers, swrVertexBuffers); - - /* index buffer, if required (info passed in by swr_draw_vbo) */ - SWR_FORMAT index_type = R32_UINT; /* Default for non-indexed draws */ - if (info.index_size) { - const uint8_t *p_data; - uint32_t size, pitch; - - pitch = info.index_size ? 
info.index_size : sizeof(uint32_t); - index_type = swr_convert_index_type(pitch); - - if (!info.has_user_indices) { - /* VBO - * size is based on buffer->width0 rather than info.count - * to prevent having to validate VBO on each draw */ - size = info.index.resource->width0; - p_data = swr_resource_data(info.index.resource); - } else { - /* Client buffer - * client memory is one-time use, re-trigger SWR_NEW_VERTEX to - * revalidate on each draw */ - post_update_dirty_flags |= SWR_NEW_VERTEX; - - size = draw->count * pitch; - - size = AlignUp(size, 4); - /* If size of client memory copy is too large, don't copy. The - * draw will access user-buffer directly and then block. This is - * faster than queuing many large client draws. */ - if (size >= screen->client_copy_limit) { - post_update_dirty_flags |= SWR_BLOCK_CLIENT_DRAW; - p_data = (const uint8_t *) info.index.user + - draw->start * info.index_size; - } else { - /* Copy indices to scratch space */ - const void *ptr = (char*)info.index.user + - draw->start * info.index_size; - ptr = swr_copy_to_scratch_space( - ctx, &ctx->scratch->index_buffer, ptr, size); - p_data = (const uint8_t *)ptr; - } - } - - SWR_INDEX_BUFFER_STATE swrIndexBuffer; - swrIndexBuffer.format = swr_convert_index_type(info.index_size); - swrIndexBuffer.xpIndices = (gfxptr_t) p_data; - swrIndexBuffer.size = size; - - ctx->api.pfnSwrSetIndexBuffer(ctx->swrContext, &swrIndexBuffer); - } - - struct swr_vertex_element_state *velems = ctx->velems; - if (velems && velems->fsState.indexType != index_type) { - velems->fsFunc = NULL; - velems->fsState.indexType = index_type; - } - } - - /* GeometryShader */ - if (ctx->dirty & (SWR_NEW_GS | - SWR_NEW_VS | - SWR_NEW_TCS | - SWR_NEW_TES | - SWR_NEW_SAMPLER | - SWR_NEW_SAMPLER_VIEW)) { - if (ctx->gs) { - swr_jit_gs_key key; - swr_generate_gs_key(key, ctx, ctx->gs); - auto search = ctx->gs->map.find(key); - PFN_GS_FUNC func; - if (search != ctx->gs->map.end()) { - func = search->second->shader; - } else { - 
func = swr_compile_gs(ctx, key); - } - ctx->api.pfnSwrSetGsFunc(ctx->swrContext, func); - - /* JIT sampler state */ - if (ctx->dirty & SWR_NEW_SAMPLER) { - swr_update_sampler_state(ctx, - PIPE_SHADER_GEOMETRY, - key.nr_samplers, - ctx->swrDC.samplersGS); - } - - /* JIT sampler view state */ - if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) { - swr_update_texture_state(ctx, - PIPE_SHADER_GEOMETRY, - key.nr_sampler_views, - ctx->swrDC.texturesGS); - } - - ctx->api.pfnSwrSetGsState(ctx->swrContext, &ctx->gs->gsState); - } else { - SWR_GS_STATE state = { 0 }; - ctx->api.pfnSwrSetGsState(ctx->swrContext, &state); - ctx->api.pfnSwrSetGsFunc(ctx->swrContext, NULL); - } - } - - // We may need to restore tessellation state - // This restored state may be however overwritten - // during shader compilation - if (ctx->dirty & SWR_NEW_TS) { - if (ctx->tes != nullptr) { - ctx->tsState = ctx->tes->ts_state; - ctx->api.pfnSwrSetTsState(ctx->swrContext, &ctx->tsState); - } else { - SWR_TS_STATE state = { 0 }; - ctx->api.pfnSwrSetTsState(ctx->swrContext, &state); - } - } - - // Tessellation Evaluation Shader - // Compile TES first, because TCS is optional - if (ctx->dirty & (SWR_NEW_GS | - SWR_NEW_VS | - SWR_NEW_TCS | - SWR_NEW_TES | - SWR_NEW_SAMPLER | - SWR_NEW_SAMPLER_VIEW)) { - if (ctx->tes) { - swr_jit_tes_key key; - swr_generate_tes_key(key, ctx, ctx->tes); - - auto search = ctx->tes->map.find(key); - PFN_TES_FUNC func; - if (search != ctx->tes->map.end()) { - func = search->second->shader; - } else { - func = swr_compile_tes(ctx, key); - } - - ctx->api.pfnSwrSetDsFunc(ctx->swrContext, func); - - /* JIT sampler state */ - if (ctx->dirty & SWR_NEW_SAMPLER) { - swr_update_sampler_state(ctx, - PIPE_SHADER_TESS_EVAL, - key.nr_samplers, - ctx->swrDC.samplersTES); - } - - /* JIT sampler view state */ - if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) { - swr_update_texture_state(ctx, - PIPE_SHADER_TESS_EVAL, - key.nr_sampler_views, - 
ctx->swrDC.texturesTES); - } - - // Update tessellation state in case it's been updated - ctx->api.pfnSwrSetTsState(ctx->swrContext, &ctx->tsState); - } else { - ctx->api.pfnSwrSetDsFunc(ctx->swrContext, NULL); - } - } - - /* Tessellation Control Shader */ - if (ctx->dirty & (SWR_NEW_GS | - SWR_NEW_VS | - SWR_NEW_TCS | - SWR_NEW_TES | - SWR_NEW_SAMPLER | - SWR_NEW_SAMPLER_VIEW)) { - if (ctx->tcs) { - ctx->tcs->vertices_per_patch = ctx->patch_vertices; - - swr_jit_tcs_key key; - swr_generate_tcs_key(key, ctx, ctx->tcs); - - auto search = ctx->tcs->map.find(key); - PFN_TCS_FUNC func; - if (search != ctx->tcs->map.end()) { - func = search->second->shader; - } else { - func = swr_compile_tcs(ctx, key); - } - - ctx->api.pfnSwrSetHsFunc(ctx->swrContext, func); - - /* JIT sampler state */ - if (ctx->dirty & SWR_NEW_SAMPLER) { - swr_update_sampler_state(ctx, - PIPE_SHADER_TESS_CTRL, - key.nr_samplers, - ctx->swrDC.samplersTCS); - } - - /* JIT sampler view state */ - if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) { - swr_update_texture_state(ctx, - PIPE_SHADER_TESS_CTRL, - key.nr_sampler_views, - ctx->swrDC.texturesTCS); - } - - // Update tessellation state in case it's been updated - ctx->api.pfnSwrSetTsState(ctx->swrContext, &ctx->tsState); - } else { - ctx->api.pfnSwrSetHsFunc(ctx->swrContext, NULL); - } - } - - /* VertexShader */ - if (ctx->dirty - & (SWR_NEW_VS | SWR_NEW_RASTERIZER | // for clip planes - SWR_NEW_SAMPLER | SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) { - swr_jit_vs_key key; - swr_generate_vs_key(key, ctx, ctx->vs); - auto search = ctx->vs->map.find(key); - PFN_VERTEX_FUNC func; - if (search != ctx->vs->map.end()) { - func = search->second->shader; - } else { - func = swr_compile_vs(ctx, key); - } - ctx->api.pfnSwrSetVertexFunc(ctx->swrContext, func); - - /* JIT sampler state */ - if (ctx->dirty & SWR_NEW_SAMPLER) { - swr_update_sampler_state( - ctx, PIPE_SHADER_VERTEX, key.nr_samplers, ctx->swrDC.samplersVS); - } - - /* JIT sampler view 
state */ - if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) { - swr_update_texture_state(ctx, - PIPE_SHADER_VERTEX, - key.nr_sampler_views, - ctx->swrDC.texturesVS); - } - } - - /* work around the fact that poly stipple also affects lines */ - /* and points, since we rasterize them as triangles, too */ - /* Has to be before fragment shader, since it sets SWR_NEW_FS */ - if (p_draw_info) { - bool new_prim_is_poly = - (u_reduced_prim(p_draw_info->mode) == PIPE_PRIM_TRIANGLES) && - (ctx->derived.rastState.fillMode == SWR_FILLMODE_SOLID); - if (new_prim_is_poly != ctx->poly_stipple.prim_is_poly) { - ctx->dirty |= SWR_NEW_FS; - ctx->poly_stipple.prim_is_poly = new_prim_is_poly; - } - } - - /* FragmentShader */ - if (ctx->dirty & (SWR_NEW_FS | - SWR_NEW_VS | - SWR_NEW_GS | - SWR_NEW_TES | - SWR_NEW_TCS | - SWR_NEW_RASTERIZER | - SWR_NEW_SAMPLER | - SWR_NEW_SAMPLER_VIEW | - SWR_NEW_FRAMEBUFFER)) { - swr_jit_fs_key key; - swr_generate_fs_key(key, ctx, ctx->fs); - auto search = ctx->fs->map.find(key); - PFN_PIXEL_KERNEL func; - if (search != ctx->fs->map.end()) { - func = search->second->shader; - } else { - func = swr_compile_fs(ctx, key); - } - SWR_PS_STATE psState = {0}; - psState.pfnPixelShader = func; - psState.killsPixel = ctx->fs->info.base.uses_kill; - psState.inputCoverage = SWR_INPUT_COVERAGE_NORMAL; - psState.writesODepth = ctx->fs->info.base.writes_z; - psState.usesSourceDepth = ctx->fs->info.base.reads_z; - psState.shadingRate = SWR_SHADING_RATE_PIXEL; - psState.renderTargetMask = (1 << ctx->framebuffer.nr_cbufs) - 1; - psState.posOffset = SWR_PS_POSITION_SAMPLE_NONE; - uint32_t barycentricsMask = 0; -#if 0 - // when we switch to mesa-master - if (ctx->fs->info.base.uses_persp_center || - ctx->fs->info.base.uses_linear_center) - barycentricsMask |= SWR_BARYCENTRIC_PER_PIXEL_MASK; - if (ctx->fs->info.base.uses_persp_centroid || - ctx->fs->info.base.uses_linear_centroid) - barycentricsMask |= SWR_BARYCENTRIC_CENTROID_MASK; - if 
(ctx->fs->info.base.uses_persp_sample || - ctx->fs->info.base.uses_linear_sample) - barycentricsMask |= SWR_BARYCENTRIC_PER_SAMPLE_MASK; -#else - for (unsigned i = 0; i < ctx->fs->info.base.num_inputs; i++) { - switch (ctx->fs->info.base.input_interpolate_loc[i]) { - case TGSI_INTERPOLATE_LOC_CENTER: - barycentricsMask |= SWR_BARYCENTRIC_PER_PIXEL_MASK; - break; - case TGSI_INTERPOLATE_LOC_CENTROID: - barycentricsMask |= SWR_BARYCENTRIC_CENTROID_MASK; - break; - case TGSI_INTERPOLATE_LOC_SAMPLE: - barycentricsMask |= SWR_BARYCENTRIC_PER_SAMPLE_MASK; - break; - } - } -#endif - psState.barycentricsMask = barycentricsMask; - psState.usesUAV = false; // XXX - psState.forceEarlyZ = false; - ctx->api.pfnSwrSetPixelShaderState(ctx->swrContext, &psState); - - /* JIT sampler state */ - if (ctx->dirty & (SWR_NEW_SAMPLER | - SWR_NEW_FS)) { - swr_update_sampler_state(ctx, - PIPE_SHADER_FRAGMENT, - key.nr_samplers, - ctx->swrDC.samplersFS); - } - - /* JIT sampler view state */ - if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | - SWR_NEW_FRAMEBUFFER | - SWR_NEW_FS)) { - swr_update_texture_state(ctx, - PIPE_SHADER_FRAGMENT, - key.nr_sampler_views, - ctx->swrDC.texturesFS); - } - } - - - /* VertexShader Constants */ - if (ctx->dirty & SWR_NEW_VSCONSTANTS) { - swr_update_constants(ctx, PIPE_SHADER_VERTEX); - } - - /* FragmentShader Constants */ - if (ctx->dirty & SWR_NEW_FSCONSTANTS) { - swr_update_constants(ctx, PIPE_SHADER_FRAGMENT); - } - - /* GeometryShader Constants */ - if (ctx->dirty & SWR_NEW_GSCONSTANTS) { - swr_update_constants(ctx, PIPE_SHADER_GEOMETRY); - } - - /* Tessellation Control Shader Constants */ - if (ctx->dirty & SWR_NEW_TCSCONSTANTS) { - swr_update_constants(ctx, PIPE_SHADER_TESS_CTRL); - } - - /* Tessellation Evaluation Shader Constants */ - if (ctx->dirty & SWR_NEW_TESCONSTANTS) { - swr_update_constants(ctx, PIPE_SHADER_TESS_EVAL); - } - - /* Depth/stencil state */ - if (ctx->dirty & (SWR_NEW_DEPTH_STENCIL_ALPHA | SWR_NEW_FRAMEBUFFER)) { - struct 
pipe_depth_stencil_alpha_state *depth = ctx->depth_stencil; - struct pipe_stencil_state *stencil = depth->stencil; - SWR_DEPTH_STENCIL_STATE depthStencilState = {{0}}; - SWR_DEPTH_BOUNDS_STATE depthBoundsState = {0}; - - /* XXX, incomplete. Need to flesh out stencil & alpha test state - struct pipe_stencil_state *front_stencil = - ctx->depth_stencil.stencil[0]; - struct pipe_stencil_state *back_stencil = ctx->depth_stencil.stencil[1]; - */ - if (stencil[0].enabled) { - depthStencilState.stencilWriteEnable = 1; - depthStencilState.stencilTestEnable = 1; - depthStencilState.stencilTestFunc = - swr_convert_depth_func(stencil[0].func); - - depthStencilState.stencilPassDepthPassOp = - swr_convert_stencil_op(stencil[0].zpass_op); - depthStencilState.stencilPassDepthFailOp = - swr_convert_stencil_op(stencil[0].zfail_op); - depthStencilState.stencilFailOp = - swr_convert_stencil_op(stencil[0].fail_op); - depthStencilState.stencilWriteMask = stencil[0].writemask; - depthStencilState.stencilTestMask = stencil[0].valuemask; - depthStencilState.stencilRefValue = ctx->stencil_ref.ref_value[0]; - } - if (stencil[1].enabled) { - depthStencilState.doubleSidedStencilTestEnable = 1; - - depthStencilState.backfaceStencilTestFunc = - swr_convert_depth_func(stencil[1].func); - - depthStencilState.backfaceStencilPassDepthPassOp = - swr_convert_stencil_op(stencil[1].zpass_op); - depthStencilState.backfaceStencilPassDepthFailOp = - swr_convert_stencil_op(stencil[1].zfail_op); - depthStencilState.backfaceStencilFailOp = - swr_convert_stencil_op(stencil[1].fail_op); - depthStencilState.backfaceStencilWriteMask = stencil[1].writemask; - depthStencilState.backfaceStencilTestMask = stencil[1].valuemask; - - depthStencilState.backfaceStencilRefValue = - ctx->stencil_ref.ref_value[1]; - } - - depthStencilState.depthTestEnable = depth->depth_enabled; - depthStencilState.depthTestFunc = swr_convert_depth_func(depth->depth_func); - depthStencilState.depthWriteEnable = depth->depth_writemask; - 
ctx->api.pfnSwrSetDepthStencilState(ctx->swrContext, &depthStencilState); - - depthBoundsState.depthBoundsTestEnable = depth->depth_bounds_test; - depthBoundsState.depthBoundsTestMinValue = depth->depth_bounds_min; - depthBoundsState.depthBoundsTestMaxValue = depth->depth_bounds_max; - ctx->api.pfnSwrSetDepthBoundsState(ctx->swrContext, &depthBoundsState); - } - - /* Blend State */ - if (ctx->dirty & (SWR_NEW_BLEND | - SWR_NEW_RASTERIZER | - SWR_NEW_FRAMEBUFFER | - SWR_NEW_DEPTH_STENCIL_ALPHA)) { - struct pipe_framebuffer_state *fb = &ctx->framebuffer; - - SWR_BLEND_STATE blendState; - memcpy(&blendState, &ctx->blend->blendState, sizeof(blendState)); - blendState.constantColor[0] = ctx->blend_color.color[0]; - blendState.constantColor[1] = ctx->blend_color.color[1]; - blendState.constantColor[2] = ctx->blend_color.color[2]; - blendState.constantColor[3] = ctx->blend_color.color[3]; - blendState.alphaTestReference = - *((uint32_t*)&ctx->depth_stencil->alpha_ref_value); - - blendState.sampleMask = ctx->sample_mask; - blendState.sampleCount = GetSampleCount(fb->samples); - - /* If there are no color buffers bound, disable writes on RT0 - * and skip loop */ - if (fb->nr_cbufs == 0) { - blendState.renderTarget[0].writeDisableRed = 1; - blendState.renderTarget[0].writeDisableGreen = 1; - blendState.renderTarget[0].writeDisableBlue = 1; - blendState.renderTarget[0].writeDisableAlpha = 1; - ctx->api.pfnSwrSetBlendFunc(ctx->swrContext, 0, NULL); - } - else - for (int target = 0; - target < std::min(SWR_NUM_RENDERTARGETS, - PIPE_MAX_COLOR_BUFS); - target++) { - if (!fb->cbufs[target]) - continue; - - struct swr_resource *colorBuffer = - swr_resource(fb->cbufs[target]->texture); - - BLEND_COMPILE_STATE compileState; - memset(&compileState, 0, sizeof(compileState)); - compileState.format = colorBuffer->swr.format; - memcpy(&compileState.blendState, - &ctx->blend->compileState[target], - sizeof(compileState.blendState)); - - const SWR_FORMAT_INFO& info = 
GetFormatInfo(compileState.format); - if (compileState.blendState.logicOpEnable && - ((info.type[0] == SWR_TYPE_FLOAT) || info.isSRGB)) { - compileState.blendState.logicOpEnable = false; - } - - if (info.type[0] == SWR_TYPE_SINT || info.type[0] == SWR_TYPE_UINT) - compileState.blendState.blendEnable = false; - - if (compileState.blendState.blendEnable == false && - compileState.blendState.logicOpEnable == false && - ctx->depth_stencil->alpha_enabled == 0) { - ctx->api.pfnSwrSetBlendFunc(ctx->swrContext, target, NULL); - continue; - } - - compileState.desc.alphaTestEnable = - ctx->depth_stencil->alpha_enabled; - compileState.desc.independentAlphaBlendEnable = - (compileState.blendState.sourceBlendFactor != - compileState.blendState.sourceAlphaBlendFactor) || - (compileState.blendState.destBlendFactor != - compileState.blendState.destAlphaBlendFactor) || - (compileState.blendState.colorBlendFunc != - compileState.blendState.alphaBlendFunc); - compileState.desc.alphaToCoverageEnable = - ctx->blend->pipe.alpha_to_coverage; - compileState.desc.sampleMaskEnable = (blendState.sampleMask != 0); - compileState.desc.numSamples = fb->samples; - - compileState.alphaTestFunction = - swr_convert_depth_func(ctx->depth_stencil->alpha_func); - compileState.alphaTestFormat = ALPHA_TEST_FLOAT32; // xxx - - compileState.Canonicalize(); - - PFN_BLEND_JIT_FUNC func = NULL; - auto search = ctx->blendJIT->find(compileState); - if (search != ctx->blendJIT->end()) { - func = search->second; - } else { - HANDLE hJitMgr = screen->hJitMgr; - func = JitCompileBlend(hJitMgr, compileState); - debug_printf("BLEND shader %p\n", func); - assert(func && "Error: BlendShader = NULL"); - - ctx->blendJIT->insert(std::make_pair(compileState, func)); - } - ctx->api.pfnSwrSetBlendFunc(ctx->swrContext, target, func); - } - - ctx->api.pfnSwrSetBlendState(ctx->swrContext, &blendState); - } - - if (ctx->dirty & SWR_NEW_STIPPLE) { - swr_update_poly_stipple(ctx); - } - - if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_TCS 
| SWR_NEW_TES | SWR_NEW_SO | SWR_NEW_RASTERIZER)) { - ctx->vs->soState.rasterizerDisable = - ctx->rasterizer->rasterizer_discard; - ctx->api.pfnSwrSetSoState(ctx->swrContext, &ctx->vs->soState); - - pipe_stream_output_info *stream_output = &ctx->vs->pipe.stream_output; - - for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) { - SWR_STREAMOUT_BUFFER buffer = {0}; - if (ctx->so_targets[i]) { - buffer.enable = true; - buffer.pBuffer = - (gfxptr_t)(swr_resource_data(ctx->so_targets[i]->buffer) + - ctx->so_targets[i]->buffer_offset); - buffer.bufferSize = ctx->so_targets[i]->buffer_size >> 2; - buffer.pitch = stream_output->stride[i]; - buffer.streamOffset = 0; - } - - ctx->api.pfnSwrSetSoBuffers(ctx->swrContext, &buffer, i); - } - } - - - if (ctx->dirty & (SWR_NEW_CLIP | SWR_NEW_RASTERIZER | SWR_NEW_VS)) { - // shader exporting clip distances overrides all user clip planes - if (ctx->rasterizer->clip_plane_enable && - !swr_get_last_fe(ctx)->num_written_clipdistance) - { - swr_draw_context *pDC = &ctx->swrDC; - memcpy(pDC->userClipPlanes, - ctx->clip.ucp, - sizeof(pDC->userClipPlanes)); - } - } - - // set up backend state - SWR_BACKEND_STATE backendState = {0}; - if (ctx->gs) { - backendState.numAttributes = ctx->gs->info.base.num_outputs - 1; - } else - if (ctx->tes) { - backendState.numAttributes = ctx->tes->info.base.num_outputs - 1; - // no case for TCS, because if TCS is active, TES must be active - // as well - pipeline stages after tessellation does not support patches - } else { - backendState.numAttributes = ctx->vs->info.base.num_outputs - 1; - if (ctx->fs->info.base.uses_primid) { - backendState.numAttributes++; - backendState.swizzleEnable = true; - for (unsigned i = 0; i < sizeof(backendState.numComponents); i++) { - backendState.swizzleMap[i].sourceAttrib = i; - } - backendState.swizzleMap[ctx->vs->info.base.num_outputs - 1].constantSource = - SWR_CONSTANT_SOURCE_PRIM_ID; - backendState.swizzleMap[ctx->vs->info.base.num_outputs - 1].componentOverrideMask = 1; - 
} - } - if (ctx->rasterizer->sprite_coord_enable) - backendState.numAttributes++; - - backendState.numAttributes = std::min((size_t)backendState.numAttributes, - sizeof(backendState.numComponents)); - for (unsigned i = 0; i < backendState.numAttributes; i++) - backendState.numComponents[i] = 4; - backendState.constantInterpolationMask = ctx->fs->constantMask | - (ctx->rasterizer->flatshade ? ctx->fs->flatConstantMask : 0); - backendState.pointSpriteTexCoordMask = ctx->fs->pointSpriteMask; - - struct tgsi_shader_info *pLastFE = swr_get_last_fe(ctx); - - backendState.readRenderTargetArrayIndex = pLastFE->writes_layer; - backendState.readViewportArrayIndex = pLastFE->writes_viewport_index; - backendState.vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize - - backendState.clipDistanceMask = - pLastFE->num_written_clipdistance ? - pLastFE->clipdist_writemask & ctx->rasterizer->clip_plane_enable : - ctx->rasterizer->clip_plane_enable; - - backendState.cullDistanceMask = - pLastFE->culldist_writemask << pLastFE->num_written_clipdistance; - - // Assume old layout of SGV, POSITION, CLIPCULL, ATTRIB - backendState.vertexClipCullOffset = backendState.vertexAttribOffset - 2; - - ctx->api.pfnSwrSetBackendState(ctx->swrContext, &backendState); - - /* Ensure that any in-progress attachment change StoreTiles finish */ - if (swr_is_fence_pending(screen->flush_fence)) - swr_fence_finish(pipe->screen, NULL, screen->flush_fence, 0); - - /* Finally, update the in-use status of all resources involved in draw */ - swr_update_resource_status(pipe, p_draw_info); - - ctx->dirty = post_update_dirty_flags; -} - - -static struct pipe_stream_output_target * -swr_create_so_target(struct pipe_context *pipe, - struct pipe_resource *buffer, - unsigned buffer_offset, - unsigned buffer_size) -{ - struct pipe_stream_output_target *target; - - target = CALLOC_STRUCT(pipe_stream_output_target); - if (!target) - return NULL; - - target->context = pipe; - target->reference.count = 1; - 
pipe_resource_reference(&target->buffer, buffer); - target->buffer_offset = buffer_offset; - target->buffer_size = buffer_size; - return target; -} - -static void -swr_destroy_so_target(struct pipe_context *pipe, - struct pipe_stream_output_target *target) -{ - pipe_resource_reference(&target->buffer, NULL); - FREE(target); -} - -static void -swr_set_so_targets(struct pipe_context *pipe, - unsigned num_targets, - struct pipe_stream_output_target **targets, - const unsigned *offsets) -{ - struct swr_context *swr = swr_context(pipe); - uint32_t i; - - assert(num_targets <= MAX_SO_STREAMS); - - for (i = 0; i < num_targets; i++) { - pipe_so_target_reference( - (struct pipe_stream_output_target **)&swr->so_targets[i], - targets[i]); - } - - for (/* fall-through */; i < swr->num_so_targets; i++) { - pipe_so_target_reference( - (struct pipe_stream_output_target **)&swr->so_targets[i], NULL); - } - - swr->num_so_targets = num_targets; - swr->swrDC.soPrims = &swr->so_primCounter; - - swr->dirty |= SWR_NEW_SO; -} - -static void -swr_set_patch_vertices(struct pipe_context *pipe, uint8_t patch_vertices) -{ - struct swr_context *swr = swr_context(pipe); - - swr->patch_vertices = patch_vertices; -} - - -void -swr_state_init(struct pipe_context *pipe) -{ - pipe->create_blend_state = swr_create_blend_state; - pipe->bind_blend_state = swr_bind_blend_state; - pipe->delete_blend_state = swr_delete_blend_state; - - pipe->create_depth_stencil_alpha_state = swr_create_depth_stencil_state; - pipe->bind_depth_stencil_alpha_state = swr_bind_depth_stencil_state; - pipe->delete_depth_stencil_alpha_state = swr_delete_depth_stencil_state; - - pipe->create_rasterizer_state = swr_create_rasterizer_state; - pipe->bind_rasterizer_state = swr_bind_rasterizer_state; - pipe->delete_rasterizer_state = swr_delete_rasterizer_state; - - pipe->create_sampler_state = swr_create_sampler_state; - pipe->bind_sampler_states = swr_bind_sampler_states; - pipe->delete_sampler_state = swr_delete_sampler_state; - - 
pipe->create_sampler_view = swr_create_sampler_view; - pipe->set_sampler_views = swr_set_sampler_views; - pipe->sampler_view_destroy = swr_sampler_view_destroy; - - pipe->create_vs_state = swr_create_vs_state; - pipe->bind_vs_state = swr_bind_vs_state; - pipe->delete_vs_state = swr_delete_vs_state; - - pipe->create_fs_state = swr_create_fs_state; - pipe->bind_fs_state = swr_bind_fs_state; - pipe->delete_fs_state = swr_delete_fs_state; - - pipe->create_gs_state = swr_create_gs_state; - pipe->bind_gs_state = swr_bind_gs_state; - pipe->delete_gs_state = swr_delete_gs_state; - - pipe->create_tcs_state = swr_create_tcs_state; - pipe->bind_tcs_state = swr_bind_tcs_state; - pipe->delete_tcs_state = swr_delete_tcs_state; - - pipe->create_tes_state = swr_create_tes_state; - pipe->bind_tes_state = swr_bind_tes_state; - pipe->delete_tes_state = swr_delete_tes_state; - - pipe->set_constant_buffer = swr_set_constant_buffer; - - pipe->create_vertex_elements_state = swr_create_vertex_elements_state; - pipe->bind_vertex_elements_state = swr_bind_vertex_elements_state; - pipe->delete_vertex_elements_state = swr_delete_vertex_elements_state; - - pipe->set_vertex_buffers = swr_set_vertex_buffers; - - pipe->set_polygon_stipple = swr_set_polygon_stipple; - pipe->set_clip_state = swr_set_clip_state; - pipe->set_scissor_states = swr_set_scissor_states; - pipe->set_viewport_states = swr_set_viewport_states; - - pipe->set_framebuffer_state = swr_set_framebuffer_state; - - pipe->set_blend_color = swr_set_blend_color; - pipe->set_stencil_ref = swr_set_stencil_ref; - - pipe->set_sample_mask = swr_set_sample_mask; - pipe->get_sample_position = swr_get_sample_position; - - pipe->create_stream_output_target = swr_create_so_target; - pipe->stream_output_target_destroy = swr_destroy_so_target; - pipe->set_stream_output_targets = swr_set_so_targets; - - pipe->set_patch_vertices = swr_set_patch_vertices; -} diff --git a/src/gallium/drivers/swr/swr_state.h b/src/gallium/drivers/swr/swr_state.h 
deleted file mode 100644 index 75a70de0b1a..00000000000 --- a/src/gallium/drivers/swr/swr_state.h +++ /dev/null @@ -1,426 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- ***************************************************************************/ - -#ifndef SWR_STATE_H -#define SWR_STATE_H - -#include "pipe/p_defines.h" -#include "tgsi/tgsi_scan.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_dump.h" -#include "gallivm/lp_bld_init.h" -#include "gallivm/lp_bld_tgsi.h" -#include "util/crc32.h" -#include "api.h" -#include "swr_tex_sample.h" -#include "swr_shader.h" -#include <unordered_map> -#include <memory> - -template <typename T> -struct ShaderVariant { - struct gallivm_state *gallivm; - T shader; - - ShaderVariant(struct gallivm_state *gs, T code) : gallivm(gs), shader(code) {} - ~ShaderVariant() { gallivm_destroy(gallivm); } -}; - -using PFN_TCS_FUNC = PFN_HS_FUNC; -using PFN_TES_FUNC = PFN_DS_FUNC; - -typedef ShaderVariant<PFN_VERTEX_FUNC> VariantVS; -typedef ShaderVariant<PFN_PIXEL_KERNEL> VariantFS; -typedef ShaderVariant<PFN_GS_FUNC> VariantGS; -typedef ShaderVariant<PFN_TCS_FUNC> VariantTCS; -typedef ShaderVariant<PFN_TES_FUNC> VariantTES; - -/* skeleton */ -struct swr_vertex_shader { - struct pipe_shader_state pipe; - struct lp_tgsi_info info; - std::unordered_map<swr_jit_vs_key, std::unique_ptr<VariantVS>> map; - SWR_STREAMOUT_STATE soState; - PFN_SO_FUNC soFunc[PIPE_PRIM_MAX] {0}; -}; - -struct swr_fragment_shader { - struct pipe_shader_state pipe; - struct lp_tgsi_info info; - uint32_t constantMask; - uint32_t flatConstantMask; - uint32_t pointSpriteMask; - std::unordered_map<swr_jit_fs_key, std::unique_ptr<VariantFS>> map; -}; - -struct swr_geometry_shader { - struct pipe_shader_state pipe; - struct lp_tgsi_info info; - SWR_GS_STATE gsState; - - std::unordered_map<swr_jit_gs_key, std::unique_ptr<VariantGS>> map; -}; - -struct swr_tess_control_shader { - struct pipe_shader_state pipe; - struct lp_tgsi_info info; - uint32_t vertices_per_patch; - - std::unordered_map<swr_jit_tcs_key, std::unique_ptr<VariantTCS>> map; -}; - -struct swr_tess_evaluation_shader { - struct pipe_shader_state pipe; - struct lp_tgsi_info 
info; - SWR_TS_STATE ts_state; - - std::unordered_map<swr_jit_tes_key, std::unique_ptr<VariantTES>> map; -}; - - -/* Vertex element state */ -struct swr_vertex_element_state { - FETCH_COMPILE_STATE fsState; - PFN_FETCH_FUNC fsFunc {NULL}; - uint32_t stream_pitch[PIPE_MAX_ATTRIBS] {0}; - uint32_t min_instance_div[PIPE_MAX_ATTRIBS] {0}; - uint32_t instanced_bufs {0}; - std::unordered_map<swr_jit_fetch_key, PFN_FETCH_FUNC> map; -}; - -struct swr_blend_state { - struct pipe_blend_state pipe; - SWR_BLEND_STATE blendState; - RENDER_TARGET_BLEND_COMPILE_STATE compileState[PIPE_MAX_COLOR_BUFS]; -}; - -struct swr_poly_stipple { - struct pipe_poly_stipple pipe; - bool prim_is_poly; -}; - -/* - * Derived SWR API DrawState - * For convenience of making simple changes without re-deriving state. - */ -struct swr_derived_state { - SWR_RASTSTATE rastState; - SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS]; - SWR_VIEWPORT_MATRICES vpm; -}; - -void swr_update_derived(struct pipe_context *, - const struct pipe_draw_info * = nullptr, - const struct pipe_draw_start_count_bias *draw = nullptr); - -/* - * Conversion functions: Convert mesa state defines to SWR. 
- */ - -static INLINE SWR_LOGIC_OP -swr_convert_logic_op(const UINT op) -{ - switch (op) { - case PIPE_LOGICOP_CLEAR: - return LOGICOP_CLEAR; - case PIPE_LOGICOP_NOR: - return LOGICOP_NOR; - case PIPE_LOGICOP_AND_INVERTED: - return LOGICOP_AND_INVERTED; - case PIPE_LOGICOP_COPY_INVERTED: - return LOGICOP_COPY_INVERTED; - case PIPE_LOGICOP_AND_REVERSE: - return LOGICOP_AND_REVERSE; - case PIPE_LOGICOP_INVERT: - return LOGICOP_INVERT; - case PIPE_LOGICOP_XOR: - return LOGICOP_XOR; - case PIPE_LOGICOP_NAND: - return LOGICOP_NAND; - case PIPE_LOGICOP_AND: - return LOGICOP_AND; - case PIPE_LOGICOP_EQUIV: - return LOGICOP_EQUIV; - case PIPE_LOGICOP_NOOP: - return LOGICOP_NOOP; - case PIPE_LOGICOP_OR_INVERTED: - return LOGICOP_OR_INVERTED; - case PIPE_LOGICOP_COPY: - return LOGICOP_COPY; - case PIPE_LOGICOP_OR_REVERSE: - return LOGICOP_OR_REVERSE; - case PIPE_LOGICOP_OR: - return LOGICOP_OR; - case PIPE_LOGICOP_SET: - return LOGICOP_SET; - default: - assert(0 && "Unsupported logic op"); - return LOGICOP_NOOP; - } -} - -static INLINE SWR_STENCILOP -swr_convert_stencil_op(const UINT op) -{ - switch (op) { - case PIPE_STENCIL_OP_KEEP: - return STENCILOP_KEEP; - case PIPE_STENCIL_OP_ZERO: - return STENCILOP_ZERO; - case PIPE_STENCIL_OP_REPLACE: - return STENCILOP_REPLACE; - case PIPE_STENCIL_OP_INCR: - return STENCILOP_INCRSAT; - case PIPE_STENCIL_OP_DECR: - return STENCILOP_DECRSAT; - case PIPE_STENCIL_OP_INCR_WRAP: - return STENCILOP_INCR; - case PIPE_STENCIL_OP_DECR_WRAP: - return STENCILOP_DECR; - case PIPE_STENCIL_OP_INVERT: - return STENCILOP_INVERT; - default: - assert(0 && "Unsupported stencil op"); - return STENCILOP_KEEP; - } -} - -static INLINE SWR_FORMAT -swr_convert_index_type(const UINT index_size) -{ - switch (index_size) { - case sizeof(unsigned char): - return R8_UINT; - case sizeof(unsigned short): - return R16_UINT; - case sizeof(unsigned int): - return R32_UINT; - default: - assert(0 && "Unsupported index type"); - return R32_UINT; - } -} - - -static 
INLINE SWR_ZFUNCTION -swr_convert_depth_func(const UINT pipe_func) -{ - switch (pipe_func) { - case PIPE_FUNC_NEVER: - return ZFUNC_NEVER; - case PIPE_FUNC_LESS: - return ZFUNC_LT; - case PIPE_FUNC_EQUAL: - return ZFUNC_EQ; - case PIPE_FUNC_LEQUAL: - return ZFUNC_LE; - case PIPE_FUNC_GREATER: - return ZFUNC_GT; - case PIPE_FUNC_NOTEQUAL: - return ZFUNC_NE; - case PIPE_FUNC_GEQUAL: - return ZFUNC_GE; - case PIPE_FUNC_ALWAYS: - return ZFUNC_ALWAYS; - default: - assert(0 && "Unsupported depth func"); - return ZFUNC_ALWAYS; - } -} - - -static INLINE SWR_CULLMODE -swr_convert_cull_mode(const UINT cull_face) -{ - switch (cull_face) { - case PIPE_FACE_NONE: - return SWR_CULLMODE_NONE; - case PIPE_FACE_FRONT: - return SWR_CULLMODE_FRONT; - case PIPE_FACE_BACK: - return SWR_CULLMODE_BACK; - case PIPE_FACE_FRONT_AND_BACK: - return SWR_CULLMODE_BOTH; - default: - assert(0 && "Invalid cull mode"); - return SWR_CULLMODE_NONE; - } -} - -static INLINE SWR_BLEND_OP -swr_convert_blend_func(const UINT blend_func) -{ - switch (blend_func) { - case PIPE_BLEND_ADD: - return BLENDOP_ADD; - case PIPE_BLEND_SUBTRACT: - return BLENDOP_SUBTRACT; - case PIPE_BLEND_REVERSE_SUBTRACT: - return BLENDOP_REVSUBTRACT; - case PIPE_BLEND_MIN: - return BLENDOP_MIN; - case PIPE_BLEND_MAX: - return BLENDOP_MAX; - default: - assert(0 && "Invalid blend func"); - return BLENDOP_ADD; - } -} - -static INLINE SWR_BLEND_FACTOR -swr_convert_blend_factor(const UINT blend_factor) -{ - switch (blend_factor) { - case PIPE_BLENDFACTOR_ONE: - return BLENDFACTOR_ONE; - case PIPE_BLENDFACTOR_SRC_COLOR: - return BLENDFACTOR_SRC_COLOR; - case PIPE_BLENDFACTOR_SRC_ALPHA: - return BLENDFACTOR_SRC_ALPHA; - case PIPE_BLENDFACTOR_DST_ALPHA: - return BLENDFACTOR_DST_ALPHA; - case PIPE_BLENDFACTOR_DST_COLOR: - return BLENDFACTOR_DST_COLOR; - case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - return BLENDFACTOR_SRC_ALPHA_SATURATE; - case PIPE_BLENDFACTOR_CONST_COLOR: - return BLENDFACTOR_CONST_COLOR; - case 
PIPE_BLENDFACTOR_CONST_ALPHA: - return BLENDFACTOR_CONST_ALPHA; - case PIPE_BLENDFACTOR_SRC1_COLOR: - return BLENDFACTOR_SRC1_COLOR; - case PIPE_BLENDFACTOR_SRC1_ALPHA: - return BLENDFACTOR_SRC1_ALPHA; - case PIPE_BLENDFACTOR_ZERO: - return BLENDFACTOR_ZERO; - case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return BLENDFACTOR_INV_SRC_COLOR; - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return BLENDFACTOR_INV_SRC_ALPHA; - case PIPE_BLENDFACTOR_INV_DST_ALPHA: - return BLENDFACTOR_INV_DST_ALPHA; - case PIPE_BLENDFACTOR_INV_DST_COLOR: - return BLENDFACTOR_INV_DST_COLOR; - case PIPE_BLENDFACTOR_INV_CONST_COLOR: - return BLENDFACTOR_INV_CONST_COLOR; - case PIPE_BLENDFACTOR_INV_CONST_ALPHA: - return BLENDFACTOR_INV_CONST_ALPHA; - case PIPE_BLENDFACTOR_INV_SRC1_COLOR: - return BLENDFACTOR_INV_SRC1_COLOR; - case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: - return BLENDFACTOR_INV_SRC1_ALPHA; - default: - assert(0 && "Invalid blend factor"); - return BLENDFACTOR_ONE; - } -} - -static INLINE enum SWR_SURFACE_TYPE -swr_convert_target_type(const enum pipe_texture_target target) -{ - switch (target) { - case PIPE_BUFFER: - return SURFACE_BUFFER; - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_1D_ARRAY: - return SURFACE_1D; - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_2D_ARRAY: - case PIPE_TEXTURE_RECT: - return SURFACE_2D; - case PIPE_TEXTURE_3D: - return SURFACE_3D; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - return SURFACE_CUBE; - default: - assert(0); - return SURFACE_NULL; - } -} - -/* - * Convert mesa PIPE_PRIM_X to SWR enum PRIMITIVE_TOPOLOGY - */ -static INLINE enum PRIMITIVE_TOPOLOGY -swr_convert_prim_topology(const unsigned mode, const unsigned tcs_verts) -{ - switch (mode) { - case PIPE_PRIM_POINTS: - return TOP_POINT_LIST; - case PIPE_PRIM_LINES: - return TOP_LINE_LIST; - case PIPE_PRIM_LINE_LOOP: - return TOP_LINE_LOOP; - case PIPE_PRIM_LINE_STRIP: - return TOP_LINE_STRIP; - case PIPE_PRIM_TRIANGLES: - return TOP_TRIANGLE_LIST; - case PIPE_PRIM_TRIANGLE_STRIP: - return 
TOP_TRIANGLE_STRIP; - case PIPE_PRIM_TRIANGLE_FAN: - return TOP_TRIANGLE_FAN; - case PIPE_PRIM_QUADS: - return TOP_QUAD_LIST; - case PIPE_PRIM_QUAD_STRIP: - return TOP_QUAD_STRIP; - case PIPE_PRIM_POLYGON: - return TOP_TRIANGLE_FAN; /* XXX TOP_POLYGON; */ - case PIPE_PRIM_LINES_ADJACENCY: - return TOP_LINE_LIST_ADJ; - case PIPE_PRIM_LINE_STRIP_ADJACENCY: - return TOP_LISTSTRIP_ADJ; - case PIPE_PRIM_TRIANGLES_ADJACENCY: - return TOP_TRI_LIST_ADJ; - case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: - return TOP_TRI_STRIP_ADJ; - case PIPE_PRIM_PATCHES: - // rasterizer has a separate type for each possible number of patch vertices - return (PRIMITIVE_TOPOLOGY)((unsigned)TOP_PATCHLIST_BASE + tcs_verts); - default: - assert(0 && "Unknown topology"); - return TOP_UNKNOWN; - } -}; - -/* - * convert mesa PIPE_POLYGON_MODE_X to SWR enum SWR_FILLMODE - */ -static INLINE enum SWR_FILLMODE -swr_convert_fill_mode(const unsigned mode) -{ - switch(mode) { - case PIPE_POLYGON_MODE_FILL: - return SWR_FILLMODE_SOLID; - case PIPE_POLYGON_MODE_LINE: - return SWR_FILLMODE_WIREFRAME; - case PIPE_POLYGON_MODE_POINT: - return SWR_FILLMODE_POINT; - default: - assert(0 && "Unknown fillmode"); - return SWR_FILLMODE_SOLID; // at least do something sensible - } -} - - -#endif diff --git a/src/gallium/drivers/swr/swr_tex_sample.cpp b/src/gallium/drivers/swr/swr_tex_sample.cpp deleted file mode 100644 index 1cf00b29249..00000000000 --- a/src/gallium/drivers/swr/swr_tex_sample.cpp +++ /dev/null @@ -1,376 +0,0 @@ -/************************************************************************** - * - * Copyright 2009 VMware, Inc. - * All rights reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/** - * Largely a copy of llvmpipe's lp_tex_sample.c - */ - -/** - * Texture sampling code generation - * - * This file is nothing more than ugly glue between three largely independent - * entities: - * - TGSI -> LLVM translation (i.e., lp_build_tgsi_soa) - * - texture sampling code generation (i.e., lp_build_sample_soa) - * - SWR driver - * - * All interesting code is in the functions mentioned above. There is really - * nothing to see here. 
- * - * @author Jose Fonseca <jfonseca@vmware.com> - */ - -#include "state.h" -#include "JitManager.h" -#include "gen_state_llvm.h" - -#include "pipe/p_defines.h" -#include "pipe/p_shader_tokens.h" -#include "gallivm/lp_bld_debug.h" -#include "gallivm/lp_bld_const.h" -#include "gallivm/lp_bld_type.h" -#include "gallivm/lp_bld_sample.h" -#include "gallivm/lp_bld_tgsi.h" -#include "util/u_memory.h" - -#include "swr_tex_sample.h" -#include "gen_surf_state_llvm.h" -#include "gen_swr_context_llvm.h" - -using namespace SwrJit; - -/** - * This provides the bridge between the sampler state store in - * lp_jit_context and lp_jit_texture and the sampler code - * generator. It provides the texture layout information required by - * the texture sampler code generator in terms of the state stored in - * lp_jit_context and lp_jit_texture in runtime. - */ -struct swr_sampler_dynamic_state { - struct lp_sampler_dynamic_state base; - - const struct swr_sampler_static_state *static_state; - - enum pipe_shader_type shader_type; -}; - - -/** - * This is the bridge between our sampler and the TGSI translator. - */ -struct swr_sampler_soa { - struct lp_build_sampler_soa base; - - struct swr_sampler_dynamic_state dynamic_state; -}; - - -/** - * Fetch the specified member of the lp_jit_texture structure. - * \param emit_load if TRUE, emit the LLVM load instruction to actually - * fetch the field's value. Otherwise, just emit the - * GEP code to address the field. 
- * - * @sa http://llvm.org/docs/GetElementPtr.html - */ -static LLVMValueRef -swr_texture_member(const struct lp_sampler_dynamic_state *base, - struct gallivm_state *gallivm, - LLVMValueRef context_ptr, - unsigned texture_unit, - unsigned member_index, - const char *member_name, - boolean emit_load) -{ - LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef indices[4]; - LLVMValueRef ptr; - LLVMValueRef res; - - assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); - - /* context[0] */ - indices[0] = lp_build_const_int32(gallivm, 0); - /* context[0].textures */ - auto dynamic = (const struct swr_sampler_dynamic_state *)base; - switch (dynamic->shader_type) { - case PIPE_SHADER_FRAGMENT: - indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesFS); - break; - case PIPE_SHADER_VERTEX: - indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesVS); - break; - case PIPE_SHADER_GEOMETRY: - indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesGS); - break; - case PIPE_SHADER_TESS_CTRL: - indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesTCS); - break; - case PIPE_SHADER_TESS_EVAL: - indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesTES); - break; - default: - assert(0 && "unsupported shader type"); - break; - } - /* context[0].textures[unit] */ - indices[2] = lp_build_const_int32(gallivm, texture_unit); - /* context[0].textures[unit].member */ - indices[3] = lp_build_const_int32(gallivm, member_index); - - ptr = LLVMBuildGEP(builder, context_ptr, indices, ARRAY_SIZE(indices), ""); - - if (emit_load) - res = LLVMBuildLoad(builder, ptr, ""); - else - res = ptr; - - lp_build_name(res, "context.texture%u.%s", texture_unit, member_name); - - return res; -} - - -/** - * Helper macro to instantiate the functions that generate the code to - * fetch the members of lp_jit_texture to fulfill the sampler code - * generator requests. 
- * - * This complexity is the price we have to pay to keep the texture - * sampler code generator a reusable module without dependencies to - * swr internals. - */ -#define SWR_TEXTURE_MEMBER(_name, _emit_load) \ - static LLVMValueRef swr_texture_##_name( \ - const struct lp_sampler_dynamic_state *base, \ - struct gallivm_state *gallivm, \ - LLVMValueRef context_ptr, \ - unsigned texture_unit, \ - LLVMValueRef texture_unit_offset) \ - { \ - return swr_texture_member(base, \ - gallivm, \ - context_ptr, \ - texture_unit, \ - swr_jit_texture_##_name, \ - #_name, \ - _emit_load); \ - } - - -SWR_TEXTURE_MEMBER(width, TRUE) -SWR_TEXTURE_MEMBER(height, TRUE) -SWR_TEXTURE_MEMBER(depth, TRUE) -SWR_TEXTURE_MEMBER(first_level, TRUE) -SWR_TEXTURE_MEMBER(last_level, TRUE) -SWR_TEXTURE_MEMBER(base_ptr, TRUE) -SWR_TEXTURE_MEMBER(num_samples, TRUE) -SWR_TEXTURE_MEMBER(sample_stride, TRUE) -SWR_TEXTURE_MEMBER(row_stride, FALSE) -SWR_TEXTURE_MEMBER(img_stride, FALSE) -SWR_TEXTURE_MEMBER(mip_offsets, FALSE) - - -/** - * Fetch the specified member of the lp_jit_sampler structure. - * \param emit_load if TRUE, emit the LLVM load instruction to actually - * fetch the field's value. Otherwise, just emit the - * GEP code to address the field. 
- * - * @sa http://llvm.org/docs/GetElementPtr.html - */ -static LLVMValueRef -swr_sampler_member(const struct lp_sampler_dynamic_state *base, - struct gallivm_state *gallivm, - LLVMValueRef context_ptr, - unsigned sampler_unit, - unsigned member_index, - const char *member_name, - boolean emit_load) -{ - LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef indices[4]; - LLVMValueRef ptr; - LLVMValueRef res; - - assert(sampler_unit < PIPE_MAX_SAMPLERS); - - /* context[0] */ - indices[0] = lp_build_const_int32(gallivm, 0); - /* context[0].samplers */ - auto dynamic = (const struct swr_sampler_dynamic_state *)base; - switch (dynamic->shader_type) { - case PIPE_SHADER_FRAGMENT: - indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersFS); - break; - case PIPE_SHADER_VERTEX: - indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersVS); - break; - case PIPE_SHADER_GEOMETRY: - indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersGS); - break; - case PIPE_SHADER_TESS_CTRL: - indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersTCS); - break; - case PIPE_SHADER_TESS_EVAL: - indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersTES); - break; - default: - assert(0 && "unsupported shader type"); - break; - } - /* context[0].samplers[unit] */ - indices[2] = lp_build_const_int32(gallivm, sampler_unit); - /* context[0].samplers[unit].member */ - indices[3] = lp_build_const_int32(gallivm, member_index); - - ptr = LLVMBuildGEP(builder, context_ptr, indices, ARRAY_SIZE(indices), ""); - - if (emit_load) - res = LLVMBuildLoad(builder, ptr, ""); - else - res = ptr; - - lp_build_name(res, "context.sampler%u.%s", sampler_unit, member_name); - - return res; -} - - -#define SWR_SAMPLER_MEMBER(_name, _emit_load) \ - static LLVMValueRef swr_sampler_##_name( \ - const struct lp_sampler_dynamic_state *base, \ - struct gallivm_state *gallivm, \ - LLVMValueRef context_ptr, \ - unsigned sampler_unit) \ - { \ - return 
swr_sampler_member(base, \ - gallivm, \ - context_ptr, \ - sampler_unit, \ - swr_jit_sampler_##_name, \ - #_name, \ - _emit_load); \ - } - - -SWR_SAMPLER_MEMBER(min_lod, TRUE) -SWR_SAMPLER_MEMBER(max_lod, TRUE) -SWR_SAMPLER_MEMBER(lod_bias, TRUE) -SWR_SAMPLER_MEMBER(border_color, FALSE) - - -static void -swr_sampler_soa_destroy(struct lp_build_sampler_soa *sampler) -{ - FREE(sampler); -} - - -/** - * Fetch filtered values from texture. - * The 'texel' parameter returns four vectors corresponding to R, G, B, A. - */ -static void -swr_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, - struct gallivm_state *gallivm, - const struct lp_sampler_params *params) -{ - struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base; - unsigned texture_index = params->texture_index; - unsigned sampler_index = params->sampler_index; - - assert(sampler_index < PIPE_MAX_SAMPLERS); - assert(texture_index < PIPE_MAX_SHADER_SAMPLER_VIEWS); - -#if 0 - lp_build_sample_nop(gallivm, params->type, params->coords, params->texel); -#else - lp_build_sample_soa( - &sampler->dynamic_state.static_state[texture_index].texture_state, - &sampler->dynamic_state.static_state[sampler_index].sampler_state, - &sampler->dynamic_state.base, - gallivm, - params); -#endif -} - -/** - * Fetch the texture size. 
- */ -static void -swr_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base, - struct gallivm_state *gallivm, - const struct lp_sampler_size_query_params *params) -{ - struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base; - - assert(params->texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); - - lp_build_size_query_soa( - gallivm, - &sampler->dynamic_state.static_state[params->texture_unit].texture_state, - &sampler->dynamic_state.base, - params); -} - - -struct lp_build_sampler_soa * -swr_sampler_soa_create(const struct swr_sampler_static_state *static_state, - enum pipe_shader_type shader_type) -{ - struct swr_sampler_soa *sampler; - - sampler = CALLOC_STRUCT(swr_sampler_soa); - if (!sampler) - return NULL; - - sampler->base.destroy = swr_sampler_soa_destroy; - sampler->base.emit_tex_sample = swr_sampler_soa_emit_fetch_texel; - sampler->base.emit_size_query = swr_sampler_soa_emit_size_query; - sampler->dynamic_state.base.width = swr_texture_width; - sampler->dynamic_state.base.height = swr_texture_height; - sampler->dynamic_state.base.depth = swr_texture_depth; - sampler->dynamic_state.base.first_level = swr_texture_first_level; - sampler->dynamic_state.base.last_level = swr_texture_last_level; - sampler->dynamic_state.base.base_ptr = swr_texture_base_ptr; - sampler->dynamic_state.base.row_stride = swr_texture_row_stride; - sampler->dynamic_state.base.img_stride = swr_texture_img_stride; - sampler->dynamic_state.base.mip_offsets = swr_texture_mip_offsets; - sampler->dynamic_state.base.num_samples = swr_texture_num_samples; - sampler->dynamic_state.base.sample_stride = swr_texture_sample_stride; - sampler->dynamic_state.base.min_lod = swr_sampler_min_lod; - sampler->dynamic_state.base.max_lod = swr_sampler_max_lod; - sampler->dynamic_state.base.lod_bias = swr_sampler_lod_bias; - sampler->dynamic_state.base.border_color = swr_sampler_border_color; - - sampler->dynamic_state.static_state = static_state; - - sampler->dynamic_state.shader_type = 
shader_type; - - return &sampler->base; -} diff --git a/src/gallium/drivers/swr/swr_tex_sample.h b/src/gallium/drivers/swr/swr_tex_sample.h deleted file mode 100644 index 715ca3c3e19..00000000000 --- a/src/gallium/drivers/swr/swr_tex_sample.h +++ /dev/null @@ -1,48 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#pragma once - -#include "gallivm/lp_bld.h" - -struct swr_sampler_static_state { - /* - * These attributes are effectively interleaved for more sane key handling. - * However, there might be lots of null space if the amount of samplers and - * textures isn't the same. 
- */ - struct lp_static_sampler_state sampler_state; - struct lp_static_texture_state texture_state; -}; - -/** - * Pure-LLVM texture sampling code generator. - * - */ -struct lp_build_sampler_soa * -swr_sampler_soa_create(const struct swr_sampler_static_state *key, - enum pipe_shader_type shader_type); |