gallium/swr: Remove driver source

The OpenSWR will be maintained on a classic/LTS branch.

Reviewed-by: Dylan Baker <dylan@pnwbakers.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11264>
author: Jan Zielinski <jan.zielinski@intel.com> 2021-06-09 13:19:44 +0200
committer: Marge Bot <emma+marge@anholt.net> 2021-12-06 23:37:50 +0000
commit: 855793c6c6bd372ea96681ecbd3f318ad71da223 (patch)
tree: cbd8efc0c9df58d3bdc2ba774cf46dcdcad21162
parent: d22d328859e4a67e6ff738fbd22eaf1d5a09376a (diff)
178 files changed, 0 insertions, 85594 deletions
diff --git a/src/gallium/drivers/swr/.clang-format b/src/gallium/drivers/swr/.clang-format
deleted file mode 100644
index 0ec65a5de88..00000000000
--- a/src/gallium/drivers/swr/.clang-format
+++ /dev/null
@@ -1,64 +0,0 @@
----
-Language:        Cpp
-AccessModifierOffset: -3
-AlignAfterOpenBracket: true
-AlignEscapedNewlinesLeft: false
-AlignOperands:   false
-AlignTrailingComments: false
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: false
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortIfStatementsOnASingleLine: false
-AllowShortLoopsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: All
-AlwaysBreakAfterDefinitionReturnType: true
-AlwaysBreakTemplateDeclarations: false
-AlwaysBreakBeforeMultilineStrings: false
-BreakBeforeBinaryOperators: NonAssignment
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: true
-BinPackParameters: false
-BinPackArguments: false
-ColumnLimit:     78
-ConstructorInitializerAllOnOneLineOrOnePerLine: false
-ConstructorInitializerIndentWidth: 3
-DerivePointerAlignment: false
-ExperimentalAutoDetectBinPacking: false
-IndentCaseLabels: false
-IndentWrappedFunctionNames: false
-IndentFunctionDeclarationAfterType: false
-MaxEmptyLinesToKeep: 2
-KeepEmptyLinesAtTheStartOfBlocks: true
-NamespaceIndentation: Inner
-ObjCBlockIndentWidth: 3
-ObjCSpaceAfterProperty: true
-ObjCSpaceBeforeProtocolList: true
-PenaltyBreakBeforeFirstCallParameter: 19
-PenaltyBreakComment: 300
-PenaltyBreakString: 1000
-PenaltyBreakFirstLessLess: 120
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 0
-PointerAlignment: Right
-SpacesBeforeTrailingComments: 1
-Cpp11BracedListStyle: true
-Standard:        Cpp11
-IndentWidth:     3
-TabWidth:        8
-UseTab:          Never
-BreakBeforeBraces: Linux
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-SpacesInAngles:  false
-SpaceInEmptyParentheses: false
-SpacesInCStyleCastParentheses: false
-SpaceAfterCStyleCast: false
-SpacesInContainerLiterals: true
-SpaceBeforeAssignmentOperators: true
-ContinuationIndentWidth: 3
-CommentPragmas:  '^ IWYU pragma:'
-ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
-SpaceBeforeParens: ControlStatements
-DisableFormat:   false
-...
-
diff --git a/src/gallium/drivers/swr/meson.build b/src/gallium/drivers/swr/meson.build
deleted file mode 100644
index ac712d80461..00000000000
--- a/src/gallium/drivers/swr/meson.build
+++ /dev/null
@@ -1,411 +0,0 @@
-# Copyright © 2017-2020 Intel Corporation
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-files_swr_common = files(
-  'rasterizer/common/formats.cpp',
-  'rasterizer/common/formats.h',
-  'rasterizer/common/intrin.h',
-  'rasterizer/common/isa.hpp',
-  'rasterizer/common/os.cpp',
-  'rasterizer/common/os.h',
-  'rasterizer/common/rdtsc_buckets.cpp',
-  'rasterizer/common/rdtsc_buckets.h',
-  'rasterizer/common/rdtsc_buckets_shared.h',
-  'rasterizer/common/rdtsc_buckets_shared.h',
-  'rasterizer/common/simd16intrin.h',
-  'rasterizer/common/simdintrin.h',
-  'rasterizer/common/simdlib.hpp',
-  'rasterizer/common/simdlib_interface.hpp',
-  'rasterizer/common/simdlib_types.hpp',
-  'rasterizer/common/swr_assert.cpp',
-  'rasterizer/common/swr_assert.h',
-)
-
-files_swr_mesa = files(
-  'swr_loader.cpp',
-  'swr_clear.cpp',
-  'swr_context.cpp',
-  'swr_context.h',
-  'swr_draw.cpp',
-  'swr_public.h',
-  'swr_resource.h',
-  'swr_screen.cpp',
-  'swr_screen.h',
-  'swr_state.cpp',
-  'swr_state.h',
-  'swr_tex_sample.cpp',
-  'swr_tex_sample.h',
-  'swr_scratch.h',
-  'swr_scratch.cpp',
-  'swr_shader.cpp',
-  'swr_shader.h',
-  'swr_memory.h',
-  'swr_fence.h',
-  'swr_fence.cpp',
-  'swr_fence_work.h',
-  'swr_fence_work.cpp',
-  'swr_query.h',
-  'swr_query.cpp',
-  'rasterizer/jitter/blend_jit.cpp',
-  'rasterizer/jitter/blend_jit.h',
-  'rasterizer/jitter/builder.cpp',
-  'rasterizer/jitter/builder.h',
-  'rasterizer/jitter/builder_math.h',
-  'rasterizer/jitter/builder_mem.cpp',
-  'rasterizer/jitter/builder_mem.h',
-  'rasterizer/jitter/builder_gfx_mem.cpp',
-  'rasterizer/jitter/builder_gfx_mem.h',
-  'rasterizer/jitter/builder_misc.cpp',
-  'rasterizer/jitter/builder_misc.h',
-  'rasterizer/jitter/fetch_jit.cpp',
-  'rasterizer/jitter/fetch_jit.h',
-  'rasterizer/jitter/jit_api.h',
-  'rasterizer/jitter/JitManager.cpp',
-  'rasterizer/jitter/JitManager.h',
-  'rasterizer/jitter/streamout_jit.cpp',
-  'rasterizer/jitter/streamout_jit.h',
-  'rasterizer/jitter/shader_lib/DebugOutput.cpp',
-  'rasterizer/jitter/shader_lib/Scatter.cpp',
-  'rasterizer/jitter/functionpasses/lower_x86.cpp',
-  'rasterizer/memory/SurfaceState.h'
-)
-
-files_swr_arch = files(
-  'rasterizer/archrast/archrast.cpp',
-  'rasterizer/archrast/archrast.h',
-  'rasterizer/archrast/eventmanager.h',
-  'rasterizer/core/api.cpp',
-  'rasterizer/core/api.h',
-  'rasterizer/core/arena.h',
-  'rasterizer/core/backend.cpp',
-  'rasterizer/core/backend_clear.cpp',
-  'rasterizer/core/backend_sample.cpp',
-  'rasterizer/core/backend_singlesample.cpp',
-  'rasterizer/core/backend.h',
-  'rasterizer/core/backend_impl.h',
-  'rasterizer/core/binner.cpp',
-  'rasterizer/core/binner.h',
-  'rasterizer/core/blend.h',
-  'rasterizer/core/clip.cpp',
-  'rasterizer/core/clip.h',
-  'rasterizer/core/conservativeRast.h',
-  'rasterizer/core/context.h',
-  'rasterizer/core/depthstencil.h',
-  'rasterizer/core/fifo.hpp',
-  'rasterizer/core/format_conversion.h',
-  'rasterizer/core/format_traits.h',
-  'rasterizer/core/format_types.h',
-  'rasterizer/core/format_utils.h',
-  'rasterizer/core/frontend.cpp',
-  'rasterizer/core/frontend.h',
-  'rasterizer/core/knobs.h',
-  'rasterizer/core/knobs_init.h',
-  'rasterizer/core/multisample.h',
-  'rasterizer/core/pa_avx.cpp',
-  'rasterizer/core/pa.h',
-  'rasterizer/core/rasterizer.cpp',
-  'rasterizer/core/rasterizer.h',
-  'rasterizer/core/rasterizer_impl.h',
-  'rasterizer/core/rdtsc_core.cpp',
-  'rasterizer/core/rdtsc_core.h',
-  'rasterizer/core/ringbuffer.h',
-  'rasterizer/core/state.h',
-  'rasterizer/core/state_funcs.h',
-  'rasterizer/core/tessellator.h',
-  'rasterizer/core/tessellator.hpp',
-  'rasterizer/core/tessellator.cpp',
-  'rasterizer/core/threads.cpp',
-  'rasterizer/core/threads.h',
-  'rasterizer/core/tilemgr.cpp',
-  'rasterizer/core/tilemgr.h',
-  'rasterizer/core/tileset.h',
-  'rasterizer/core/utils.h',
-  'rasterizer/memory/ClearTile.cpp',
-  'rasterizer/memory/Convert.h',
-  'rasterizer/memory/LoadTile.cpp',
-  'rasterizer/memory/LoadTile.h',
-  'rasterizer/memory/LoadTile_Linear.cpp',
-  'rasterizer/memory/LoadTile_TileX.cpp',
-  'rasterizer/memory/LoadTile_TileY.cpp',
-  'rasterizer/memory/StoreTile.cpp',
-  'rasterizer/memory/StoreTile.h',
-  'rasterizer/memory/StoreTile_Linear2.cpp',
-  'rasterizer/memory/StoreTile_Linear.cpp',
-  'rasterizer/memory/StoreTile_TileW.cpp',
-  'rasterizer/memory/StoreTile_TileX2.cpp',
-  'rasterizer/memory/StoreTile_TileX.cpp',
-  'rasterizer/memory/StoreTile_TileY2.cpp',
-  'rasterizer/memory/StoreTile_TileY.cpp',
-  'rasterizer/memory/TilingFunctions.h',
-  'rasterizer/memory/tilingtraits.h',
-  'rasterizer/memory/InitMemory.h',
-  'rasterizer/memory/InitMemory.cpp',
-  'rasterizer/memory/SurfaceState.h'
-)
-
-swr_context_files = files('swr_context.h')
-swr_state_files = files('rasterizer/core/state.h')
-swr_surf_state_files = files('rasterizer/memory/SurfaceState.h')
-swr_event_proto_files = files('rasterizer/archrast/events.proto')
-swr_event_pproto_files = files('rasterizer/archrast/events_private.proto')
-swr_gen_backend_files = files('rasterizer/codegen/templates/gen_backend.cpp')
-swr_gen_rasterizer_files = files('rasterizer/codegen/templates/gen_rasterizer.cpp')
-swr_gen_header_init_files = files('rasterizer/codegen/templates/gen_header_init.hpp')
-
-swr_gen_llvm_ir_macros_py = files('rasterizer/codegen/gen_llvm_ir_macros.py')
-swr_gen_backends_py = files('rasterizer/codegen/gen_backends.py')
-
-swr_gen_builder_depends = files(
-    'rasterizer/codegen/templates/gen_builder.hpp',
-    'rasterizer/codegen/gen_common.py'
-    )
-
-
-subdir('rasterizer/jitter')
-subdir('rasterizer/codegen')
-subdir('rasterizer/core/backends')
-
-swr_incs = include_directories(
-  'rasterizer/codegen', 'rasterizer/core', 'rasterizer/jitter',
-  'rasterizer/archrast', 'rasterizer',
-)
-
-swr_cpp_args = []
-if cpp.has_argument('-fno-strict-aliasing')
-  swr_cpp_args += '-fno-strict-aliasing'
-endif
-if cpp.has_argument('-Wno-aligned-new')
-  swr_cpp_args += '-Wno-aligned-new'
-endif
-
-
-swr_arch_libs = []
-swr_defines = []
-
-swr_avx_args = cpp.first_supported_argument(
-  '-target-cpu=sandybridge', '-mavx', '-march=core-avx', '-tp=sandybridge',
-  '/arch:AVX',
-)
-if swr_avx_args == []
-  error('Cannot find AVX support for swr. (these are required for SWR an all architectures.)')
-endif
-
-shared_swr = get_option('shared-swr')
-if not shared_swr
-  if with_swr_arches.length() > 1
-    error('When SWR is linked statically only one architecture is allowed.')
-  endif
-  swr_defines += '-DHAVE_SWR_BUILTIN'
-endif
-
-if with_swr_arches.contains('skx')
-  swr_skx_args = cpp.first_supported_argument(
-    '-march=skylake-avx512', '-target-cpu=x86-skylake', '-xCORE-AVX512',
-  )
-  if swr_skx_args == []
-    error('Cannot find SKX support for swr.')
-  endif
-
-  swr_defines += '-DHAVE_SWR_SKX'
-  if shared_swr
-    swr_arch_libs += shared_library(
-      'swrSKX',
-      [files_swr_common, files_swr_arch],
-      cpp_args : [
-        cpp_msvc_compat_args, swr_cpp_args, swr_skx_args,
-        '-DKNOB_ARCH=KNOB_ARCH_AVX512',
-      ],
-      gnu_symbol_visibility : 'hidden',
-      link_args : [ld_args_gc_sections],
-      include_directories : [swr_incs],
-      dependencies : [dep_thread, dep_llvm],
-      version : '0.0.0',
-      soversion : host_machine.system() == 'windows' ? '' : '0',
-      install : true,
-      name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
-    )
-  else
-    swr_arch_libs += static_library(
-      'swrSKX',
-      [files_swr_common, files_swr_arch],
-      cpp_args : [
-        cpp_msvc_compat_args, swr_cpp_args, swr_skx_args,
-        '-DKNOB_ARCH=KNOB_ARCH_AVX512',
-      ],
-      gnu_symbol_visibility : 'hidden',
-      link_args : [ld_args_gc_sections],
-      include_directories : [swr_incs],
-      dependencies : [dep_thread, dep_llvm],
-    )
-  endif
-endif
-
-if with_swr_arches.contains('knl')
-  swr_knl_args = cpp.first_supported_argument(
-    '-march=knl', '-target-cpu=mic-knl', '-xMIC-AVX512',
-  )
-  if swr_knl_args == []
-    error('Cannot find KNL support for swr.')
-  endif
-
-  swr_defines += '-DHAVE_SWR_KNL'
-  if shared_swr
-    swr_arch_libs += shared_library(
-      'swrKNL',
-      [files_swr_common, files_swr_arch],
-      cpp_args : [
-        cpp_msvc_compat_args, swr_cpp_args, swr_knl_args,
-        '-DKNOB_ARCH=KNOB_ARCH_AVX512', '-DSIMD_ARCH_KNIGHTS',
-      ],
-      gnu_symbol_visibility : 'hidden',
-      link_args : [ld_args_gc_sections],
-      include_directories : [swr_incs],
-      dependencies : [dep_thread, dep_llvm],
-      version : '0.0.0',
-      soversion : host_machine.system() == 'windows' ? '' : '0',
-      install : true,
-      name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
-    )
-  else
-    swr_arch_libs += static_library(
-      'swrKNL',
-      [files_swr_common, files_swr_arch],
-      cpp_args : [
-        cpp_msvc_compat_args, swr_cpp_args, swr_knl_args,
-        '-DKNOB_ARCH=KNOB_ARCH_AVX512', '-DSIMD_ARCH_KNIGHTS',
-      ],
-      gnu_symbol_visibility : 'hidden',
-      link_args : [ld_args_gc_sections],
-      include_directories : [swr_incs],
-      dependencies : [dep_thread, dep_llvm],
-    )
-  endif
-endif
-
-
-if with_swr_arches.contains('avx2')
-  swr_avx2_args = cpp.first_supported_argument(
-    '-target-cpu=haswell', '-march=core-avx2', '-tp=haswell', '/arch:AVX2',
-  )
-  if swr_avx2_args == []
-    if cpp.has_argument(['-mavx2', '-mfma', '-mbmi2', '-mf16c'])
-      swr_avx2_args = ['-mavx2', '-mfma', '-mbmi2', '-mf16c']
-    else
-      error('Cannot find AVX2 support for swr.')
-    endif
-  endif
-
-  swr_defines += '-DHAVE_SWR_AVX2'
-  if shared_swr
-    swr_arch_libs += shared_library(
-      'swrAVX2',
-      [files_swr_common, files_swr_arch],
-      cpp_args : [
-        cpp_msvc_compat_args, swr_cpp_args, swr_avx2_args,
-        '-DKNOB_ARCH=KNOB_ARCH_AVX2',
-      ],
-      gnu_symbol_visibility : 'hidden',
-      link_args : [ld_args_gc_sections],
-      include_directories : [swr_incs],
-      dependencies : [dep_thread, dep_llvm],
-      version : '0.0.0',
-      soversion : host_machine.system() == 'windows' ? '' : '0',
-      install : true,
-      name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
-    )
-  else
-    swr_arch_libs += static_library(
-      'swrAVX2',
-      [files_swr_common, files_swr_arch],
-      cpp_args : [
-        cpp_msvc_compat_args, swr_cpp_args, swr_avx2_args,
-        '-DKNOB_ARCH=KNOB_ARCH_AVX2',
-      ],
-      gnu_symbol_visibility : 'hidden',
-      link_args : [ld_args_gc_sections],
-      include_directories : [swr_incs],
-      dependencies : [dep_thread, dep_llvm],
-    )
-  endif
-endif
-
-if with_swr_arches.contains('avx')
-  swr_defines += '-DHAVE_SWR_AVX'
-  if shared_swr
-    swr_arch_libs += shared_library(
-      'swrAVX',
-      [files_swr_common, files_swr_arch],
-      cpp_args : [
-        cpp_msvc_compat_args, swr_cpp_args, swr_avx_args,
-        '-DKNOB_ARCH=KNOB_ARCH_AVX',
-      ],
-      gnu_symbol_visibility : 'hidden',
-      link_args : [ld_args_gc_sections],
-      include_directories : [swr_incs],
-      dependencies : [dep_thread, dep_llvm],
-      version : '0.0.0',
-      soversion : host_machine.system() == 'windows' ? '' : '0',
-      install : true,
-      name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
-    )
-  else
-    swr_arch_libs += static_library(
-      'swrAVX',
-      [files_swr_common, files_swr_arch],
-      cpp_args : [
-        cpp_msvc_compat_args, swr_cpp_args, swr_avx_args,
-        '-DKNOB_ARCH=KNOB_ARCH_AVX',
-      ],
-      gnu_symbol_visibility : 'hidden',
-      link_args : [ld_args_gc_sections],
-      include_directories : [swr_incs],
-      dependencies : [dep_thread, dep_llvm],
-    )
-  endif
-endif
-
-
-if swr_arch_libs == []
-  error('SWR configured, but no SWR architectures configured')
-endif
-
-# The swr_avx_args are needed for intrensic usage in swr api headers.
-libmesaswr = static_library(
-  'mesaswr',
-  [files_swr_mesa, files_swr_common, gen_knobs_h, gen_knobs_cpp,
-   gen_builder_hpp, gen_builder_meta_hpp, gen_builder_intrin_hpp],
-  cpp_args : [
-    cpp_msvc_compat_args, swr_cpp_args, swr_avx_args,
-    swr_defines,
-  ],
-  gnu_symbol_visibility : 'hidden',
-  include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, swr_incs],
-  dependencies : [dep_llvm, idep_mesautil],
-)
-
-link_libs = [libmesaswr]
-if not shared_swr
-  link_libs += swr_arch_libs
-endif
-
-driver_swr = declare_dependency(
-  compile_args : '-DGALLIUM_SWR',
-  link_with : link_libs
-)
diff --git a/src/gallium/drivers/swr/rasterizer/.dir-locals.el b/src/gallium/drivers/swr/rasterizer/.dir-locals.el
deleted file mode 100644
index 2b04c18a9bb..00000000000
--- a/src/gallium/drivers/swr/rasterizer/.dir-locals.el
+++ /dev/null
@@ -1,8 +0,0 @@
-((prog-mode
-  (c-basic-offset . 4)
-  (c-file-style . "k&r")
-  (fill-column . 78)
-  (indent-tabs-mode . nil)
-  (show-trailing-whitespace . t)
-  )
- )
diff --git a/src/gallium/drivers/swr/rasterizer/_clang-format b/src/gallium/drivers/swr/rasterizer/_clang-format
deleted file mode 100644
index ed4b9b409d8..00000000000
--- a/src/gallium/drivers/swr/rasterizer/_clang-format
+++ /dev/null
@@ -1,114 +0,0 @@
----
-Language:        Cpp
-# BasedOnStyle:  LLVM
-AccessModifierOffset: -4
-AlignAfterOpenBracket: Align
-AlignConsecutiveAssignments: true
-AlignConsecutiveDeclarations: true
-AlignEscapedNewlines: Left
-AlignOperands:   true
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: false
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: Inline
-AllowShortIfStatementsOnASingleLine: false
-AllowShortLoopsOnASingleLine: false
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: false
-AlwaysBreakTemplateDeclarations: true
-BinPackArguments: false
-BinPackParameters: false
-BraceWrapping:   
-  AfterClass:      true
-  AfterControlStatement: true
-  AfterEnum:       true
-  AfterFunction:   true
-  AfterNamespace:  true
-  AfterObjCDeclaration: true
-  AfterStruct:     true
-  AfterUnion:      true
-  #AfterExternBlock: false
-  BeforeCatch:     true
-  BeforeElse:      true
-  IndentBraces:    false
-  SplitEmptyFunction: true
-  SplitEmptyRecord: true
-  SplitEmptyNamespace: true
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Custom
-BreakBeforeInheritanceComma: false
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-BreakConstructorInitializers: AfterColon
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: true
-ColumnLimit:     100
-CommentPragmas:  '^ IWYU pragma:'
-CompactNamespaces: false
-ConstructorInitializerAllOnOneLineOrOnePerLine: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: true
-DerivePointerAlignment: false
-DisableFormat:   false
-ExperimentalAutoDetectBinPacking: false
-FixNamespaceComments: true
-ForEachMacros:   
-  - foreach
-  - Q_FOREACH
-  - BOOST_FOREACH
-#IncludeBlocks:   Preserve
-IncludeCategories: 
-  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
-    Priority:        2
-  - Regex:           '^(<|"(gtest|gmock|isl|json)/)'
-    Priority:        3
-  - Regex:           '.*'
-    Priority:        1
-IncludeIsMainRegex: '(Test)?$'
-IndentCaseLabels: false
-#IndentPPDirectives: AfterHash
-IndentWidth:     4
-IndentWrappedFunctionNames: false
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd:   ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: All
-ObjCBlockIndentWidth: 4
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: true
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 19
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 60
-PointerAlignment: Left
-#RawStringFormats: 
-#  - Delimiter:       pb
-#    Language:        TextProto
-#    BasedOnStyle:    google
-ReflowComments:  true
-SortIncludes:    false
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeParens: ControlStatements
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 1
-SpacesInAngles:  false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard:        Cpp11
-TabWidth:        4
-UseTab:          Never
-...
diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
deleted file mode 100644
index bcdc6d01358..00000000000
--- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
+++ /dev/null
@@ -1,708 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file archrast.cpp
- *
- * @brief Implementation for archrast.
- *
- ******************************************************************************/
-#include <sys/stat.h>
-
-#include <atomic>
-#include <map>
-
-#include "common/os.h"
-#include "archrast/archrast.h"
-#include "archrast/eventmanager.h"
-#include "gen_ar_event.hpp"
-#include "gen_ar_eventhandlerfile.hpp"
-
-namespace ArchRast
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief struct that keeps track of depth and stencil event information
-    struct DepthStencilStats
-    {
-        uint32_t earlyZTestPassCount       = 0;
-        uint32_t earlyZTestFailCount       = 0;
-        uint32_t lateZTestPassCount        = 0;
-        uint32_t lateZTestFailCount        = 0;
-        uint32_t earlyStencilTestPassCount = 0;
-        uint32_t earlyStencilTestFailCount = 0;
-        uint32_t lateStencilTestPassCount  = 0;
-        uint32_t lateStencilTestFailCount  = 0;
-    };
-
-    struct CStats
-    {
-        uint32_t trivialRejectCount;
-        uint32_t trivialAcceptCount;
-        uint32_t mustClipCount;
-    };
-
-    struct TEStats
-    {
-        uint32_t inputPrims = 0;
-        //@todo:: Change this to numPatches. Assumed: 1 patch per prim. If holds, its fine.
-    };
-
-    struct GSStateInfo
-    {
-        uint32_t inputPrimCount;
-        uint32_t primGeneratedCount;
-        uint32_t vertsInput;
-    };
-
-    struct RastStats
-    {
-        uint32_t rasterTiles = 0;
-    };
-
-    struct CullStats
-    {
-        uint32_t degeneratePrimCount = 0;
-        uint32_t backfacePrimCount   = 0;
-    };
-
-    struct AlphaStats
-    {
-        uint32_t alphaTestCount  = 0;
-        uint32_t alphaBlendCount = 0;
-    };
-
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Event handler that handles API thread events. This is shared
-    ///        between the API and its caller (e.g. driver shim) but typically
-    ///        there is only a single API thread per context. So you can save
-    ///        information in the class to be used for other events.
-    class EventHandlerApiStats : public EventHandlerFile
-    {
-    public:
-        EventHandlerApiStats(uint32_t id) : EventHandlerFile(id)
-        {
-#if defined(_WIN32)
-            // Attempt to copy the events.proto file to the ArchRast output dir. It's common for
-            // tools to place the events.proto file in the DEBUG_OUTPUT_DIR when launching AR. If it
-            // exists, this will attempt to copy it the first time we get here to package it with
-            // the stats. Otherwise, the user would need to specify the events.proto location when
-            // parsing the stats in post.
-            std::stringstream eventsProtoSrcFilename, eventsProtoDstFilename;
-            eventsProtoSrcFilename << KNOB_DEBUG_OUTPUT_DIR << "\\events.proto" << std::ends;
-            eventsProtoDstFilename << mOutputDir.substr(0, mOutputDir.size() - 1)
-                                   << "\\events.proto" << std::ends;
-
-            // If event.proto already exists, we're done; else do the copy
-            struct stat buf; // Use a Posix stat for file existence check
-            if (!stat(eventsProtoDstFilename.str().c_str(), &buf) == 0)
-            {
-                // Now check to make sure the events.proto source exists
-                if (stat(eventsProtoSrcFilename.str().c_str(), &buf) == 0)
-                {
-                    std::ifstream srcFile;
-                    srcFile.open(eventsProtoSrcFilename.str().c_str(), std::ios::binary);
-                    if (srcFile.is_open())
-                    {
-                        // Just do a binary buffer copy
-                        std::ofstream dstFile;
-                        dstFile.open(eventsProtoDstFilename.str().c_str(), std::ios::binary);
-                        dstFile << srcFile.rdbuf();
-                        dstFile.close();
-                    }
-                    srcFile.close();
-                }
-            }
-#endif
-        }
-
-        virtual void Handle(const DrawInstancedEvent& event)
-        {
-            DrawInfoEvent e(event.data.drawId,
-                            ArchRast::Instanced,
-                            event.data.topology,
-                            event.data.numVertices,
-                            0,
-                            0,
-                            event.data.startVertex,
-                            event.data.numInstances,
-                            event.data.startInstance,
-                            event.data.tsEnable,
-                            event.data.gsEnable,
-                            event.data.soEnable,
-                            event.data.soTopology,
-                            event.data.splitId);
-
-            EventHandlerFile::Handle(e);
-        }
-
-        virtual void Handle(const DrawIndexedInstancedEvent& event)
-        {
-            DrawInfoEvent e(event.data.drawId,
-                            ArchRast::IndexedInstanced,
-                            event.data.topology,
-                            0,
-                            event.data.numIndices,
-                            event.data.indexOffset,
-                            event.data.baseVertex,
-                            event.data.numInstances,
-                            event.data.startInstance,
-                            event.data.tsEnable,
-                            event.data.gsEnable,
-                            event.data.soEnable,
-                            event.data.soTopology,
-                            event.data.splitId);
-
-            EventHandlerFile::Handle(e);
-        }
-    };
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Event handler that handles worker thread events. There is one
-    ///        event handler per thread. The python script will need to sum
-    ///        up counters across all of the threads.
-    class EventHandlerWorkerStats : public EventHandlerFile
-    {
-    public:
-        EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), mNeedFlush(false)
-        {
-            memset(mShaderStats, 0, sizeof(mShaderStats));
-        }
-
-        virtual void Handle(const EarlyDepthStencilInfoSingleSample& event)
-        {
-            // earlyZ test compute
-            mDSSingleSample.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSSingleSample.earlyZTestFailCount +=
-                _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
-
-            // earlyStencil test compute
-            mDSSingleSample.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSSingleSample.earlyStencilTestFailCount +=
-                _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
-
-            // earlyZ test single and multi sample
-            mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSCombined.earlyZTestFailCount +=
-                _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
-
-            // earlyStencil test single and multi sample
-            mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSCombined.earlyStencilTestFailCount +=
-                _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
-
-            mNeedFlush = true;
-        }
-
-        virtual void Handle(const EarlyDepthStencilInfoSampleRate& event)
-        {
-            // earlyZ test compute
-            mDSSampleRate.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSSampleRate.earlyZTestFailCount +=
-                _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
-
-            // earlyStencil test compute
-            mDSSampleRate.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSSampleRate.earlyStencilTestFailCount +=
-                _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
-
-            // earlyZ test single and multi sample
-            mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSCombined.earlyZTestFailCount +=
-                _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
-
-            // earlyStencil test single and multi sample
-            mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSCombined.earlyStencilTestFailCount +=
-                _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
-
-            mNeedFlush = true;
-        }
-
-        virtual void Handle(const EarlyDepthStencilInfoNullPS& event)
-        {
-            // earlyZ test compute
-            mDSNullPS.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSNullPS.earlyZTestFailCount +=
-                _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
-
-            // earlyStencil test compute
-            mDSNullPS.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSNullPS.earlyStencilTestFailCount +=
-                _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
-            mNeedFlush = true;
-        }
-
-        /// Accumulates the late depth/stencil results carried by a
-        /// LateDepthStencilInfoSingleSample event into both the single-sample counters and
-        /// the combined (single + multi sample) counters, and marks the draw as needing a
-        /// flush. A bit counts as failed when it is set in coverageMask but clear in the
-        /// pass mask.
-        virtual void Handle(const LateDepthStencilInfoSingleSample& event)
-        {
-            // Bitwise '~' is required here: logical '!' collapses the mask to 0/1, so at most
-            // bit 0 of the coverage mask could ever be counted as a failure.
-            const uint32_t depthPassed = _mm_popcnt_u32(event.data.depthPassMask);
-            const uint32_t depthFailed =
-                _mm_popcnt_u32(~event.data.depthPassMask & event.data.coverageMask);
-            const uint32_t stencilPassed = _mm_popcnt_u32(event.data.stencilPassMask);
-            const uint32_t stencilFailed =
-                _mm_popcnt_u32(~event.data.stencilPassMask & event.data.coverageMask);
-
-            // lateZ / lateStencil for single sample
-            mDSSingleSample.lateZTestPassCount += depthPassed;
-            mDSSingleSample.lateZTestFailCount += depthFailed;
-            mDSSingleSample.lateStencilTestPassCount += stencilPassed;
-            mDSSingleSample.lateStencilTestFailCount += stencilFailed;
-
-            // lateZ / lateStencil for single and multi sample combined
-            mDSCombined.lateZTestPassCount += depthPassed;
-            mDSCombined.lateZTestFailCount += depthFailed;
-            mDSCombined.lateStencilTestPassCount += stencilPassed;
-            mDSCombined.lateStencilTestFailCount += stencilFailed;
-
-            mNeedFlush = true;
-        }
-
-        /// Accumulates the late depth/stencil results carried by a
-        /// LateDepthStencilInfoSampleRate event into both the sample-rate counters and the
-        /// combined (single + multi sample) counters, and marks the draw as needing a flush.
-        /// A bit counts as failed when it is set in coverageMask but clear in the pass mask.
-        virtual void Handle(const LateDepthStencilInfoSampleRate& event)
-        {
-            // Bitwise '~' is required here: logical '!' collapses the mask to 0/1, so at most
-            // bit 0 of the coverage mask could ever be counted as a failure.
-            const uint32_t depthPassed = _mm_popcnt_u32(event.data.depthPassMask);
-            const uint32_t depthFailed =
-                _mm_popcnt_u32(~event.data.depthPassMask & event.data.coverageMask);
-            const uint32_t stencilPassed = _mm_popcnt_u32(event.data.stencilPassMask);
-            const uint32_t stencilFailed =
-                _mm_popcnt_u32(~event.data.stencilPassMask & event.data.coverageMask);
-
-            // lateZ / lateStencil for sample rate
-            mDSSampleRate.lateZTestPassCount += depthPassed;
-            mDSSampleRate.lateZTestFailCount += depthFailed;
-            mDSSampleRate.lateStencilTestPassCount += stencilPassed;
-            mDSSampleRate.lateStencilTestFailCount += stencilFailed;
-
-            // lateZ / lateStencil for single and multi sample combined
-            mDSCombined.lateZTestPassCount += depthPassed;
-            mDSCombined.lateZTestFailCount += depthFailed;
-            mDSCombined.lateStencilTestPassCount += stencilPassed;
-            mDSCombined.lateStencilTestFailCount += stencilFailed;
-
-            mNeedFlush = true;
-        }
-
-        /// Accumulates the late depth/stencil results carried by a
-        /// LateDepthStencilInfoNullPS event into the NullPS counters and marks the draw as
-        /// needing a flush. A bit counts as failed when it is set in coverageMask but clear
-        /// in the pass mask.
-        virtual void Handle(const LateDepthStencilInfoNullPS& event)
-        {
-            // lateZ test compute
-            // ('~' rather than '!': the complement must be taken per bit, not as a boolean.)
-            mDSNullPS.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSNullPS.lateZTestFailCount +=
-                _mm_popcnt_u32(~event.data.depthPassMask & event.data.coverageMask);
-
-            // lateStencil test compute
-            mDSNullPS.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSNullPS.lateStencilTestFailCount +=
-                _mm_popcnt_u32(~event.data.stencilPassMask & event.data.coverageMask);
-
-            mNeedFlush = true;
-        }
-
-        /// Adds the event's depth pass count to the pixel-rate early-Z pass counter and the
-        /// remaining active lanes to the early-Z fail counter.
-        virtual void Handle(const EarlyDepthInfoPixelRate& event)
-        {
-            // Lanes that were active but did not pass are the failures.
-            const auto failedLanes =
-                (_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount);
-
-            mDSPixelRate.earlyZTestPassCount += event.data.depthPassCount;
-            mDSPixelRate.earlyZTestFailCount += failedLanes;
-            mNeedFlush = true;
-        }
-
-
-        /// Adds the event's depth pass count to the pixel-rate late-Z pass counter and the
-        /// remaining active lanes to the late-Z fail counter.
-        virtual void Handle(const LateDepthInfoPixelRate& event)
-        {
-            // Lanes that were active but did not pass are the failures.
-            const auto failedLanes =
-                (_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount);
-
-            mDSPixelRate.lateZTestPassCount += event.data.depthPassCount;
-            mDSPixelRate.lateZTestFailCount += failedLanes;
-            mNeedFlush = true;
-        }
-
-
-        /// Updates the clipper counters from one ClipInfoEvent:
-        ///  - must-clip:      bits set in clipMask
-        ///  - trivial reject: numInvocations minus the bits set in validMask
-        ///  - trivial accept: bits set in validMask and clear in clipMask
-        virtual void Handle(const ClipInfoEvent& event)
-        {
-            const auto mustClip = _mm_popcnt_u32(event.data.clipMask);
-            const auto rejected =
-                event.data.numInvocations - _mm_popcnt_u32(event.data.validMask);
-            const auto accepted =
-                _mm_popcnt_u32(event.data.validMask & ~event.data.clipMask);
-
-            mClipper.mustClipCount += mustClip;
-            mClipper.trivialRejectCount += rejected;
-            mClipper.trivialAcceptCount += accepted;
-        }
-
-        /// Adds each counter in *pStatUpdate to the matching counter in *pStatTotals.
-        void UpdateStats(SWR_SHADER_STATS* pStatTotals, const SWR_SHADER_STATS* pStatUpdate)
-        {
-            SWR_SHADER_STATS&       totals = *pStatTotals;
-            const SWR_SHADER_STATS& delta  = *pStatUpdate;
-
-            totals.numInstExecuted += delta.numInstExecuted;
-            totals.numSampleExecuted += delta.numSampleExecuted;
-            totals.numSampleLExecuted += delta.numSampleLExecuted;
-            totals.numSampleBExecuted += delta.numSampleBExecuted;
-            totals.numSampleCExecuted += delta.numSampleCExecuted;
-            totals.numSampleCLZExecuted += delta.numSampleCLZExecuted;
-            totals.numSampleCDExecuted += delta.numSampleCDExecuted;
-            totals.numGather4Executed += delta.numGather4Executed;
-            totals.numGather4CExecuted += delta.numGather4CExecuted;
-            totals.numGather4CPOExecuted += delta.numGather4CPOExecuted;
-            totals.numGather4CPOCExecuted += delta.numGather4CPOCExecuted;
-            totals.numLodExecuted += delta.numLodExecuted;
-        }
-
-        /// Folds the shader stats referenced by the event's hStats handle into the vertex
-        /// shader totals.
-        virtual void Handle(const VSStats& event)
-        {
-            UpdateStats(&mShaderStats[SHADER_VERTEX], (SWR_SHADER_STATS*)event.data.hStats);
-        }
-
-        /// Folds the shader stats referenced by the event's hStats handle into the geometry
-        /// shader totals.
-        virtual void Handle(const GSStats& event)
-        {
-            UpdateStats(&mShaderStats[SHADER_GEOMETRY], (SWR_SHADER_STATS*)event.data.hStats);
-        }
-
-        /// Folds the shader stats referenced by the event's hStats handle into the domain
-        /// shader totals.
-        virtual void Handle(const DSStats& event)
-        {
-            UpdateStats(&mShaderStats[SHADER_DOMAIN], (SWR_SHADER_STATS*)event.data.hStats);
-        }
-
-        /// Folds the shader stats referenced by the event's hStats handle into the hull
-        /// shader totals.
-        virtual void Handle(const HSStats& event)
-        {
-            UpdateStats(&mShaderStats[SHADER_HULL], (SWR_SHADER_STATS*)event.data.hStats);
-        }
-
-        /// Folds the shader stats referenced by the event's hStats handle into the pixel
-        /// shader totals and marks the draw as needing a flush.
-        virtual void Handle(const PSStats& event)
-        {
-            UpdateStats(&mShaderStats[SHADER_PIXEL], (SWR_SHADER_STATS*)event.data.hStats);
-            mNeedFlush = true;
-        }
-
-        /// Folds the shader stats referenced by the event's hStats handle into the compute
-        /// shader totals and marks the draw as needing a flush.
-        virtual void Handle(const CSStats& event)
-        {
-            UpdateStats(&mShaderStats[SHADER_COMPUTE], (SWR_SHADER_STATS*)event.data.hStats);
-            mNeedFlush = true;
-        }
-
-        /// Flushes the statistics cached for this draw: forwards one summary event per
-        /// counter group to EventHandlerFile::Handle, then clears those counters.
-        /// Returns immediately when no handler has set mNeedFlush since the last flush.
-        /// Only the pixel and compute slots of mShaderStats are reported and cleared here.
-        /// NOTE(review): the late NullPS counters in mDSNullPS are cleared below but no event
-        /// reports them - confirm that is intended.
-        virtual void FlushDraw(uint32_t drawId)
-        {
-            if (!mNeedFlush)
-            {
-                return;
-            }
-
-            // Shader totals
-            SWR_SHADER_STATS& ps = mShaderStats[SHADER_PIXEL];
-            SWR_SHADER_STATS& cs = mShaderStats[SHADER_COMPUTE];
-
-            EventHandlerFile::Handle(PSInfo(drawId,
-                                            ps.numInstExecuted,
-                                            ps.numSampleExecuted,
-                                            ps.numSampleLExecuted,
-                                            ps.numSampleBExecuted,
-                                            ps.numSampleCExecuted,
-                                            ps.numSampleCLZExecuted,
-                                            ps.numSampleCDExecuted,
-                                            ps.numGather4Executed,
-                                            ps.numGather4CExecuted,
-                                            ps.numGather4CPOExecuted,
-                                            ps.numGather4CPOCExecuted,
-                                            ps.numLodExecuted));
-            EventHandlerFile::Handle(CSInfo(drawId,
-                                            cs.numInstExecuted,
-                                            cs.numSampleExecuted,
-                                            cs.numSampleLExecuted,
-                                            cs.numSampleBExecuted,
-                                            cs.numSampleCExecuted,
-                                            cs.numSampleCLZExecuted,
-                                            cs.numSampleCDExecuted,
-                                            cs.numGather4Executed,
-                                            cs.numGather4CExecuted,
-                                            cs.numGather4CPOExecuted,
-                                            cs.numGather4CPOCExecuted,
-                                            cs.numLodExecuted));
-
-            // Depth/stencil: single sample
-            DepthStencilStats& single = mDSSingleSample;
-            EventHandlerFile::Handle(
-                EarlyZSingleSample(drawId, single.earlyZTestPassCount, single.earlyZTestFailCount));
-            EventHandlerFile::Handle(
-                LateZSingleSample(drawId, single.lateZTestPassCount, single.lateZTestFailCount));
-            EventHandlerFile::Handle(EarlyStencilSingleSample(
-                drawId, single.earlyStencilTestPassCount, single.earlyStencilTestFailCount));
-            EventHandlerFile::Handle(LateStencilSingleSample(
-                drawId, single.lateStencilTestPassCount, single.lateStencilTestFailCount));
-
-            // Depth/stencil: sample rate
-            DepthStencilStats& rate = mDSSampleRate;
-            EventHandlerFile::Handle(
-                EarlyZSampleRate(drawId, rate.earlyZTestPassCount, rate.earlyZTestFailCount));
-            EventHandlerFile::Handle(
-                LateZSampleRate(drawId, rate.lateZTestPassCount, rate.lateZTestFailCount));
-            EventHandlerFile::Handle(EarlyStencilSampleRate(
-                drawId, rate.earlyStencilTestPassCount, rate.earlyStencilTestFailCount));
-            EventHandlerFile::Handle(LateStencilSampleRate(
-                drawId, rate.lateStencilTestPassCount, rate.lateStencilTestFailCount));
-
-            // Depth/stencil: combined
-            DepthStencilStats& all = mDSCombined;
-            EventHandlerFile::Handle(
-                EarlyZ(drawId, all.earlyZTestPassCount, all.earlyZTestFailCount));
-            EventHandlerFile::Handle(
-                LateZ(drawId, all.lateZTestPassCount, all.lateZTestFailCount));
-            EventHandlerFile::Handle(EarlyStencil(
-                drawId, all.earlyStencilTestPassCount, all.earlyStencilTestFailCount));
-            EventHandlerFile::Handle(LateStencil(
-                drawId, all.lateStencilTestPassCount, all.lateStencilTestFailCount));
-
-            // Depth: pixel rate
-            EventHandlerFile::Handle(EarlyZPixelRate(
-                drawId, mDSPixelRate.earlyZTestPassCount, mDSPixelRate.earlyZTestFailCount));
-            EventHandlerFile::Handle(LateZPixelRate(
-                drawId, mDSPixelRate.lateZTestPassCount, mDSPixelRate.lateZTestFailCount));
-
-            // Depth/stencil: NullPS (early results only)
-            EventHandlerFile::Handle(
-                EarlyZNullPS(drawId, mDSNullPS.earlyZTestPassCount, mDSNullPS.earlyZTestFailCount));
-            EventHandlerFile::Handle(EarlyStencilNullPS(
-                drawId, mDSNullPS.earlyStencilTestPassCount, mDSNullPS.earlyStencilTestFailCount));
-
-            // Raster tile count
-            EventHandlerFile::Handle(RasterTiles(drawId, rastStats.rasterTiles));
-
-            // Alpha test / blend counts
-            EventHandlerFile::Handle(
-                AlphaEvent(drawId, mAlphaStats.alphaTestCount, mAlphaStats.alphaBlendCount));
-
-            // Primitive culling
-            EventHandlerFile::Handle(
-                CullEvent(drawId, mCullStats.backfacePrimCount, mCullStats.degeneratePrimCount));
-
-            // Everything reported above starts from zero for the next draw.
-            single       = {};
-            rate         = {};
-            all          = {};
-            mDSPixelRate = {};
-            mDSNullPS    = {};
-
-            rastStats   = {};
-            mCullStats  = {};
-            mAlphaStats = {};
-
-            ps = {};
-            cs = {};
-
-            mNeedFlush = false;
-        }
-
-        /// On front-end draw end, forwards the accumulated clipper, tessellator, geometry
-        /// shader and VS/HS/DS/GS shader totals for the event's draw id to
-        /// EventHandlerFile::Handle, then clears those accumulators.
-        virtual void Handle(const FrontendDrawEndEvent& event)
-        {
-            const auto drawId = event.data.drawId;
-
-            // Clipper
-            EventHandlerFile::Handle(ClipperEvent(drawId,
-                                                  mClipper.trivialRejectCount,
-                                                  mClipper.trivialAcceptCount,
-                                                  mClipper.mustClipCount));
-
-            // Tessellator
-            EventHandlerFile::Handle(TessPrims(drawId, mTS.inputPrims));
-
-            // Geometry shader primitive/vertex counts
-            EventHandlerFile::Handle(GSInputPrims(drawId, mGS.inputPrimCount));
-            EventHandlerFile::Handle(GSPrimsGen(drawId, mGS.primGeneratedCount));
-            EventHandlerFile::Handle(GSVertsInput(drawId, mGS.vertsInput));
-
-            // Per-stage shader totals
-            SWR_SHADER_STATS& vs = mShaderStats[SHADER_VERTEX];
-            SWR_SHADER_STATS& hs = mShaderStats[SHADER_HULL];
-            SWR_SHADER_STATS& ds = mShaderStats[SHADER_DOMAIN];
-            SWR_SHADER_STATS& gs = mShaderStats[SHADER_GEOMETRY];
-
-            EventHandlerFile::Handle(VSInfo(drawId,
-                                            vs.numInstExecuted,
-                                            vs.numSampleExecuted,
-                                            vs.numSampleLExecuted,
-                                            vs.numSampleBExecuted,
-                                            vs.numSampleCExecuted,
-                                            vs.numSampleCLZExecuted,
-                                            vs.numSampleCDExecuted,
-                                            vs.numGather4Executed,
-                                            vs.numGather4CExecuted,
-                                            vs.numGather4CPOExecuted,
-                                            vs.numGather4CPOCExecuted,
-                                            vs.numLodExecuted));
-            EventHandlerFile::Handle(HSInfo(drawId,
-                                            hs.numInstExecuted,
-                                            hs.numSampleExecuted,
-                                            hs.numSampleLExecuted,
-                                            hs.numSampleBExecuted,
-                                            hs.numSampleCExecuted,
-                                            hs.numSampleCLZExecuted,
-                                            hs.numSampleCDExecuted,
-                                            hs.numGather4Executed,
-                                            hs.numGather4CExecuted,
-                                            hs.numGather4CPOExecuted,
-                                            hs.numGather4CPOCExecuted,
-                                            hs.numLodExecuted));
-            EventHandlerFile::Handle(DSInfo(drawId,
-                                            ds.numInstExecuted,
-                                            ds.numSampleExecuted,
-                                            ds.numSampleLExecuted,
-                                            ds.numSampleBExecuted,
-                                            ds.numSampleCExecuted,
-                                            ds.numSampleCLZExecuted,
-                                            ds.numSampleCDExecuted,
-                                            ds.numGather4Executed,
-                                            ds.numGather4CExecuted,
-                                            ds.numGather4CPOExecuted,
-                                            ds.numGather4CPOCExecuted,
-                                            ds.numLodExecuted));
-            EventHandlerFile::Handle(GSInfo(drawId,
-                                            gs.numInstExecuted,
-                                            gs.numSampleExecuted,
-                                            gs.numSampleLExecuted,
-                                            gs.numSampleBExecuted,
-                                            gs.numSampleCExecuted,
-                                            gs.numSampleCLZExecuted,
-                                            gs.numSampleCDExecuted,
-                                            gs.numGather4Executed,
-                                            gs.numGather4CExecuted,
-                                            gs.numGather4CPOExecuted,
-                                            gs.numGather4CPOCExecuted,
-                                            gs.numLodExecuted));
-
-            // Reset the shader totals that were just reported
-            vs = {};
-            hs = {};
-            ds = {};
-            gs = {};
-
-            // Reset internal counters
-            mClipper = {};
-            mTS      = {};
-            mGS      = {};
-        }
-
-        /// Adds the event's geometry shader primitive and vertex counts to the running
-        /// per-draw totals.
-        virtual void Handle(const GSPrimInfo& event)
-        {
-            const auto& counts = event.data;
-
-            mGS.inputPrimCount += counts.inputPrimCount;
-            mGS.primGeneratedCount += counts.primGeneratedCount;
-            mGS.vertsInput += counts.vertsInput;
-        }
-
-        /// Adds the event's primitive count to the tessellation input-primitive total.
-        virtual void Handle(const TessPrimCount& event)
-        {
-            mTS.inputPrims += event.data.primCount;
-        }
-
-        /// Adds the event's raster tile count to the running total.
-        virtual void Handle(const RasterTileCount& event)
-        {
-            const auto tiles = event.data.rasterTiles;
-            rastStats.rasterTiles += tiles;
-        }
-
-        /// Counts, among the bits set in validMask, those also flagged in
-        /// degeneratePrimMask and those also flagged in backfacePrimMask.
-        virtual void Handle(const CullInfoEvent& event)
-        {
-            // valid ^ (valid & ~mask) leaves exactly the valid bits whose mask bit is set.
-            const auto validLanes = event.data.validMask;
-            const auto degenerateLanes =
-                validLanes ^ (validLanes & ~event.data.degeneratePrimMask);
-            const auto backfaceLanes = validLanes ^ (validLanes & ~event.data.backfacePrimMask);
-
-            mCullStats.degeneratePrimCount += _mm_popcnt_u32(degenerateLanes);
-            mCullStats.backfacePrimCount += _mm_popcnt_u32(backfaceLanes);
-        }
-
-        /// Adds the event's alpha-test and alpha-blend enable values to their counters.
-        virtual void Handle(const AlphaInfoEvent& event)
-        {
-            const auto& alpha = event.data;
-
-            mAlphaStats.alphaBlendCount += alpha.alphaBlendEnable;
-            mAlphaStats.alphaTestCount += alpha.alphaTestEnable;
-        }
-
-    protected:
-        // Set by the handlers whose data FlushDraw reports; cleared at the end of FlushDraw.
-        // NOTE(review): no in-class initializer - presumably set by the constructor, which
-        // is not visible here; confirm.
-        bool mNeedFlush;
-        // Per draw stats
-        DepthStencilStats mDSSingleSample = {};
-        DepthStencilStats mDSSampleRate   = {};
-        DepthStencilStats mDSPixelRate    = {};
-        DepthStencilStats mDSCombined     = {};
-        DepthStencilStats mDSNullPS       = {};
-        // NOTE(review): mDSOmZ is not referenced by any handler in this part of the file.
-        DepthStencilStats mDSOmZ          = {};
-        CStats            mClipper        = {};
-        TEStats           mTS             = {};
-        GSStateInfo       mGS             = {};
-        RastStats         rastStats       = {};
-        CullStats         mCullStats      = {};
-        AlphaStats        mAlphaStats     = {};
-
-        // Accumulated shader stats, indexed by shader type (SHADER_VERTEX, SHADER_PIXEL, ...).
-        SWR_SHADER_STATS mShaderStats[NUM_SHADER_TYPES];
-
-    };
-
-    // Reinterprets an opaque thread-context handle as the EventManager it refers to.
-    static EventManager* FromHandle(HANDLE hThreadContext)
-    {
-        EventManager* const pManager = reinterpret_cast<EventManager*>(hThreadContext);
-        return pManager;
-    }
-
-    // Creates an event manager, attaches the stats handler that matches the thread type
-    // (API stats for AR_THREAD::API, worker stats otherwise), sends the handler the
-    // matching thread-start event, and returns the manager as an opaque handle.
-    HANDLE CreateThreadContext(AR_THREAD type)
-    {
-        // Can we assume single threaded here?
-        static std::atomic<uint32_t> counter(0);
-        const uint32_t               threadId = counter.fetch_add(1);
-
-        EventManager* pManager = new EventManager();
-        if (pManager == nullptr)
-        {
-            SWR_INVALID("Failed to register thread.");
-            return nullptr;
-        }
-
-        EventHandlerFile* pStatsHandler = nullptr;
-        if (type != AR_THREAD::API)
-        {
-            pStatsHandler = new EventHandlerWorkerStats(threadId);
-            pManager->Attach(pStatsHandler);
-            pStatsHandler->Handle(ThreadStartWorkerEvent());
-        }
-        else
-        {
-            pStatsHandler = new EventHandlerApiStats(threadId);
-            pManager->Attach(pStatsHandler);
-            pStatsHandler->Handle(ThreadStartApiEvent());
-        }
-
-        pStatsHandler->MarkHeader();
-
-        return pManager;
-    }
-
-    // Deletes the event manager behind a handle returned by CreateThreadContext.
-    void DestroyThreadContext(HANDLE hThreadContext)
-    {
-        EventManager* pEventManager = reinterpret_cast<EventManager*>(hThreadContext);
-        SWR_ASSERT(pEventManager != nullptr);
-
-        delete pEventManager;
-    }
-
-    // Forwards an event to this thread's event manager; disabled events are dropped.
-    void Dispatch(HANDLE hThreadContext, const Event& event)
-    {
-        if (!event.IsEnabled())
-        {
-            return;
-        }
-
-        EventManager* pEventManager = FromHandle(hThreadContext);
-        SWR_ASSERT(pEventManager != nullptr);
-        pEventManager->Dispatch(event);
-    }
-
-    // Asks this thread's event manager to flush whatever its handlers cached for drawId.
-    void FlushDraw(HANDLE hThreadContext, uint32_t drawId)
-    {
-        EventManager* pEventManager = reinterpret_cast<EventManager*>(hThreadContext);
-        SWR_ASSERT(pEventManager != nullptr);
-
-        pEventManager->FlushDraw(drawId);
-    }
-} // namespace ArchRast
diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h b/src/gallium/drivers/swr/rasterizer/archrast/archrast.h
deleted file mode 100644
index a247443f54b..00000000000
--- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file archrast.h
- *
- * @brief Definitions for archrast.
- *
- ******************************************************************************/
-#pragma once
-
-#include "common/os.h"
-#include "gen_ar_event.hpp"
-#include "eventmanager.h"
-
-namespace ArchRast
-{
-    // Kind of thread a context is created for. CreateThreadContext attaches the API
-    // stats handler for API and the worker stats handler for any other value.
-    enum class AR_THREAD
-    {
-        API    = 0,
-        WORKER = 1
-    };
-
-    // Creates an event manager with a stats handler for the given thread type and
-    // returns it as an opaque handle. Release it with DestroyThreadContext.
-    HANDLE CreateThreadContext(AR_THREAD type);
-    // Deletes the event manager behind a handle returned by CreateThreadContext.
-    void   DestroyThreadContext(HANDLE hThreadContext);
-
-    // Dispatch event for this thread.
-    // Events whose IsEnabled() returns false are not forwarded.
-    void Dispatch(HANDLE hThreadContext, const Event& event);
-
-    // Asks this thread's event manager to flush what its handlers cached for drawId.
-    void FlushDraw(HANDLE hThreadContext, uint32_t drawId);
-}; // namespace ArchRast
diff --git a/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h b/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h
deleted file mode 100644
index 118a100e850..00000000000
--- a/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file eventmanager.h
- *
- * @brief Definitions for the event manager.
- *
- ******************************************************************************/
-#pragma once
-
-#include "common/os.h"
-
-#include "gen_ar_event.hpp"
-#include "gen_ar_eventhandler.hpp"
-
-#include <vector>
-
-namespace ArchRast
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// EventManager - interface to dispatch events to handlers.
-    /// Event handling occurs only on a single thread.
-    /// The manager owns every attached handler and deletes it on destruction,
-    /// so it is non-copyable: a copy would delete the same handlers twice.
-    //////////////////////////////////////////////////////////////////////////
-    class EventManager
-    {
-    public:
-        EventManager() {}
-
-        // Owning raw pointers: forbid copies to rule out a double delete.
-        EventManager(const EventManager&) = delete;
-        EventManager& operator=(const EventManager&) = delete;
-
-        ~EventManager()
-        {
-            // Event manager owns destroying handler objects once attached.
-            ///@note See comment for Detach.
-            for (auto pHandler : mHandlers)
-            {
-                delete pHandler;
-            }
-        }
-
-        /// Registers a handler and takes ownership of it; handlers receive events in the
-        /// order they were attached.
-        void Attach(EventHandler* pHandler)
-        {
-            SWR_ASSERT(pHandler != nullptr);
-            mHandlers.push_back(pHandler);
-        }
-
-        /// Passes the event to every attached handler, in attachment order.
-        void Dispatch(const Event& event)
-        {
-            ///@todo Add event filter check here.
-
-            for (auto pHandler : mHandlers)
-            {
-                event.Accept(pHandler);
-            }
-        }
-
-        /// Calls FlushDraw(drawId) on every attached handler, in attachment order.
-        void FlushDraw(uint32_t drawId)
-        {
-            for (auto pHandler : mHandlers)
-            {
-                pHandler->FlushDraw(drawId);
-            }
-        }
-
-    private:
-        // Handlers stay registered for life
-        void Detach(EventHandler* pHandler) { SWR_INVALID("Should not be called"); }
-
-        // Attached handlers; owned by this manager and deleted in the destructor.
-        std::vector<EventHandler*> mHandlers;
-    };
-}; // namespace ArchRast
diff --git a/src/gallium/drivers/swr/rasterizer/archrast/events.proto b/src/gallium/drivers/swr/rasterizer/archrast/events.proto
deleted file mode 100644
index 24739293a30..00000000000
--- a/src/gallium/drivers/swr/rasterizer/archrast/events.proto
+++ /dev/null
@@ -1,427 +0,0 @@
-# Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-#
-# Provides definitions for events.
-
-enum AR_DRAW_TYPE
-{
-    Instanced = 0,
-    IndexedInstanced = 1,
-    InstancedSplit = 2,
-    IndexedInstancedSplit = 3
-};
-
-event Framework::ThreadStartApiEvent
-{
-};
-
-event Framework::ThreadStartWorkerEvent
-{
-};
-
-///@brief Used as a helper event to indicate end of frame. Does not guarantee to capture end of frame on all APIs
-event ApiSwr::FrameEndEvent
-{
-    uint32_t frameId;       // current frame id
-    uint32_t nextDrawId;    // next draw id (always incremental - does not reset)
-};
-
-///@brief Synchronization event.
-event ApiSwr::SwrSyncEvent
-{
-    uint32_t drawId;
-};
-
-///@brief Invalidate hot tiles (i.e. tile cache)
-event ApiSwr::SwrInvalidateTilesEvent
-{
-    uint32_t drawId;
-};
-
-///@brief Invalidate and discard hot tiles within pixel region
-event ApiSwr::SwrDiscardRectEvent
-{
-    uint32_t drawId;
-};
-
-///@brief Flush tiles out to memory that is typically owned by driver (e.g. Flush RT cache)
-event ApiSwr::SwrStoreTilesEvent
-{
-    uint32_t drawId;
-};
-
-event PipelineStats::DrawInfoEvent
-{
-    uint32_t drawId;
-    AR_DRAW_TYPE type;  // type of draw (indexed, instanced, etc)
-    uint32_t topology;  // topology of draw
-    uint32_t numVertices; // number of vertices for draw
-    uint32_t numIndices; // number of indices for draw
-    int32_t  indexOffset; // offset into index buffer
-    int32_t  baseVertex; // which vertex to start with
-    uint32_t numInstances; // number of instances to draw
-    uint32_t startInstance; // which instance to start fetching
-    uint32_t tsEnable; // tessellation enabled
-    uint32_t gsEnable; // geometry shader enabled
-    uint32_t soEnable; // stream-out enabled
-    uint32_t soTopology; // topology of stream-out
-    uint32_t splitId; // split draw count or id
-};
-
-event PipelineStats::DispatchEvent
-{
-    uint32_t drawId;
-    uint32_t threadGroupCountX; // num thread groups in X dimension
-    uint32_t threadGroupCountY; // num thread groups in Y dimension
-    uint32_t threadGroupCountZ; // num thread groups in Z dimension
-};
-
-event PipelineStats::FrontendStatsEvent
-{
-    uint32_t drawId;
-    uint64_t IaVertices;
-    uint64_t IaPrimitives;
-    uint64_t VsInvocations;
-    uint64_t HsInvocations;
-    uint64_t DsInvocations;
-    uint64_t GsInvocations;
-    uint64_t GsPrimitives;
-    uint64_t CInvocations;
-    uint64_t CPrimitives;
-    uint64_t SoPrimStorageNeeded0;
-    uint64_t SoPrimStorageNeeded1;
-    uint64_t SoPrimStorageNeeded2;
-    uint64_t SoPrimStorageNeeded3;
-    uint64_t SoNumPrimsWritten0;
-    uint64_t SoNumPrimsWritten1;
-    uint64_t SoNumPrimsWritten2;
-    uint64_t SoNumPrimsWritten3;
-};
-
-event PipelineStats::BackendStatsEvent
-{
-    uint32_t drawId;
-    uint64_t DepthPassCount;
-    uint64_t PsInvocations;
-    uint64_t CsInvocations;
-
-};
-
-event PipelineStats::EarlyZSingleSample
-{
-    uint32_t drawId;
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::LateZSingleSample
-{
-    uint32_t drawId;
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::EarlyStencilSingleSample
-{
-    uint32_t drawId; 
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::LateStencilSingleSample
-{
-    uint32_t drawId; 
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::EarlyZSampleRate
-{
-    uint32_t drawId;
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::LateZSampleRate
-{
-    uint32_t drawId;
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::EarlyStencilSampleRate
-{
-    uint32_t drawId; 
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::LateStencilSampleRate
-{
-    uint32_t drawId; 
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-// Total Early-Z counts, SingleSample and SampleRate
-event PipelineStats::EarlyZ
-{
-    uint32_t drawId;
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-// Total LateZ counts, SingleSample and SampleRate
-event PipelineStats::LateZ
-{
-    uint32_t drawId;
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-// Total EarlyStencil counts, SingleSample and SampleRate
-event PipelineStats::EarlyStencil
-{
-    uint32_t drawId; 
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-// Total LateStencil counts, SingleSample and SampleRate
-event PipelineStats::LateStencil
-{
-    uint32_t drawId; 
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::EarlyZNullPS
-{
-    uint32_t drawId;
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::EarlyStencilNullPS
-{
-    uint32_t drawId; 
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::EarlyZPixelRate
-{
-    uint32_t drawId;
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::LateZPixelRate
-{
-    uint32_t drawId;
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-
-event PipelineStats::EarlyOmZ
-{
-    uint32_t drawId;
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::EarlyOmStencil
-{
-    uint32_t drawId;
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::LateOmZ
-{
-    uint32_t drawId;
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::LateOmStencil
-{
-    uint32_t drawId;
-    uint64_t passCount;
-    uint64_t failCount;
-};
-
-event PipelineStats::GSInputPrims
-{
-    uint32_t drawId;
-    uint64_t inputPrimCount;
-};
-
-event PipelineStats::GSPrimsGen
-{
-    uint32_t drawId;
-    uint64_t primGeneratedCount;
-};
-
-event PipelineStats::GSVertsInput
-{
-    uint32_t drawId;
-    uint64_t vertsInput;
-};
-
-event PipelineStats::TessPrims
-{
-    uint32_t drawId;
-    uint64_t primCount;
-};
-
-event PipelineStats::RasterTiles
-{
-    uint32_t drawId;
-    uint32_t rastTileCount;
-};
-
-event PipelineStats::ClipperEvent
-{
-    uint32_t drawId;
-    uint32_t trivialRejectCount;
-    uint32_t trivialAcceptCount;
-    uint32_t mustClipCount;
-};
-
-event PipelineStats::CullEvent
-{
-    uint32_t drawId;
-    uint64_t backfacePrimCount;
-    uint64_t degeneratePrimCount;
-};
-
-event PipelineStats::AlphaEvent
-{
-    uint32_t drawId;
-    uint32_t alphaTestCount;
-    uint32_t alphaBlendCount;
-};
-
-event ShaderStats::VSInfo
-{
-    uint32_t drawId;
-    uint32_t numInstExecuted;
-    uint32_t numSampleExecuted;
-    uint32_t numSampleLExecuted;
-    uint32_t numSampleBExecuted;
-    uint32_t numSampleCExecuted;
-    uint32_t numSampleCLZExecuted;
-    uint32_t numSampleCDExecuted;
-    uint32_t numGather4Executed;
-    uint32_t numGather4CExecuted;
-    uint32_t numGather4CPOExecuted;
-    uint32_t numGather4CPOCExecuted;
-    uint32_t numLodExecuted;
-};
-
-event ShaderStats::HSInfo
-{
-    uint32_t drawId;
-    uint32_t numInstExecuted;
-    uint32_t numSampleExecuted;
-    uint32_t numSampleLExecuted;
-    uint32_t numSampleBExecuted;
-    uint32_t numSampleCExecuted;
-    uint32_t numSampleCLZExecuted;
-    uint32_t numSampleCDExecuted;
-    uint32_t numGather4Executed;
-    uint32_t numGather4CExecuted;
-    uint32_t numGather4CPOExecuted;
-    uint32_t numGather4CPOCExecuted;
-    uint32_t numLodExecuted;
-};
-
-event ShaderStats::DSInfo
-{
-    uint32_t drawId;
-    uint32_t numInstExecuted;
-    uint32_t numSampleExecuted;
-    uint32_t numSampleLExecuted;
-    uint32_t numSampleBExecuted;
-    uint32_t numSampleCExecuted;
-    uint32_t numSampleCLZExecuted;
-    uint32_t numSampleCDExecuted;
-    uint32_t numGather4Executed;
-    uint32_t numGather4CExecuted;
-    uint32_t numGather4CPOExecuted;
-    uint32_t numGather4CPOCExecuted;
-    uint32_t numLodExecuted;
-};
-
-event ShaderStats::GSInfo
-{
-    uint32_t drawId;
-    uint32_t numInstExecuted;
-    uint32_t numSampleExecuted;
-    uint32_t numSampleLExecuted;
-    uint32_t numSampleBExecuted;
-    uint32_t numSampleCExecuted;
-    uint32_t numSampleCLZExecuted;
-    uint32_t numSampleCDExecuted;
-    uint32_t numGather4Executed;
-    uint32_t numGather4CExecuted;
-    uint32_t numGather4CPOExecuted;
-    uint32_t numGather4CPOCExecuted;
-    uint32_t numLodExecuted;
-
-};
-
-event ShaderStats::PSInfo
-{
-    uint32_t drawId;
-    uint32_t numInstExecuted;
-    uint32_t numSampleExecuted;
-    uint32_t numSampleLExecuted;
-    uint32_t numSampleBExecuted;
-    uint32_t numSampleCExecuted;
-    uint32_t numSampleCLZExecuted;
-    uint32_t numSampleCDExecuted;
-    uint32_t numGather4Executed;
-    uint32_t numGather4CExecuted;
-    uint32_t numGather4CPOExecuted;
-    uint32_t numGather4CPOCExecuted;
-    uint32_t numLodExecuted;
-};
-
-event ShaderStats::CSInfo
-{
-    uint32_t drawId;
-    uint32_t numInstExecuted;
-    uint32_t numSampleExecuted;
-    uint32_t numSampleLExecuted;
-    uint32_t numSampleBExecuted;
-    uint32_t numSampleCExecuted;
-    uint32_t numSampleCLZExecuted;
-    uint32_t numSampleCDExecuted;
-    uint32_t numGather4Executed;
-    uint32_t numGather4CExecuted;
-    uint32_t numGather4CPOExecuted;
-    uint32_t numGather4CPOCExecuted;
-    uint32_t numLodExecuted;
-};
-
diff --git a/src/gallium/drivers/swr/rasterizer/archrast/events_private.proto b/src/gallium/drivers/swr/rasterizer/archrast/events_private.proto
deleted file mode 100644
index b57d5c4284f..00000000000
--- a/src/gallium/drivers/swr/rasterizer/archrast/events_private.proto
+++ /dev/null
@@ -1,212 +0,0 @@
-# Copyright (C) 2018 Intel Corporation.   All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-#
-# Provides definitions for private events that are used only inside Rasty to
-# pass information between Rasty and ArchRast. One goal for ArchRast is to keep
-# the Rasty code free of the calculations needed to compute per-draw
-# statistics and similar data.
-
-event PipelineStats::EarlyDepthStencilInfoSingleSample
-{
-    uint64_t depthPassMask;
-    uint64_t stencilPassMask;
-    uint64_t coverageMask;
-};
-
-event PipelineStats::EarlyDepthStencilInfoSampleRate
-{
-    uint64_t depthPassMask;
-    uint64_t stencilPassMask;
-    uint64_t coverageMask;
-};
-
-event PipelineStats::EarlyDepthStencilInfoNullPS
-{
-    uint64_t depthPassMask;
-    uint64_t stencilPassMask;
-    uint64_t coverageMask;
-};
-
-event PipelineStats::LateDepthStencilInfoSingleSample
-{
-    uint64_t depthPassMask;
-    uint64_t stencilPassMask;
-    uint64_t coverageMask;
-};
-
-event PipelineStats::LateDepthStencilInfoSampleRate
-{
-    uint64_t depthPassMask;
-    uint64_t stencilPassMask;
-    uint64_t coverageMask;
-};
-
-event PipelineStats::LateDepthStencilInfoNullPS
-{
-    uint64_t depthPassMask;
-    uint64_t stencilPassMask;
-    uint64_t coverageMask;
-};
-
-event PipelineStats::EarlyDepthInfoPixelRate
-{
-    uint64_t depthPassCount;
-    uint64_t activeLanes;
-};
-
-
-event PipelineStats::LateDepthInfoPixelRate
-{
-    uint64_t depthPassCount;
-    uint64_t activeLanes;
-};
-
-
-event PipelineStats::BackendDrawEndEvent
-{
-    uint32_t drawId;
-};
-
-event PipelineStats::FrontendDrawEndEvent
-{
-    uint32_t drawId;
-};
-
-event Memory::MemoryAccessEvent
-{
-    uint32_t drawId;
-    uint64_t tsc;
-    uint64_t ptr;
-    uint32_t size;
-    uint8_t isRead;
-    uint8_t client;
-};
-
-event Memory::MemoryStatsEndEvent
-{
-    uint32_t drawId;
-};
-
-event PipelineStats::TessPrimCount
-{
-    uint64_t primCount;
-};
-
-event PipelineStats::RasterTileCount
-{
-    uint32_t drawId;
-    uint64_t rasterTiles;
-};
-
-event PipelineStats::GSPrimInfo
-{
-    uint64_t inputPrimCount;
-    uint64_t primGeneratedCount;
-    uint64_t vertsInput;
-};
-
-// validMask marks the primitives that are still candidates for clipping, i.e. they were not removed by trivial reject or NaN.
-// clipMask marks the primitives that actually need to be clipped, so a trivially accepted primitive has its clipMask bit 0 and its validMask bit 1.
-// Trivial reject count is numInvocations - pop_cnt32(validMask)
-// Trivial accept mask is validMask & ~clipMask
-// Must clip count is pop_cnt32(clipMask)
-event PipelineStats::ClipInfoEvent
-{
-    uint32_t numInvocations;
-    uint32_t validMask;
-    uint32_t clipMask;
-};
-
-event PipelineStats::CullInfoEvent
-{
-    uint32_t drawId;
-    uint64_t degeneratePrimMask;
-    uint64_t backfacePrimMask;
-    uint32_t validMask;
-};
-
-event PipelineStats::AlphaInfoEvent
-{
-    uint32_t drawId;
-    uint32_t alphaTestEnable;
-    uint32_t alphaBlendEnable;
-};
-
-event PipelineStats::DrawInstancedEvent
-{
-    uint32_t drawId;
-    uint32_t topology;
-    uint32_t numVertices;
-    int32_t  startVertex;
-    uint32_t numInstances;
-    uint32_t startInstance;
-    uint32_t tsEnable;
-    uint32_t gsEnable;
-    uint32_t soEnable;
-    uint32_t soTopology;
-    uint32_t splitId; // Split draw count or id.
-};
-
-event PipelineStats::DrawIndexedInstancedEvent
-{
-    uint32_t drawId;
-    uint32_t topology;
-    uint32_t numIndices;
-    int32_t  indexOffset;
-    int32_t  baseVertex;
-    uint32_t numInstances;
-    uint32_t startInstance;
-    uint32_t tsEnable;
-    uint32_t gsEnable;
-    uint32_t soEnable;
-    uint32_t soTopology;
-    uint32_t splitId; // Split draw count or id.
-};
-
-event ShaderStats::VSStats
-{
-    HANDLE hStats;      // SWR_SHADER_STATS
-};
-
-event ShaderStats::HSStats
-{
-    HANDLE hStats;      // SWR_SHADER_STATS
-};
-
-event ShaderStats::DSStats
-{
-    HANDLE hStats;      // SWR_SHADER_STATS
-};
-
-event ShaderStats::GSStats
-{
-    HANDLE hStats;      // SWR_SHADER_STATS
-};
-
-event ShaderStats::PSStats
-{
-    HANDLE hStats;      // SWR_SHADER_STATS
-};
-
-event ShaderStats::CSStats
-{
-    HANDLE hStats;      // SWR_SHADER_STATS
-};
-\ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py
deleted file mode 100644
index a4be675a34c..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py
+++ /dev/null
@@ -1,327 +0,0 @@
-# Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-
-# Python source
-import os
-import sys
-import re
-from gen_common import *
-
-def parse_event_fields(lines, idx, event_dict):
-    """
-        Parses lines from a proto file that contain an event definition and stores it in event_dict
-    """
-    fields = []
-    end_of_event = False
-
-    # record all fields in event definition.
-    # note: we don't check if there's a leading brace.
-    while not end_of_event and idx < len(lines):
-        line = lines[idx].rstrip()
-        idx += 1
-
-        # ex 1: uint32_t    numSampleCLZExecuted; // number of sample_cl_z instructions executed
-        # ex 2: char        reason[256]; // size of reason
-        match = re.match(r'^(\s*)([\w\*]+)(\s+)([\w]+)(\[\d+\])*;\s*(\/\/.*)*$', line)
-        # group 1 -
-        # group 2 type
-        # group 3 -
-        # group 4 name
-        # group 5 [array size]
-        # group 6 //comment
-
-        if match:
-            field = {
-                "type": match.group(2),
-                "name": match.group(4),
-                "size": int(match.group(5)[1:-1]) if match.group(5) else 1,
-                "desc": match.group(6)[2:].strip() if match.group(6) else "",
-            }
-            fields.append(field)
-
-        end_of_event = re.match(r'(\s*)};', line)
-
-    event_dict['fields'] = fields
-    event_dict['num_fields'] = len(fields)
-
-    return idx
-
-def parse_enums(lines, idx, event_dict):
-    """
-        Parses lines from a proto file that contain an enum definition and stores it in event_dict
-    """
-    enum_names = []
-    end_of_enum = False
-
-    # record all enum values in enumeration
-    # note: we don't check if there's a leading brace.
-    while not end_of_enum and idx < len(lines):
-        line = lines[idx].rstrip()
-        idx += 1
-
-        preprocessor = re.search(r'#if|#endif', line)
-
-        if not preprocessor:
-            enum = re.match(r'(\s*)(\w+)(\s*)', line)
-
-            if enum:
-                enum_names.append(line)
-
-            end_of_enum = re.match(r'(\s*)};', line)
-
-    event_dict['names'] = enum_names
-    return idx
-
-def parse_protos(files, verbose=False):
-    """
-        Parses a proto file and returns a dictionary of event definitions
-    """
-
-    # Protos structure:
-    #
-    # {
-    #   "events": {
-    #     "defs": {     // dict of event definitions where keys are 'group_name::event_name"
-    #       ...,
-    #       "ApiStat::DrawInfoEvent": {
-    #         "id": 3,
-    #         "group": "ApiStat",
-    #         "name": "DrawInfoEvent",  // name of event without 'group_name::' prefix
-    #         "desc": "",
-    #         "fields": [
-    #           {
-    #             "type": "uint32_t",
-    #             "name": "drawId",
-    #             "size": 1,
-    #             "desc": "",
-    #           },
-    #           ...
-    #         ]
-    #       },
-    #       ...
-    #     },
-    #     "groups": {   // dict of groups with lists of event keys
-    #       "ApiStat": [
-    #         "ApiStat::DispatchEvent",
-    #         "ApiStat::DrawInfoEvent",
-    #         ...
-    #       ],
-    #       "Framework": [
-    #         "Framework::ThreadStartApiEvent",
-    #         "Framework::ThreadStartWorkerEvent",
-    #         ...
-    #       ],
-    #       ...
-    #     },
-    #     "map": {  // map of event ids to match archrast output to event key
-    #       "1": "Framework::ThreadStartApiEvent",
-    #       "2": "Framework::ThreadStartWorkerEvent",
-    #       "3": "ApiStat::DrawInfoEvent",
-    #       ...
-    #     }
-    #   },
-    #   "enums": { ... }    // enums follow similar defs, map (groups?) structure
-    # }
-
-    protos = {
-        'events': {
-            'defs': {},             # event dictionary containing events with their fields
-            'map': {},              # dictionary to map event ids to event names
-            'groups': {}            # event keys stored by groups
-        },
-        'enums': {
-            'defs': {},
-            'map': {}
-        }
-    }
-
-    event_id = 0
-    enum_id = 0
-
-    if type(files) is not list:
-        files = [files]
-
-    for filename in files:
-        if verbose:
-            print("Parsing proto file: %s" % os.path.normpath(filename))
-
-        with open(filename, 'r') as f:
-            lines = f.readlines()
-            in_brief = False
-            brief = []
-            idx = 0
-            while idx < len(lines):
-                line = lines[idx].strip()
-                idx += 1
-
-                # If currently processing a brief, keep processing or change state
-                if in_brief:
-                    match = re.match(r'^\s*\/\/\/\s*(.*)$', line)                   # i.e. "/// more event desc..."
-                    if match:
-                        brief.append(match.group(1).strip())
-                        continue
-                    else:
-                        in_brief = False
-
-                # Match event/enum brief
-                match = re.match(r'^\s*\/\/\/\s*@(brief|breif)\s*(.*)$', line)       # i.e. "///@brief My event desc..."
-                if match:
-                    in_brief = True
-                    brief.append(match.group(2).strip())
-                    continue
-
-                # Match event definition
-                match = re.match(r'event(\s*)(((\w*)::){0,1}(\w+))', line)          # i.e. "event SWTag::CounterEvent"
-                if match:
-                    event_id += 1
-
-                    # Parse event attributes
-                    event_key = match.group(2)                                      # i.e. SWTag::CounterEvent
-                    event_group = match.group(4) if match.group(4) else ""          # i.e. SWTag
-                    event_name = match.group(5)                                     # i.e. CounterEvent
-
-                    # Define event attributes
-                    event = {
-                        'id': event_id,
-                        'group': event_group,
-                        'name': event_name,
-                        'desc': ' '.join(brief)
-                    }
-                    # Add period at end of event desc if necessary
-                    if event["desc"] and event["desc"][-1] != '.':
-                        event["desc"] += '.'
-
-                    # Reset brief
-                    brief = []
-
-                    # Now add event fields
-                    idx = parse_event_fields(lines, idx, event)
-
-                    # Register event and mapping
-                    protos['events']['defs'][event_key] = event
-                    protos['events']['map'][event_id] = event_key
-
-                    continue
-
-                # Match enum definition
-                match = re.match(r'enum(\s*)(\w+)', line)
-                if match:
-                    enum_id += 1
-
-                    # Parse enum attributes
-                    enum_name = match.group(2)
-
-                    # Define enum attr
-                    enum = {
-                        'name': enum_name,
-                        'desc': ' '.join(brief)
-                    }
-                    # Add period at end of event desc if necessary
-                    if enum["desc"] and enum["desc"][-1] != '.':
-                        enum["desc"] += '.'
-
-                    # Reset brief
-                    brief = []
-
-                    # Now add enum fields
-                    idx = parse_enums(lines, idx, enum)
-
-                    # Register enum and mapping
-                    protos['enums']['defs'][enum_name] = enum
-                    protos['enums']['map'][enum_id] = enum_name
-
-                    continue
-
-    # Sort and group events
-    event_groups = protos['events']['groups']
-    for key in sorted(protos['events']['defs']):
-        group = protos['events']['defs'][key]['group']
-        if group not in event_groups:
-            event_groups[group] = []
-        event_groups[group].append(key)
-
-    return protos
-
-
-def main():
-
-    # Parse args...
-    parser = ArgumentParser()
-    parser.add_argument("--proto", "-p", dest="protos", nargs='+', help="Path to all proto file(s) to process. Accepts one or more paths (i.e. events.proto and events_private.proto)", required=True)
-    parser.add_argument("--output-dir", help="Output dir (defaults to ./codegen). Will create folder if it does not exist.", required=False, default="codegen")
-    parser.add_argument("--verbose", "-v", help="Verbose", action="store_true")
-    args = parser.parse_args()
-
-    if not os.path.exists(args.output_dir):
-        MakeDir(args.output_dir)
-
-    for f in args.protos:
-        if not os.path.exists(f):
-            print('Error: Could not find proto file %s' % f, file=sys.stderr)
-            return 1
-
-    # Parse each proto file and add to protos container
-    protos = parse_protos(args.protos, args.verbose)
-
-    files = [
-        ["gen_ar_event.hpp", ""],
-        ["gen_ar_event.cpp", ""],
-        ["gen_ar_eventhandler.hpp", "gen_ar_event.hpp"],
-        ["gen_ar_eventhandlerfile.hpp", "gen_ar_eventhandler.hpp"]
-    ]
-
-    rval = 0
-
-    try:
-        # Delete existing files
-        for f in files:
-            filename = f[0]
-            output_fullpath = os.path.join(args.output_dir, filename)
-            if os.path.exists(output_fullpath):
-                if args.verbose:
-                    print("Deleting existing file: %s" % output_fullpath)
-                os.remove(output_fullpath)
-
-        # Generate files from templates
-        print("Generating c++ from proto files...")
-        for f in files:
-            filename = f[0]
-            event_header = f[1]
-            curdir = os.path.dirname(os.path.abspath(__file__))
-            template_file = os.path.join(curdir, 'templates', filename)
-            output_fullpath = os.path.join(args.output_dir, filename)
-
-            if args.verbose:
-                print("Generating: %s" % output_fullpath)
-            MakoTemplateWriter.to_file(template_file, output_fullpath,
-                    cmdline=sys.argv,
-                    filename=filename,
-                    protos=protos,
-                    event_header=event_header)
-
-    except Exception as e:
-        print(e)
-        rval = 1
-
-    return rval
-
-# Script entry point: run the generator and use main()'s return value as the
-# process exit status.
-if __name__ == '__main__':
-    sys.exit(main())
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
deleted file mode 100644
index eb51a3a8a13..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Copyright (C) 2017-2018 Intel Corporation.   All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the 'Software'),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-
-# Python source
-
-import itertools
-import os
-import sys
-from gen_common import *
-
-
-def main(args=sys.argv[1:]):
-    thisDir = os.path.dirname(os.path.realpath(__file__))
-    parser = ArgumentParser('Generate files and initialization functions for all permutations of BackendPixelRate.')
-    parser.add_argument('--dim', help='gBackendPixelRateTable array dimensions', nargs='+', type=int, required=True)
-    parser.add_argument('--outdir', help='output directory', nargs='?', type=str, default=thisDir)
-    parser.add_argument('--split', help='how many lines of initialization per file [0=no split]', nargs='?', type=int, default='512')
-    parser.add_argument('--numfiles', help='how many output files to generate', nargs='?', type=int, default='0')
-    parser.add_argument('--cpp', help='Generate cpp file(s)', action='store_true', default=False)
-    parser.add_argument('--hpp', help='Generate hpp file', action='store_true', default=False)
-    parser.add_argument('--cmake', help='Generate cmake file', action='store_true', default=False)
-    parser.add_argument('--rast', help='Generate rasterizer functions instead of normal backend', action='store_true', default=False)
-
-    args = parser.parse_args(args)
-
-
-    class backendStrs :
-        def __init__(self) :
-            self.outFileName = 'gen_BackendPixelRate%s.cpp'
-            self.outHeaderName = 'gen_BackendPixelRate.hpp'
-            self.functionTableName = 'gBackendPixelRateTable'
-            self.funcInstanceHeader = ' = BackendPixelRate<SwrBackendTraits<'
-            self.template = 'gen_backend.cpp'
-            self.hpp_template = 'gen_header_init.hpp'
-            self.cmakeFileName = 'gen_backends.cmake'
-            self.cmakeSrcVar = 'GEN_BACKEND_SOURCES'
-            self.tableName = 'BackendPixelRate'
-
-            if args.rast:
-                self.outFileName = 'gen_rasterizer%s.cpp'
-                self.outHeaderName = 'gen_rasterizer.hpp'
-                self.functionTableName = 'gRasterizerFuncs'
-                self.funcInstanceHeader = ' = RasterizeTriangle<RasterizerTraits<'
-                self.template = 'gen_rasterizer.cpp'
-                self.cmakeFileName = 'gen_rasterizer.cmake'
-                self.cmakeSrcVar = 'GEN_RASTERIZER_SOURCES'
-                self.tableName = 'RasterizerFuncs'
-
-
-    backend = backendStrs()
-
-    output_list = []
-    for x in args.dim:
-        output_list.append(list(range(x)))
-
-    # generate all permutations possible for template parameter inputs
-    output_combinations = list(itertools.product(*output_list))
-    output_list = []
-
-    # for each permutation
-    for x in range(len(output_combinations)):
-        # separate each template peram into its own list member
-        new_list = [output_combinations[x][i] for i in range(len(output_combinations[x]))]
-        tempStr = backend.functionTableName
-        #print each list member as an index in the multidimensional array
-        for i in new_list:
-            tempStr += '[' + str(i) + ']'
-        #map each entry in the permutation as its own string member, store as the template instantiation string
-        tempStr += backend.funcInstanceHeader + ','.join(map(str, output_combinations[x])) + '>>;'
-        #append the line of c++ code in the list of output lines
-        output_list.append(tempStr)
-
-    # how many files should we split the global template initialization into?
-    if (args.split == 0):
-        numFiles = 1
-    else:
-        numFiles = (len(output_list) + args.split - 1) // args.split
-    if (args.numfiles != 0):
-        numFiles = args.numfiles
-    linesPerFile = (len(output_list) + numFiles - 1) // numFiles
-    chunkedList = [output_list[x:x+linesPerFile] for x in range(0, len(output_list), linesPerFile)]
-
-    tmp_output_dir = MakeTmpDir('_codegen')
-
-    if not os.path.exists(args.outdir):
-        try:
-            os.makedirs(args.outdir)
-        except OSError as err:
-            if err.errno != errno.EEXIST:
-                print('ERROR: Could not create directory:', args.outdir, file=sys.stderr)
-                return 1
-
-    rval = 0
-
-    # generate .cpp files
-    try:
-        if args.cpp:
-            baseCppName = os.path.join(tmp_output_dir, backend.outFileName)
-            templateCpp = os.path.join(thisDir, 'templates', backend.template)
-
-            for fileNum in range(numFiles):
-                filename = baseCppName % str(fileNum)
-                MakoTemplateWriter.to_file(
-                    templateCpp,
-                    baseCppName % str(fileNum),
-                    cmdline=sys.argv,
-                    fileNum=fileNum,
-                    funcList=chunkedList[fileNum])
-
-        if args.hpp:
-            baseHppName = os.path.join(tmp_output_dir, backend.outHeaderName)
-            templateHpp = os.path.join(thisDir, 'templates', backend.hpp_template)
-
-            MakoTemplateWriter.to_file(
-                templateHpp,
-                baseHppName,
-                cmdline=sys.argv,
-                numFiles=numFiles,
-                filename=backend.outHeaderName,
-                tableName=backend.tableName)
-
-        # generate gen_backend.cmake file
-        if args.cmake:
-            templateCmake = os.path.join(thisDir, 'templates', 'gen_backend.cmake')
-            cmakeFile = os.path.join(tmp_output_dir, backend.cmakeFileName)
-
-            MakoTemplateWriter.to_file(
-                templateCmake,
-                cmakeFile,
-                cmdline=sys.argv,
-                srcVar=backend.cmakeSrcVar,
-                numFiles=numFiles,
-                baseCppName='${RASTY_GEN_SRC_DIR}/backends/' + os.path.basename(baseCppName))
-
-        rval = CopyDirFilesIfDifferent(tmp_output_dir, args.outdir)
-
-    except:
-        rval = 1
-
-    finally:
-        DeleteDirTree(tmp_output_dir)
-
-    return rval
-
-if __name__ == '__main__':
-    sys.exit(main())
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
deleted file mode 100644
index c1d08fb83bc..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
+++ /dev/null
@@ -1,291 +0,0 @@
-# Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-
-# Python source
-import os
-import errno
-import sys
-import argparse
-import tempfile
-import filecmp
-import shutil
-from mako.template import Template
-from mako.exceptions import RichTraceback
-
-#==============================================================================
-def ConcatLists(list_of_lists):
-    output = []
-    for l in list_of_lists: output += l
-    return output
-
-#==============================================================================
-def MakeTmpDir(suffix=''):
-    '''
-        Create temporary directory for use in codegen scripts.
-    '''
-    return tempfile.mkdtemp(suffix)
-
-#==============================================================================
-def MakeDir(dir_path):
-    '''
-        Create a directory if it doesn't exist
-
-        returns 0 on success, non-zero on failure
-    '''
-    dir_path = os.path.abspath(dir_path)
-
-    if not os.path.exists(dir_path):
-        try:
-            os.makedirs(dir_path)
-        except OSError as err:
-            if err.errno != errno.EEXIST:
-                return 1
-    else:
-        if not os.path.isdir(dir_path):
-            return 1
-
-    return 0
-
-#==============================================================================
-def DeleteDirTree(dir_path):
-    '''
-        Delete directory tree.
-
-        returns 0 on success, non-zero on failure
-    '''
-    rval = 0
-    try:
-        shutil.rmtree(dir_path, False)
-    except:
-        rval = 1
-    return rval
-
-#==============================================================================
-def CopyFileIfDifferent(src, dst, verbose = False):
-    '''
-        Copy <src> file to <dst> file if the <dst>
-        file either doesn't contain the file or the file
-        contents are different.
-
-        returns 0 on success, non-zero on failure
-    '''
-
-    assert os.path.isfile(src)
-    assert (False == os.path.exists(dst) or os.path.isfile(dst))
-
-    need_copy = not os.path.exists(dst)
-    if not need_copy:
-        need_copy = not filecmp.cmp(src, dst)
-
-    if need_copy:
-        try:
-            shutil.copy2(src, dst)
-        except:
-            print('ERROR: Could not copy %s to %s' % (src, dst), file=sys.stderr)
-            return 1
-
-        if verbose:
-            print(src, '-->', dst)
-
-    return 0
-
-#==============================================================================
-def CopyDirFilesIfDifferent(src, dst, recurse = True, verbose = False, orig_dst = None):
-    '''
-        Copy files <src> directory to <dst> directory if the <dst>
-        directory either doesn't contain the file or the file
-        contents are different.
-
-        Optionally recurses into subdirectories
-
-        returns 0 on success, non-zero on failure
-    '''
-
-    assert os.path.isdir(src)
-    assert os.path.isdir(dst)
-
-    src = os.path.abspath(src)
-    dst = os.path.abspath(dst)
-
-    if not orig_dst:
-        orig_dst = dst
-
-    for f in os.listdir(src):
-        src_path = os.path.join(src, f)
-        dst_path = os.path.join(dst, f)
-
-        # prevent recursion
-        if src_path == orig_dst:
-            continue
-
-        if os.path.isdir(src_path):
-            if recurse:
-                if MakeDir(dst_path):
-                    print('ERROR: Could not create directory:', dst_path, file=sys.stderr)
-                    return 1
-
-                if verbose:
-                    print('mkdir', dst_path)
-                rval = CopyDirFilesIfDifferent(src_path, dst_path, recurse, verbose, orig_dst)
-        else:
-            rval = CopyFileIfDifferent(src_path, dst_path, verbose)
-
-        if rval:
-            return rval
-
-    return 0
-
-#==============================================================================
-class MakoTemplateWriter:
-    '''
-        MakoTemplateWriter - Class (namespace) for functions to generate strings
-        or files using the Mako template module.
-
-        See http://docs.makotemplates.org/en/latest/ for
-        mako documentation.
-   '''
-    
-    @staticmethod
-    def to_string(template_filename, **kwargs):
-        '''
-            Write template data to a string object and return the string
-        '''
-        from mako.template      import Template
-        from mako.exceptions    import RichTraceback
-
-        try:
-            template = Template(filename=template_filename)
-            # Split + Join fixes line-endings for whatever platform you are using
-            return '\n'.join(template.render(**kwargs).splitlines())
-        except:
-            traceback = RichTraceback()
-            for (filename, lineno, function, line) in traceback.traceback:
-                print('File %s, line %s, in %s' % (filename, lineno, function))
-                print(line, '\n')
-            print('%s: %s' % (str(traceback.error.__class__.__name__), traceback.error))
-            raise
-
-    @staticmethod
-    def to_file(template_filename, output_filename, **kwargs):
-        '''
-            Write template data to a file
-        '''
-        if MakeDir(os.path.dirname(output_filename)):
-            return 1
-        with open(output_filename, 'w') as outfile:
-            print(MakoTemplateWriter.to_string(template_filename, **kwargs), file=outfile)
-        return 0
-
-
-#==============================================================================
-class ArgumentParser(argparse.ArgumentParser):
-    '''
-    Subclass of argparse.ArgumentParser
-
-    Allow parsing from command files that start with @
-    Example:
-      >bt run @myargs.txt
-    
-    Contents of myargs.txt:
-      -m <machine>
-      --target cdv_win7
-    
-    The below function allows multiple args to be placed on the same text-file line.
-    The default is one token per line, which is a little cumbersome.
-    
-    Also allow all characters after a '#' character to be ignored.
-    '''
-    
-    #==============================================================================
-    class _HelpFormatter(argparse.RawTextHelpFormatter):
-        ''' Better help formatter for argument parser '''
-
-        def _split_lines(self, text, width):
-            ''' optimized split lines algorithm, indents split lines '''
-            lines = text.splitlines()
-            out_lines = []
-            if len(lines):
-                out_lines.append(lines[0])
-                for line in lines[1:]:
-                    out_lines.append('  ' + line)
-            return out_lines
-
-    #==============================================================================
-    def __init__(self, *args, **kwargs):
-        ''' Constructor.  Compatible with argparse.ArgumentParser(),
-            but with some modifications for better usage and help display.
-        '''
-        super(ArgumentParser, self).__init__(
-                *args,
-                fromfile_prefix_chars='@',
-                formatter_class=ArgumentParser._HelpFormatter,
-                **kwargs)
-
-    #==========================================================================
-    def convert_arg_line_to_args(self, arg_line):
-        ''' convert one line of parsed file to arguments '''
-        arg_line = arg_line.split('#', 1)[0]
-        if sys.platform == 'win32':
-            arg_line = arg_line.replace('\\', '\\\\')
-        for arg in shlex.split(arg_line):
-            if not arg.strip():
-                continue
-            yield arg
-
-    #==========================================================================
-    def _read_args_from_files(self, arg_strings):
-        ''' read arguments from files '''
-        # expand arguments referencing files
-        new_arg_strings = []
-        for arg_string in arg_strings:
-
-            # for regular arguments, just add them back into the list
-            if arg_string[0] not in self.fromfile_prefix_chars:
-                new_arg_strings.append(arg_string)
-
-            # replace arguments referencing files with the file content
-            else:
-                filename = arg_string[1:]
-
-                # Search in sys.path
-                if not os.path.exists(filename):
-                    for path in sys.path:
-                        filename = os.path.join(path, arg_string[1:])
-                        if os.path.exists(filename):
-                            break
-
-                try:
-                    args_file = open(filename)
-                    try:
-                        arg_strings = []
-                        for arg_line in args_file.read().splitlines():
-                            for arg in self.convert_arg_line_to_args(arg_line):
-                                arg_strings.append(arg)
-                        arg_strings = self._read_args_from_files(arg_strings)
-                        new_arg_strings.extend(arg_strings)
-                    finally:
-                        args_file.close()
-                except IOError:
-                    err = sys.exc_info()[1]
-                    self.error(str(err))
-
-        # return the modified argument list
-        return new_arg_strings
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py
deleted file mode 100644
index bd39ef645f7..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-
-# Python source
-import os
-import sys
-import knob_defs
-from gen_common import *
-
-def main(args=sys.argv[1:]):
-
-    # parse args
-    parser = ArgumentParser()
-    parser.add_argument("--output", "-o", help="Path to output file", required=True)
-    parser.add_argument("--gen_h", "-gen_h", help="Generate gen_knobs.h", action="store_true", default=False)
-    parser.add_argument("--gen_cpp", "-gen_cpp", help="Generate gen_knobs.cpp", action="store_true", required=False)
-
-    args = parser.parse_args()
-
-    cur_dir = os.path.dirname(os.path.abspath(__file__))
-    template_cpp = os.path.join(cur_dir, 'templates', 'gen_knobs.cpp')
-    template_h = os.path.join(cur_dir, 'templates', 'gen_knobs.h')
-
-    output_filename = os.path.basename(args.output)
-    output_dir = MakeTmpDir('_codegen')
-
-    output_file = os.path.join(output_dir, output_filename)
-
-    rval = 0
-
-    try:
-        if args.gen_h:
-            MakoTemplateWriter.to_file(
-                template_h,
-                output_file,
-                cmdline=sys.argv,
-                filename='gen_knobs',
-                knobs=knob_defs.KNOBS)
-
-        if args.gen_cpp:
-            MakoTemplateWriter.to_file(
-                template_cpp,
-                output_file,
-                cmdline=sys.argv,
-                filename='gen_knobs',
-                knobs=knob_defs.KNOBS,
-                includes=['core/knobs_init.h', 'common/os.h', 'sstream', 'iomanip'])
-
-        rval = CopyFileIfDifferent(output_file, args.output)
-
-    except:
-        rval = 1
-
-    finally:
-        # ignore errors from delete of tmp directory
-        DeleteDirTree(output_dir)
-
-    return 0
-
-if __name__ == '__main__':
-    sys.exit(main())
-
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
deleted file mode 100644
index f3ab7120a43..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ /dev/null
@@ -1,362 +0,0 @@
-# Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-
-import os, sys, re
-from gen_common import *
-from argparse import FileType
-
-inst_aliases = {
-    'SHUFFLE_VECTOR': 'VSHUFFLE',
-    'INSERT_ELEMENT': 'VINSERT',
-    'EXTRACT_ELEMENT': 'VEXTRACT',
-    'MEM_SET': 'MEMSET',
-    'MEM_CPY': 'MEMCOPY',
-    'MEM_MOVE': 'MEMMOVE',
-    'L_SHR': 'LSHR',
-    'A_SHR': 'ASHR',
-    'BIT_CAST': 'BITCAST',
-    'U_DIV': 'UDIV',
-    'S_DIV': 'SDIV',
-    'U_REM': 'UREM',
-    'S_REM': 'SREM',
-    'BIN_OP': 'BINOP',
-}
-
-intrinsics = [
-    ['VGATHERPD',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
-    ['VGATHERPS',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
-    ['VGATHERDD',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
-    ['VSCATTERPS',  ['pBase', 'mask', 'indices', 'src', 'scale'], 'src'],
-    ['VRCPPS',      ['a'], 'a'],
-    ['VROUND',      ['a', 'rounding'], 'a'],
-    ['BEXTR_32',    ['src', 'control'], 'src'],
-    ['VPSHUFB',     ['a', 'b'], 'a'],
-    ['VPERMD',      ['a', 'idx'], 'a'],
-    ['VPERMPS',     ['idx', 'a'], 'a'],
-    ['VCVTPD2PS',   ['a'], 'getVectorType(mFP32Ty, VEC_GET_NUM_ELEMS)'],
-    ['VCVTPS2PH',   ['a', 'round'], 'mSimdInt16Ty'],
-    ['VHSUBPS',     ['a', 'b'], 'a'],
-    ['VPTESTC',     ['a', 'b'], 'mInt32Ty'],
-    ['VPTESTZ',     ['a', 'b'], 'mInt32Ty'],
-    ['VPHADDD',     ['a', 'b'], 'a'],
-    ['PDEP32',      ['a', 'b'], 'a'],
-    ['RDTSC',       [], 'mInt64Ty'],
-]
-
-llvm_intrinsics = [
-    ['CTTZ', 'cttz', ['a', 'flag'], ['a']],
-    ['CTLZ', 'ctlz', ['a', 'flag'], ['a']],
-    ['VSQRTPS', 'sqrt', ['a'], ['a']],
-    ['STACKSAVE', 'stacksave', [], []],
-    ['STACKRESTORE', 'stackrestore', ['a'], []],
-    ['VMINPS', 'minnum', ['a', 'b'], ['a']],
-    ['VMAXPS', 'maxnum', ['a', 'b'], ['a']],
-    ['VFMADDPS', 'fmuladd', ['a', 'b', 'c'], ['a']],
-    ['DEBUGTRAP', 'debugtrap', [], []],
-    ['POPCNT', 'ctpop', ['a'], ['a']],
-    ['LOG2', 'log2', ['a'], ['a']],
-    ['FABS', 'fabs', ['a'], ['a']],
-    ['EXP2', 'exp2', ['a'], ['a']],
-    ['COS', 'cos', ['a'], ['a']],
-    ['SIN', 'sin', ['a'], ['a']],
-    ['FLOOR', 'floor', ['a'], ['a']],
-    ['POW', 'pow', ['a', 'b'], ['a']]
-]
-
-this_dir = os.path.dirname(os.path.abspath(__file__))
-template = os.path.join(this_dir, 'templates', 'gen_builder.hpp')
-
-def convert_uppercamel(name):
-    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
-    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).upper()
-
-'''
-    Given an input file (e.g. IRBuilder.h) generates function dictionary.
-'''
-def parse_ir_builder(input_file):
-
-    functions = []
-
-    lines = input_file.readlines()
-    deprecated = None
-
-    idx = 0
-    while idx < len(lines) - 1:
-        line = lines[idx].rstrip()
-        idx += 1
-
-        if deprecated is None:
-            deprecated = re.search(r'LLVM_ATTRIBUTE_DEPRECATED', line)
-
-        #match = re.search(r'\*Create', line)
-        match = re.search(r'[\*\s]Create(\w*)\(', line)
-        if match is not None:
-            #print('Line: %s' % match.group(1))
-
-            # Skip function if LLVM_ATTRIBUTE_DEPRECATED found before
-            if deprecated is not None:
-                deprecated = None
-                continue
-
-            if re.search(r'^\s*Create', line) is not None:
-                func_sig = lines[idx-2].rstrip() + line
-            else:
-                func_sig = line
-
-            end_of_args = False
-            while not end_of_args:
-                end_paren = re.search(r'\)', line)
-                if end_paren is not None:
-                    end_of_args = True
-                else:
-                    line = lines[idx].rstrip()
-                    func_sig += line
-                    idx += 1
-
-            delfunc = re.search(r'LLVM_DELETED_FUNCTION|= delete;', func_sig)
-
-            if not delfunc:
-                func = re.search(r'(.*?)\*[\n\s]*(Create\w*)\((.*?)\)', func_sig)
-                if func is not None:
-
-                    return_type = func.group(1).strip() + '*'
-                    func_name = func.group(2)
-                    arguments = func.group(3)
-
-                    func_args = []
-                    arg_names = []
-                    args = arguments.split(',')
-                    for arg in args:
-                        arg = arg.strip()
-                        if arg:
-                            func_args.append(arg)
-
-                            split_args = arg.split('=')
-                            arg_name = split_args[0].rsplit(None, 1)[-1]
-
-                            reg_arg = re.search(r'[\&\*]*(\w*)', arg_name)
-                            if reg_arg:
-                                arg_names += [reg_arg.group(1)]
-
-                    ignore = False
-
-                    # The following functions need to be ignored in openswr.
-                    # API change in llvm-5.0 breaks baked autogen files
-                    if (
-                        (func_name == 'CreateFence' or
-                         func_name == 'CreateAtomicCmpXchg' or
-                         func_name == 'CreateAtomicRMW')):
-                        ignore = True
-
-                    # The following functions need to be ignored.
-                    if (func_name == 'CreateInsertNUWNSWBinOp' or
-                        func_name == 'CreateMaskedIntrinsic' or
-                        func_name == 'CreateAlignmentAssumptionHelper' or
-                        func_name == 'CreateGEP' or
-                        func_name == 'CreateLoad' or
-                        func_name == 'CreateMaskedLoad' or
-                        func_name == 'CreateStore' or
-                        func_name == 'CreateMaskedStore' or
-                        func_name == 'CreateFCmpHelper' or
-                        func_name == 'CreateElementUnorderedAtomicMemCpy'):
-                        ignore = True
-
-                    # Convert CamelCase to CAMEL_CASE
-                    func_mod = re.search(r'Create(\w*)', func_name)
-                    if func_mod:
-                        func_mod = func_mod.group(1)
-                        func_mod = convert_uppercamel(func_mod)
-                        if func_mod[0:2] == 'F_' or func_mod[0:2] == 'I_':
-                            func_mod = func_mod[0] + func_mod[2:]
-
-                    # Substitute alias based on CAMEL_CASE name.
-                    func_alias = inst_aliases.get(func_mod)
-                    if not func_alias:
-                        func_alias = func_mod
-
-                        if func_name == 'CreateCall' or func_name == 'CreateGEP':
-                            arglist = re.search(r'ArrayRef', ', '.join(func_args))
-                            if arglist:
-                                func_alias = func_alias + 'A'
-
-                    if not ignore:
-                        functions.append({
-                                'name'      : func_name,
-                                'alias'     : func_alias,
-                                'return'    : return_type,
-                                'args'      : ', '.join(func_args),
-                                'arg_names' : arg_names,
-                            })
-
-    return functions
-
-'''
-    Auto-generates macros for LLVM IR
-'''
-def generate_gen_h(functions, output_dir):
-    filename = 'gen_builder.hpp'
-    output_filename = os.path.join(output_dir, filename)
-
-    templfuncs = []
-    for func in functions:
-        decl = '%s %s(%s)' % (func['return'], func['alias'], func['args'])
-
-        templfuncs.append({
-            'decl'      : decl,
-            'intrin'    : func['name'],
-            'args'      : func['arg_names'],
-        })
-
-    MakoTemplateWriter.to_file(
-        template,
-        output_filename,
-        cmdline=sys.argv,
-        comment='Builder IR Wrappers',
-        filename=filename,
-        functions=templfuncs,
-        isX86=False, isIntrin=False)
-
-'''
-    Auto-generates macros for LLVM IR
-'''
-def generate_meta_h(output_dir):
-    filename = 'gen_builder_meta.hpp'
-    output_filename = os.path.join(output_dir, filename)
-
-    functions = []
-    for inst in intrinsics:
-        name = inst[0]
-        args = inst[1]
-        ret = inst[2]
-
-        #print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], len(inst[2])))
-        if len(args) != 0:
-            declargs = 'Value* ' + ', Value* '.join(args)
-            decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (name, declargs)
-        else:
-            decl = 'Value* %s(const llvm::Twine& name = "")' % (name)
-
-        # determine the return type of the intrinsic. It can either be:
-        # - type of one of the input arguments
-        # - snippet of code to set the return type
-
-        if ret in args:
-            returnTy = ret + '->getType()'
-        else:
-            returnTy = ret
-
-        functions.append({
-            'decl'      : decl,
-            'name'      : name,
-            'args'      : args,
-            'returnType': returnTy
-        })
-
-    MakoTemplateWriter.to_file(
-        template,
-        output_filename,
-        cmdline=sys.argv,
-        comment='meta intrinsics',
-        filename=filename,
-        functions=functions,
-        isX86=True, isIntrin=False)
-
-def generate_intrin_h(output_dir):
-    filename = 'gen_builder_intrin.hpp'
-    output_filename = os.path.join(output_dir, filename)
-
-    functions = []
-    for inst in llvm_intrinsics:
-        #print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], len(inst[2])))
-        if len(inst[2]) != 0:
-            declargs = 'Value* ' + ', Value* '.join(inst[2])
-            decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (inst[0], declargs)
-        else:
-            decl = 'Value* %s(const llvm::Twine& name = "")' % (inst[0])
-
-        functions.append({
-            'decl'      : decl,
-            'intrin'    : inst[1],
-            'args'      : inst[2],
-            'types'     : inst[3],
-        })
-
-    MakoTemplateWriter.to_file(
-        template,
-        output_filename,
-        cmdline=sys.argv,
-        comment='llvm intrinsics',
-        filename=filename,
-        functions=functions,
-        isX86=False, isIntrin=True)
-'''
-    Function which is invoked when this script is started from a command line.
-    Will present and consume a set of arguments which will tell this script how
-    to behave
-'''
-def main():
-
-    # Parse args...
-    parser = ArgumentParser()
-    parser.add_argument('--input', '-i', type=FileType('r'), help='Path to IRBuilder.h', required=False)
-    parser.add_argument('--output-dir', '-o', action='store', dest='output', help='Path to output directory', required=True)
-    parser.add_argument('--gen_h', help='Generate builder_gen.h', action='store_true', default=False)
-    parser.add_argument('--gen_meta_h', help='Generate meta intrinsics. No input is needed.', action='store_true', default=False)
-    parser.add_argument('--gen_intrin_h', help='Generate llvm intrinsics. No input is needed.', action='store_true', default=False)
-    args = parser.parse_args()
-
-    if not os.path.exists(args.output):
-        os.makedirs(args.output)
-
-    final_output_dir = args.output
-    args.output = MakeTmpDir('_codegen')
-
-    rval = 0
-    try:
-        if args.input:
-            functions = parse_ir_builder(args.input)
-
-            if args.gen_h:
-                generate_gen_h(functions, args.output)
-
-        elif args.gen_h:
-            print('Need to specify --input for --gen_h!')
-
-        if args.gen_meta_h:
-            generate_meta_h(args.output)
-
-        if args.gen_intrin_h:
-            generate_intrin_h(args.output)
-
-        rval = CopyDirFilesIfDifferent(args.output, final_output_dir)
-
-    except:
-        print('ERROR: Could not generate llvm_ir_macros', file=sys.stderr)
-        rval = 1
-
-    finally:
-        DeleteDirTree(args.output)
-
-    return rval
-
-if __name__ == '__main__':
-    sys.exit(main())
-# END OF FILE
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
deleted file mode 100644
index 4739f2078d6..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
+++ /dev/null
@@ -1,360 +0,0 @@
-# Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-
-import os, sys, re
-from gen_common import *
-from argparse import FileType
-
-'''
-'''
-def gen_llvm_type(type, name, idx, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file):
-
-    llvm_type = ''
-
-    if is_llvm_struct:
-        if is_pointer or is_pointer_pointer:
-            llvm_type = 'Type::getInt32Ty(ctx)'
-        else:
-            llvm_type = 'ArrayType::get(Type::getInt8Ty(ctx), sizeof(%s))' % type
-    elif is_llvm_enum:
-        llvm_type = 'Type::getInt32Ty(ctx)'
-    elif is_llvm_pfn:
-        llvm_type = 'PointerType::get(Type::getInt8Ty(ctx), 0)'
-    else:
-        if type == 'BYTE' or type == 'char' or type == 'uint8_t' or type == 'int8_t' or type == 'bool':
-            llvm_type = 'Type::getInt8Ty(ctx)'
-        elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type == 'int64_t' or type == 'gfxptr_t':
-            llvm_type = 'Type::getInt64Ty(ctx)'
-        elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t':
-            llvm_type = 'Type::getInt16Ty(ctx)'
-        elif type == 'UINT' or type == 'INT' or type == 'int' or type == 'BOOL' or type == 'uint32_t' or type == 'int32_t':
-            llvm_type = 'Type::getInt32Ty(ctx)'
-        elif type == 'float' or type == 'FLOAT':
-            llvm_type = 'Type::getFloatTy(ctx)'
-        elif type == 'double' or type == 'DOUBLE':
-            llvm_type = 'Type::getDoubleTy(ctx)'
-        elif type == 'void' or type == 'VOID':
-            llvm_type = 'Type::getInt32Ty(ctx)'
-        elif type == 'HANDLE':
-            llvm_type = 'PointerType::get(Type::getInt32Ty(ctx), 0)'
-        elif type == 'simdscalar':
-            llvm_type = 'getVectorType(Type::getFloatTy(ctx), pJitMgr->mVWidth)'
-        elif type == 'simdscalari':
-            llvm_type = 'getVectorType(Type::getInt32Ty(ctx), pJitMgr->mVWidth)'
-        elif type == 'simd16scalar':
-            llvm_type = 'getVectorType(Type::getFloatTy(ctx), 16)'
-        elif type == 'simd16scalari':
-            llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 16)'
-        elif type == '__m128i':
-            llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 4)'
-        elif type == 'SIMD256::Float':
-            llvm_type = 'getVectorType(Type::getFloatTy(ctx), 8)'
-        elif type == 'SIMD256::Integer':
-            llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 8)'
-        elif type == 'SIMD512::Float':
-            llvm_type = 'getVectorType(Type::getFloatTy(ctx), 16)'
-        elif type == 'SIMD512::Integer':
-            llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 16)'
-        elif type == 'simdvector':
-            llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 8), 4)'
-        elif type == 'simd16vector':
-            llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 16), 4)'
-        elif type == 'SIMD256::Vec4':
-            llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 8), 4)'
-        elif type == 'SIMD512::Vec4':
-            llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 16), 4)'
-        else:
-            llvm_type = 'Gen_%s(pJitMgr)' % type
-
-    if is_pointer:
-        llvm_type = 'PointerType::get(%s, 0)' % llvm_type
-
-    if is_pointer_pointer:
-        llvm_type = 'PointerType::get(%s, 0)' % llvm_type
-
-    if is_array_array:
-        llvm_type = 'ArrayType::get(ArrayType::get(%s, %s), %s)' % (llvm_type, array_count1, array_count)
-    elif is_array:
-        llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count)
-
-    return {
-        'name'  : name,
-        'lineNum' : idx,
-        'type'  : llvm_type,
-    }
-
-'''
-'''
-def gen_llvm_types(input_file, output_file):
-
-    lines = input_file.readlines()
-
-    types = []
-
-    for idx in range(len(lines)):
-        line = lines[idx].rstrip()
-
-        if 'gen_llvm_types FINI' in line:
-            break
-
-        match = re.match(r'(\s*)struct(\s*)(\w+)', line)
-        if match:
-            llvm_args = []
-
-             # Detect start of structure
-            is_fwd_decl = re.search(r';', line)
-
-            if not is_fwd_decl:
-
-                # Extract the command name
-                struct_name = match.group(3).strip()
-
-                type_entry = {
-                    'name'      : struct_name,
-                    'lineNum'   : idx+1,
-                    'members'   : [],
-                }
-
-                end_of_struct = False
-
-                while not end_of_struct and idx < len(lines)-1:
-                    idx += 1
-                    line = lines[idx].rstrip()
-
-                    is_llvm_typedef = re.search(r'@llvm_typedef', line)
-                    if is_llvm_typedef is not None:
-                        is_llvm_typedef = True
-                        continue
-                    else:
-                        is_llvm_typedef = False
-
-                    ###########################################
-                    # Is field a llvm struct? Tells script to treat type as array of bytes that is size of structure.
-                    is_llvm_struct = re.search(r'@llvm_struct', line)
-
-                    if is_llvm_struct is not None:
-                        is_llvm_struct = True
-                    else:
-                        is_llvm_struct = False
-
-                    ###########################################
-                    # Is field the start of a function? Tells script to ignore it
-                    is_llvm_func_start = re.search(r'@llvm_func_start', line)
-
-                    if is_llvm_func_start is not None:
-                        while not end_of_struct and idx < len(lines)-1:
-                            idx += 1
-                            line = lines[idx].rstrip()
-                            is_llvm_func_end = re.search(r'@llvm_func_end', line)
-                            if is_llvm_func_end is not None:
-                                break;
-                        continue
-
-                    ###########################################
-                    # Is field a function? Tells script to ignore it
-                    is_llvm_func = re.search(r'@llvm_func', line)
-
-                    if is_llvm_func is not None:
-                        continue
-
-                    ###########################################
-                    # Is field a llvm enum? Tells script to treat type as an enum and replaced with uint32 type.
-                    is_llvm_enum = re.search(r'@llvm_enum', line)
-
-                    if is_llvm_enum is not None:
-                        is_llvm_enum = True
-                    else:
-                        is_llvm_enum = False
-
-                    ###########################################
-                    # Is field a llvm function pointer? Tells script to treat type as an enum and replaced with uint32 type.
-                    is_llvm_pfn = re.search(r'@llvm_pfn', line)
-
-                    if is_llvm_pfn is not None:
-                        is_llvm_pfn = True
-                    else:
-                        is_llvm_pfn = False
-
-                    ###########################################
-                    # Is field const?
-                    is_const = re.search(r'\s+const\s+', line)
-
-                    if is_const is not None:
-                        is_const = True
-                    else:
-                        is_const = False
-
-                    ###########################################
-                    # Is field a pointer?
-                    is_pointer_pointer = re.search('\*\*', line)
-
-                    if is_pointer_pointer is not None:
-                        is_pointer_pointer = True
-                    else:
-                        is_pointer_pointer = False
-
-                    ###########################################
-                    # Is field a pointer?
-                    is_pointer = re.search('\*', line)
-
-                    if is_pointer is not None:
-                        is_pointer = True
-                    else:
-                        is_pointer = False
-
-                    ###########################################
-                    # Is field an array of arrays?
-                    # TODO: Can add this to a list.
-                    is_array_array = re.search('\[(\w*)\]\[(\w*)\]', line)
-                    array_count = '0'
-                    array_count1 = '0'
-
-                    if is_array_array is not None:
-                        array_count = is_array_array.group(1)
-                        array_count1 = is_array_array.group(2)
-                        is_array_array = True
-                    else:
-                        is_array_array = False
-
-                    ###########################################
-                    # Is field an array?
-                    is_array = re.search('\[(\w*)\]', line)
-
-                    if is_array is not None:
-                        array_count = is_array.group(1)
-                        is_array = True
-                    else:
-                        is_array = False
-
-                    is_scoped = re.search('::', line)
-
-                    if is_scoped is not None:
-                        is_scoped = True
-                    else:
-                        is_scoped = False
-
-                    type = None
-                    name = None
-                    if is_const and is_pointer:
-
-                        if is_scoped:
-                            field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+::)(\w+)(\s*\**\s*)(\w+)', line)
-
-                            type = '%s%s' % (field_match.group(4), field_match.group(5))
-                            name = field_match.group(7)
-                        else:
-                            field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*\**\s*)(\w+)', line)
-
-                            type = field_match.group(4)
-                            name = field_match.group(6)
-
-                    elif is_pointer:
-                        field_match = re.match(r'(\s*)(\s+)(\w+\<*\w*\>*)(\s*\**\s*)(\w+)', line)
-
-                        if field_match:
-                            type = field_match.group(3)
-                            name = field_match.group(5)
-                    elif is_const:
-                        field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*)(\w+)', line)
-
-                        if field_match:
-                            type = field_match.group(4)
-                            name = field_match.group(6)
-                    else:
-                        if is_scoped:
-                            field_match = re.match(r'\s*(\w+\<*\w*\>*)\s*::\s*(\w+\<*\w*\>*)\s+(\w+)', line)
-
-                            if field_match:
-                                type = field_match.group(1) + '::' + field_match.group(2)
-                                name = field_match.group(3)
-                        else:
-                            field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)', line)
-
-                            if field_match:
-                                type = field_match.group(2)
-                                name = field_match.group(4)
-
-                    if is_llvm_typedef is False:
-                        if type is not None:
-                            type_entry['members'].append(
-                                gen_llvm_type(
-                                    type, name, idx+1, is_pointer, is_pointer_pointer, is_array, is_array_array,
-                                    array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file))
-
-                    # Detect end of structure
-                    end_of_struct = re.match(r'(\s*)};', line)
-
-                    if end_of_struct:
-                        types.append(type_entry)
-
-    cur_dir = os.path.dirname(os.path.abspath(__file__))
-    template = os.path.join(cur_dir, 'templates', 'gen_llvm.hpp')
-
-    MakoTemplateWriter.to_file(
-        template,
-        output_file,
-        cmdline=sys.argv,
-        filename=os.path.basename(output_file),
-        types=types,
-        input_dir=os.path.dirname(input_file.name),
-        input_file=os.path.basename(input_file.name))
-
-'''
-    Function which is invoked when this script is started from a command line.
-    Will present and consume a set of arguments which will tell this script how
-    to behave
-'''
-def main():
-
-    # Parse args...
-    parser = ArgumentParser()
-    parser.add_argument('--input', '-i', type=FileType('r'),
-            help='Path to input file containing structs', required=True)
-    parser.add_argument('--output', '-o', action='store',
-            help='Path to output file', required=True)
-    args = parser.parse_args()
-
-    final_output_dir = os.path.dirname(args.output)
-    if MakeDir(final_output_dir):
-        return 1
-
-    final_output_file = args.output
-
-    tmp_dir = MakeTmpDir('_codegen')
-    args.output = os.path.join(tmp_dir, os.path.basename(args.output))
-
-    rval = 0
-    try:
-        gen_llvm_types(args.input, args.output)
-
-        rval = CopyFileIfDifferent(args.output, final_output_file)
-    except:
-        print('ERROR: Could not generate llvm types', file=sys.stderr)
-        rval = 1
-
-    finally:
-        DeleteDirTree(tmp_dir)
-
-    return rval
-
-if __name__ == '__main__':
-    sys.exit(main())
-# END OF FILE
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
deleted file mode 100644
index 75eae353ae1..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
+++ /dev/null
@@ -1,383 +0,0 @@
-# Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-import sys
-
-# Python source
-KNOBS = [
-
-    ['ENABLE_ASSERT_DIALOGS', {
-        'type'      : 'bool',
-        'default'   : 'true',
-        'desc'      : ['Use dialogs when asserts fire.',
-                       'Asserts are only enabled in debug builds'],
-        'category'  : 'debug',
-    }],
-
-    ['SINGLE_THREADED', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['If enabled will perform all rendering on the API thread.',
-                       'This is useful mainly for debugging purposes.'],
-        'category'  : 'debug',
-    }],
-
-    ['DUMP_SHADER_IR', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Dumps shader LLVM IR at various stages of jit compilation.'],
-        'category'  : 'debug',
-    }],
-
-    ['USE_GENERIC_STORETILE', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Always use generic function for performing StoreTile.',
-                       'Will be slightly slower than using optimized (jitted) path'],
-        'category'  : 'debug_adv',
-    }],
-
-    ['FAST_CLEAR', {
-        'type'      : 'bool',
-        'default'   : 'true',
-        'desc'      : ['Replace 3D primitive execute with a SWRClearRT operation and',
-                       'defer clear execution to first backend op on hottile, or hottile store'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['MAX_NUMA_NODES', {
-        'type'      : 'uint32_t',
-        'default'   : '1' if sys.platform == 'win32' else '0',
-        'desc'      : ['Maximum # of NUMA-nodes per system used for worker threads',
-                       '  0 == ALL NUMA-nodes in the system',
-                       '  N == Use at most N NUMA-nodes for rendering'],
-        'category'  : 'perf',
-    }],
-
-    ['MAX_CORES_PER_NUMA_NODE', {
-        'type'      : 'uint32_t',
-        'default'   : '0',
-        'desc'      : ['Maximum # of cores per NUMA-node used for worker threads.',
-                       '  0 == ALL non-API thread cores per NUMA-node',
-                       '  N == Use at most N cores per NUMA-node'],
-        'category'  : 'perf',
-    }],
-
-    ['MAX_THREADS_PER_CORE', {
-        'type'      : 'uint32_t',
-        'default'   : '1',
-        'desc'      : ['Maximum # of (hyper)threads per physical core used for worker threads.',
-                       '  0 == ALL hyper-threads per core',
-                       '  N == Use at most N hyper-threads per physical core'],
-        'category'  : 'perf',
-    }],
-
-    ['MAX_WORKER_THREADS', {
-        'type'      : 'uint32_t',
-        'default'   : '0',
-        'desc'      : ['Maximum worker threads to spawn.',
-                       '',
-                       'IMPORTANT: If this is non-zero, no worker threads will be bound to',
-                       'specific HW threads.  They will all be "floating" SW threads.',
-                       'In this case, the above 3 KNOBS will be ignored.'],
-        'category'  : 'perf',
-    }],
-
-    ['BASE_NUMA_NODE', {
-        'type'      : 'uint32_t',
-        'default'   : '0',
-        'desc'      : ['Starting NUMA node index to use when allocating compute resources.',
-                       'Setting this to a non-zero value will reduce the maximum # of NUMA nodes used.'],
-        'category'  : 'perf',
-    }],
-
-    ['BASE_CORE', {
-        'type'      : 'uint32_t',
-        'default'   : '0',
-        'desc'      : ['Starting core index to use when allocating compute resources.',
-                       'Setting this to a non-zero value will reduce the maximum # of cores used.'],
-        'category'  : 'perf',
-    }],
-
-    ['BASE_THREAD', {
-        'type'      : 'uint32_t',
-        'default'   : '0',
-        'desc'      : ['Starting thread index to use when allocating compute resources.',
-                       'Setting this to a non-zero value will reduce the maximum # of threads used.'],
-        'category'  : 'perf',
-    }],
-
-    ['BUCKETS_START_FRAME', {
-        'type'      : 'uint32_t',
-        'default'   : '1200',
-        'desc'      : ['Frame from when to start saving buckets data.',
-                       '',
-                       'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
-                       'for this to have an effect.'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['BUCKETS_END_FRAME', {
-        'type'      : 'uint32_t',
-        'default'   : '1400',
-        'desc'      : ['Frame at which to stop saving buckets data.',
-                       '',
-                       'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
-                       'for this to have an effect.'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['WORKER_SPIN_LOOP_COUNT', {
-        'type'      : 'uint32_t',
-        'default'   : '5000',
-        'desc'      : ['Number of spin-loop iterations worker threads will perform',
-                       'before going to sleep when waiting for work'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['MAX_DRAWS_IN_FLIGHT', {
-        'type'      : 'uint32_t',
-        'default'   : '256',
-        'desc'      : ['Maximum number of draws outstanding before API thread blocks.',
-                       'This value MUST be evenly divisible into 2^32'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['MAX_PRIMS_PER_DRAW', {
-        'type'      : 'uint32_t',
-        'default'   : '49152',
-        'desc'      : ['Maximum primitives in a single Draw().',
-                       'Larger primitives are split into smaller Draw calls.',
-                       'Should be a multiple of (3 * vectorWidth).'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['MAX_TESS_PRIMS_PER_DRAW', {
-        'type'      : 'uint32_t',
-        'default'   : '16',
-        'desc'      : ['Maximum primitives in a single Draw() with tessellation enabled.',
-                       'Larger primitives are split into smaller Draw calls.',
-                       'Should be a multiple of (vectorWidth).'],
-        'category'  : 'perf_adv',
-    }],
-
-
-    ['DEBUG_OUTPUT_DIR', {
-        'type'      : 'std::string',
-        'default'   : r'%TEMP%\Rast\DebugOutput' if sys.platform == 'win32' else '/tmp/Rast/DebugOutput',
-        'desc'      : ['Output directory for debug data.'],
-        'category'  : 'debug',
-    }],
-
-    ['JIT_ENABLE_CACHE', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Enables caching of compiled shaders'],
-        'category'  : 'debug_adv',
-    }],
-
-    ['JIT_OPTIMIZATION_LEVEL', {
-        'type'      : 'int',
-        'default'   : '-1',
-        'desc'      : ['JIT compile optimization level:',],
-        'category'  : 'debug',
-        'control'   : 'dropdown',
-        'choices' : [
-            {
-                'name'  : 'Automatic',
-                'desc'  : 'Automatic based on other KNOB and build settings',
-                'value' : -1,
-            },
-            {
-                'name'  : 'Debug',
-                'desc'  : 'No optimization: -O0',
-                'value' : 0,
-            },
-            {
-                'name'  : 'Less',
-                'desc'  : 'Some optimization: -O1',
-                'value' : 1,
-            },
-            {
-                'name'  : 'Optimize',
-                'desc'  : 'Default Clang / LLVM optimizations: -O2',
-                'value' : 2,
-            },
-            {
-                'name'  : 'Aggressive',
-                'desc'  : 'Maximum optimization: -O3',
-                'value' : 3,
-            },
-        ],
-    }],
-
-    ['JIT_CACHE_DIR', {
-        'type'      : 'std::string',
-        'default'   : r'%TEMP%\SWR\JitCache' if sys.platform == 'win32' else '${HOME}/.swr/jitcache',
-        'desc'      : ['Cache directory for compiled shaders.'],
-        'category'  : 'debug',
-    }],
-
-    ['TOSS_DRAW', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Disable per-draw/dispatch execution'],
-        'category'  : 'perf',
-    }],
-
-    ['TOSS_QUEUE_FE', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Stop per-draw execution at worker FE',
-                       '',
-                       'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['TOSS_FETCH', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Stop per-draw execution at vertex fetch',
-                       '',
-                       'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['TOSS_IA', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Stop per-draw execution at input assembler',
-                       '',
-                       'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['TOSS_VS', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Stop per-draw execution at vertex shader',
-                       '',
-                       'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['TOSS_SETUP_TRIS', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Stop per-draw execution at primitive setup',
-                       '',
-                       'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['TOSS_BIN_TRIS', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Stop per-draw execution at primitive binning',
-                       '',
-                       'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['TOSS_RS', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Stop per-draw execution at rasterizer',
-                       '',
-                       'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['DISABLE_SPLIT_DRAW', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Don\'t split large draws into smaller draws.,',
-                       'MAX_PRIMS_PER_DRAW and MAX_TESS_PRIMS_PER_DRAW can be used to control split size.',
-                       '',
-                       'Useful to disable split draws for gathering archrast stats.'],
-        'category'  : 'perf_adv',
-    }],
-
-    ['AR_ENABLE_PIPELINE_STATS', {
-        'type'      : 'bool',
-        'default'   : 'true',
-        'desc'      : ['Enable pipeline stats when using Archrast'],
-        'category'  : 'archrast',
-    }],
-
-    ['AR_ENABLE_SHADER_STATS', {
-        'type'      : 'bool',
-        'default'   : 'true',
-        'desc'      : ['Enable shader stats when using Archrast'],
-        'category'  : 'archrast',
-    }],
-
-    ['AR_ENABLE_SWTAG_DATA', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Enable SWTag data when using Archrast'],
-        'category'  : 'archrast',
-    }],
-
-    ['AR_ENABLE_SWR_EVENTS', {
-        'type'      : 'bool',
-        'default'   : 'true',
-        'desc'      : ['Enable internal SWR events when using Archrast'],
-        'category'  : 'archrast',
-    }],
-
-    ['AR_ENABLE_PIPELINE_EVENTS', {
-        'type'      : 'bool',
-        'default'   : 'true',
-        'desc'      : ['Enable pipeline events when using Archrast'],
-        'category'  : 'archrast',
-    }],
-
-    ['AR_ENABLE_SHADER_EVENTS', {
-        'type'      : 'bool',
-        'default'   : 'true',
-        'desc'      : ['Enable shader events when using Archrast'],
-        'category'  : 'archrast',
-    }],
-
-    ['AR_ENABLE_SWTAG_EVENTS', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Enable SWTag events when using Archrast'],
-        'category'  : 'archrast',
-    }],
-
-    ['AR_ENABLE_MEMORY_EVENTS', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['Enable memory events when using Archrast'],
-        'category'  : 'archrast',
-    }],
-
-    ['AR_MEM_SET_BYTE_GRANULARITY', {
-        'type'      : 'uint32_t',
-        'default'   : '64',
-        'desc'      : ['Granularity and alignment of tracking of memory accesses',
-                       'ONLY ACTIVE UNDER ArchRast.'],
-        'category'  : 'archrast',
-    }],
-
-
-    ]
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/meson.build b/src/gallium/drivers/swr/rasterizer/codegen/meson.build
deleted file mode 100644
index daf79ed4c26..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/meson.build
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright © 2017-2018 Intel Corporation
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-gen_knobs_cpp = custom_target(
-  'gen_knobs.cpp',
-  input : ['gen_knobs.py'],
-  output : 'gen_knobs.cpp',
-  command : [prog_python, '@INPUT0@', '--output', '@OUTPUT@', '--gen_cpp'],
-  depend_files : files(
-    'knob_defs.py', 'gen_common.py',
-    'templates/gen_knobs.cpp',
-  ),
-)
-
-gen_knobs_h = custom_target(
-  'gen_knobs.h',
-  input : ['gen_knobs.py'],
-  output : 'gen_knobs.h',
-  command : [prog_python, '@INPUT0@', '--output', '@OUTPUT@', '--gen_h'],
-  depend_files : files(
-    'knob_defs.py', 'gen_common.py',
-    'templates/gen_knobs.h',
-  ),
-)
-
-
-# The generators above this are needed individually, while the below generators
-# are all inputs to the same lib, so they don't need unique names.
-files_swr_common += [
-  gen_builder_hpp, gen_builder_meta_hpp, gen_knobs_h, gen_knobs_cpp
-]
-
-foreach x : [[swr_context_files, 'gen_swr_context_llvm.h'],
-             [swr_state_files, 'gen_state_llvm.h'],
-             [swr_surf_state_files, 'gen_surf_state_llvm.h']]
-  files_swr_common += custom_target(
-    x[1],
-    input : ['gen_llvm_types.py', x[0]],
-    output : x[1],
-    command : [prog_python, '@INPUT0@', '--input', '@INPUT1@', '--output', '@OUTPUT@'],
-    depend_files : files(
-      'templates/gen_llvm.hpp',
-      'gen_common.py',
-    ),
-  )
-endforeach
-
-ar_output_filenames = ['gen_ar_event.hpp', 'gen_ar_event.cpp', 'gen_ar_eventhandler.hpp', 'gen_ar_eventhandlerfile.hpp']
-ar_template_filenames = []
-foreach fname : ar_output_filenames
-    ar_template_filenames += join_paths('templates', fname)
-endforeach
-
-files_swr_common += custom_target(
-    'gen_archrast',
-    input : ['gen_archrast.py', swr_event_proto_files, swr_event_pproto_files],
-    output : ar_output_filenames,
-    command : [prog_python, '@INPUT0@', '--proto', '@INPUT1@', '@INPUT2@', '--output-dir', meson.current_build_dir()],
-    depend_files : files('gen_common.py', ar_template_filenames)
-)
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp
deleted file mode 100644
index e73a8110ee1..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file ${filename}
- *
- * @brief Implementation for events.  auto-generated file
- *
- * DO NOT EDIT
- *
- * Generation Command Line:
- *  ${'\n *    '.join(cmdline)}
- *
- ******************************************************************************/
-// clang-format off
-#include "common/os.h"
-#include "gen_ar_event.hpp"
-#include "gen_ar_eventhandler.hpp"
-
-using namespace ArchRast;
-
-<%  sorted_groups = sorted(protos['events']['groups']) %>
-%   for group in sorted_groups:
-%       for event_key in protos['events']['groups'][group]:
-<%
-        event = protos['events']['defs'][event_key]
-%>
-void ${event['name']}::Accept(EventHandler* pHandler) const
-{
-    pHandler->Handle(*this);
-}
-%       endfor
-%   endfor
-
-
-// clang-format on
-
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp
deleted file mode 100644
index 3ef99da2249..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file ${filename}
- *
- * @brief Definitions for events.  auto-generated file
- *
- * DO NOT EDIT
- *
- * Generation Command Line:
- *  ${'\n *    '.join(cmdline)}
- *
- ******************************************************************************/
-// clang-format off
-#pragma once
-
-#include "common/os.h"
-#include "core/state.h"
-
-<%
-    always_enabled_knob_groups = ['Framework', 'SWTagFramework', 'ApiSwr']
-    group_knob_remap_table = {
-        "ShaderStats": "KNOB_AR_ENABLE_SHADER_STATS",
-        "PipelineStats" : "KNOB_AR_ENABLE_PIPELINE_STATS",
-        "SWTagData" : "KNOB_AR_ENABLE_SWTAG_DATA",
- }
-%>
-namespace ArchRast
-{
-<% sorted_enums = sorted(protos['enums']['defs']) %>
-% for name in sorted_enums:
-    enum ${name}
-    {<% names = protos['enums']['defs'][name]['names'] %>
-        % for i in range(len(names)):
-        ${names[i].lstrip()}
-        % endfor
-    };
-% endfor
-
-    // Forward decl
-    class EventHandler;
-
-    //////////////////////////////////////////////////////////////////////////
-    /// Event - abstract base for all events; dispatched to an EventHandler via Accept().
-    //////////////////////////////////////////////////////////////////////////
-    struct Event
-    {
-        const uint32_t eventId = {0xFFFFFFFF};
-        Event() {}
-        virtual ~Event() {}
-
-        virtual bool IsEnabled() const { return true; };
-        virtual const uint32_t GetEventId() const = 0;
-        virtual void Accept(EventHandler* pHandler) const = 0;
-    };
-
-<%  sorted_groups = sorted(protos['events']['groups']) %>
-% for group in sorted_groups:
-    % for event_key in protos['events']['groups'][group]:
-<%
-        event = protos['events']['defs'][event_key]
-%>
-    //////////////////////////////////////////////////////////////////////////
-    /// ${event_key}Data
-    //////////////////////////////////////////////////////////////////////////
-#pragma pack(push, 1)
-    struct ${event['name']}Data
-    {<%
-        fields = event['fields'] %>
-        // Fields
-        % for i in range(len(fields)):
-            % if fields[i]['size'] > 1:
-        ${fields[i]['type']} ${fields[i]['name']}[${fields[i]['size']}];
-            % else:
-        ${fields[i]['type']} ${fields[i]['name']};
-            % endif
-        % endfor
-    };
-#pragma pack(pop)
-
-    //////////////////////////////////////////////////////////////////////////
-    /// ${event_key}
-    //////////////////////////////////////////////////////////////////////////
-    struct ${event['name']} : Event
-    {<%
-        fields = event['fields'] %>
-        const uint32_t eventId = {${ event['id'] }};
-        ${event['name']}Data data;
-
-        // Constructor
-        ${event['name']}(
-        % for i in range(len(fields)):
-            % if i < len(fields)-1:
-                % if fields[i]['size'] > 1:
-            ${fields[i]['type']}* ${fields[i]['name']},
-            uint32_t ${fields[i]['name']}_size,
-                % else:
-            ${fields[i]['type']} ${fields[i]['name']},
-                % endif
-            % endif
-            % if i == len(fields)-1:
-                % if fields[i]['size'] > 1:
-            ${fields[i]['type']}* ${fields[i]['name']},
-            uint32_t ${fields[i]['name']}_size
-                % else:
-            ${fields[i]['type']} ${fields[i]['name']}
-                % endif
-            % endif
-        % endfor
-        )
-        {
-        % for i in range(len(fields)):
-            % if fields[i]['size'] > 1:
-                % if fields[i]['type'] == 'char':
-            // Copy size of string (null-terminated) followed by string into entire buffer
-            SWR_ASSERT(${fields[i]['name']}_size + 1 < ${fields[i]['size']} - sizeof(uint32_t), "String length must be less than size of char buffer - size(uint32_t)!");
-            memcpy(data.${fields[i]['name']}, &${fields[i]['name']}_size, sizeof(uint32_t));
-            strcpy_s(data.${fields[i]['name']} + sizeof(uint32_t), ${fields[i]['name']}_size + 1, ${fields[i]['name']});
-                % else:
-            memcpy(data.${fields[i]['name']}, ${fields[i]['name']}, ${fields[i]['name']}_size);
-                % endif
-            % else:
-            data.${fields[i]['name']} = ${fields[i]['name']};
-            % endif
-        % endfor
-        }
-
-        virtual void Accept(EventHandler* pHandler) const;
-        inline const uint32_t GetEventId() const { return eventId; }
-        % if group not in always_enabled_knob_groups:
-        <% 
-            if group in group_knob_remap_table:
-                group_knob_define = group_knob_remap_table[group]
-            else:
-                group_knob_define = 'KNOB_AR_ENABLE_' + group.upper() + '_EVENTS'
-        %>
-        bool IsEnabled() const
-        {
-            static const bool IsEventEnabled = true;    // TODO: Replace with knob for each event
-            return ${group_knob_define} && IsEventEnabled;
-        }
-        % endif
-    };
-
-    % endfor
-
-% endfor
-} // namespace ArchRast
-// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp
deleted file mode 100644
index d3e82e8a4ee..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file ${filename}
- *
- * @brief Event handler interface.  auto-generated file
- *
- * DO NOT EDIT
- *
- * Generation Command Line:
- *  ${'\n *    '.join(cmdline)}
- *
- ******************************************************************************/
-// clang-format off
-#pragma once
-
-#include "${event_header}"
-
-namespace ArchRast
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// EventHandler - interface for handling events.
-    //////////////////////////////////////////////////////////////////////////
-    class EventHandler
-    {
-    public:
-        EventHandler() {}
-        virtual ~EventHandler() {}
-
-        virtual void FlushDraw(uint32_t drawId) {}
-
-<%  sorted_groups = sorted(protos['events']['groups']) %>
-%   for group in sorted_groups:
-%       for event_key in protos['events']['groups'][group]:
-<%
-            event = protos['events']['defs'][event_key]
-%>        virtual void Handle(const ${event['name']}& event) {}
-%       endfor
-%   endfor
-    };
-} // namespace ArchRast
-// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
deleted file mode 100644
index ba5a51700f3..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file ${filename}
- *
- * @brief Event handler interface.  auto-generated file
- *
- * DO NOT EDIT
- *
- * Generation Command Line:
- *  ${'\n *    '.join(cmdline)}
- *
- ******************************************************************************/
-// clang-format off
-#pragma once
-
-#include "common/os.h"
-#include "${event_header}"
-#include <fstream>
-#include <sstream>
-#include <iostream>
-#include <thread>
-
-namespace ArchRast
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// EventHandlerFile - event handler that buffers events and appends them to a binary file.
-    //////////////////////////////////////////////////////////////////////////
-    class EventHandlerFile : public EventHandler
-    {
-    public:
-        EventHandlerFile(uint32_t id) : mBufOffset(0)
-        {
-#if defined(_WIN32)
-            DWORD pid = GetCurrentProcessId();
-            TCHAR procname[MAX_PATH];
-            GetModuleFileName(NULL, procname, MAX_PATH);
-            const char*       pBaseName = strrchr(procname, '\\');
-            std::stringstream outDir;
-            outDir << KNOB_DEBUG_OUTPUT_DIR << pBaseName << "_" << pid << std::ends;
-            mOutputDir = outDir.str();
-            if (CreateDirectory(mOutputDir.c_str(), NULL))
-            {
-                std::cout << std::endl
-                          << "ArchRast Dir:       " << mOutputDir << std::endl
-                          << std::endl
-                          << std::flush;
-            }
-
-            // There could be multiple threads creating thread pools. We
-            // want to make sure they are uniquely identified by adding in
-            // the creator's thread id into the filename.
-            std::stringstream fstr;
-            fstr << outDir.str().c_str() << "\\ar_event" << std::this_thread::get_id();
-            fstr << "_" << id << ".bin" << std::ends;
-            mFilename = fstr.str();
-#else
-            // There could be multiple threads creating thread pools. We
-            // want to make sure they are uniquely identified by adding in
-            // the creator's thread id into the filename.
-            std::stringstream fstr;
-            fstr << "/tmp/ar_event" << std::this_thread::get_id();
-            fstr << "_" << id << ".bin" << std::ends;
-            mFilename = fstr.str();
-#endif
-        }
-
-        virtual ~EventHandlerFile() { FlushBuffer(); }
-
-        //////////////////////////////////////////////////////////////////////////
-        /// @brief Append buffered events to mFilename. Returns false if only the header is buffered or the file cannot be opened; true otherwise (including an empty buffer).
-        bool FlushBuffer()
-        {
-            if (mBufOffset > 0)
-            {
-                if (mBufOffset == mHeaderBufOffset)
-                {
-                    // Nothing to flush. Only header has been generated.
-                    return false;
-                }
-
-                std::ofstream file;
-                file.open(mFilename, std::ios::out | std::ios::app | std::ios::binary);
-
-                if (!file.is_open())
-                {
-                    SWR_INVALID("ArchRast: Could not open event file!");
-                    return false;
-                }
-
-                file.write((char*)mBuffer, mBufOffset);
-                file.close();
-
-                mBufOffset       = 0;
-                mHeaderBufOffset = 0; // Reset header offset so it's no longer considered.
-            }
-            return true;
-        }
-
-        //////////////////////////////////////////////////////////////////////////
-        /// @brief Write event and its payload to the memory buffer.
-        void Write(uint32_t eventId, const char* pBlock, uint32_t size)
-        {
-            if ((mBufOffset + size + sizeof(eventId)) > mBufferSize)
-            {
-                if (!FlushBuffer())
-                {
-                    // Don't corrupt what's already in the buffer?
-                    /// @todo Maybe add corrupt marker to buffer here in case we can open file in
-                    /// future?
-                    return;
-                }
-            }
-
-            memcpy(&mBuffer[mBufOffset], (char*)&eventId, sizeof(eventId));
-            mBufOffset += sizeof(eventId);
-            memcpy(&mBuffer[mBufOffset], pBlock, size);
-            mBufOffset += size;
-        }
-<%  sorted_groups = sorted(protos['events']['groups']) %>
-%   for group in sorted_groups:
-%       for event_key in protos['events']['groups'][group]:
-<%
-            event = protos['events']['defs'][event_key]
-%>
-        //////////////////////////////////////////////////////////////////////////
-        /// @brief Handle ${event_key} event
-        virtual void Handle(const ${event['name']}& event)
-        {
-% if event['num_fields'] == 0:
-            Write(event.eventId, (char*)&event.data, 0);
-% else:
-            Write(event.eventId, (char*)&event.data, sizeof(event.data));
-% endif
-        }
-%       endfor
-%   endfor
-
-        //////////////////////////////////////////////////////////////////////////
-        /// @brief Everything written to the buffer up to this point is the header.
-        virtual void MarkHeader()
-        {
-            mHeaderBufOffset = mBufOffset;
-        }
-
-        std::string mFilename;
-        std::string mOutputDir;
-
-        static const uint32_t mBufferSize = 1024;
-        uint8_t               mBuffer[mBufferSize];
-        uint32_t mBufOffset{0};
-        uint32_t mHeaderBufOffset{0};
-    };
-} // namespace ArchRast
-// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
deleted file mode 100644
index b8da5298f3d..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//============================================================================
-// Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a
-// copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice (including the next
-// paragraph) shall be included in all copies or substantial portions of the
-// Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-// IN THE SOFTWARE.
-//
-// @file BackendPixelRate${fileNum}.cpp
-//
-// @brief auto-generated file
-//
-// DO NOT EDIT
-//
-// Generation Command Line:
-//  ${'\n//    '.join(cmdline)}
-//
-//============================================================================
-
-#include "core/backend.h"
-#include "core/backend_impl.h"
-
-void InitBackendPixelRate${fileNum}()
-{
-    %for func in funcList:
-    ${func}
-    %endfor
-}
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
deleted file mode 100644
index da1ca87620a..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-//============================================================================
-// Copyright (C) 2014-2020 Intel Corporation.   All Rights Reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a
-// copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice (including the next
-// paragraph) shall be included in all copies or substantial portions of the
-// Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-// IN THE SOFTWARE.
-//
-// @file ${filename}
-//
-// @brief auto-generated file
-//
-// DO NOT EDIT
-//
-// Generation Command Line:
-//  ${'\n//    '.join(cmdline)}
-//
-//============================================================================
-// clang-format off
-#pragma once
-
-//============================================================================
-// Auto-generated ${comment}
-//============================================================================
-%for func in functions:
-<%argList = ', '.join(func['args'])%>\
-${func['decl']}
-{
-%if isX86:
-    %if len(func['args']) != 0:
-    SmallVector<Type*, ${len(func['args'])}> argTypes;
-    %for arg in func['args']:
-    argTypes.push_back(${arg}->getType());
-    %endfor
-#if LLVM_VERSION_MAJOR >= 12
-    #define VEC_GET_NUM_ELEMS cast<FixedVectorType>(a->getType())->getNumElements()
-#elif LLVM_VERSION_MAJOR >= 11
-    #define VEC_GET_NUM_ELEMS cast<VectorType>(a->getType())->getNumElements()
-#else
-    #define VEC_GET_NUM_ELEMS a->getType()->getVectorNumElements()
-#endif
-    FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, argTypes, false);
-    %else:
-    FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, {}, false);
-    %endif:
-#if LLVM_VERSION_MAJOR >= 9
-    Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("meta.intrinsic.${func['name']}", pFuncTy).getCallee());
-#else
-    Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("meta.intrinsic.${func['name']}", pFuncTy));
-#endif
-    return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
-%elif isIntrin:
-    %if len(func['types']) != 0:
-    SmallVector<Type*, ${len(func['types'])}> args;
-    %for arg in func['types']:
-    args.push_back(${arg}->getType());
-    %endfor
-    Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}, args);
-    return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
-    %else:
-    Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']});
-    return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
-    %endif
-%else:
-    return IRB()->${func['intrin']}(${argList});
-%endif
-}
-
-% endfor
-    // clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp
deleted file mode 100644
index d0682c55f03..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-//============================================================================
-// Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a
-// copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice (including the next
-// paragraph) shall be included in all copies or substantial portions of the
-// Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-// IN THE SOFTWARE.
-//
-// @file ${filename}
-//
-// @brief auto-generated file
-//
-// DO NOT EDIT
-//
-// Generation Command Line:
-//  ${'\n//    '.join(cmdline)}
-//
-//============================================================================
-
-// clang-format off
-
-%for num in range(numFiles):
-void Init${tableName}${num}();
-%endfor
-
-static INLINE void Init${tableName}()
-{
-    %for num in range(numFiles):
-    Init${tableName}${num}();
-    %endfor
-}
-// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
deleted file mode 100644
index 194499aa1e0..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2015-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file ${filename}.cpp
- *
- * @brief Dynamic Knobs for Core.
- *
- * ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
- *
- * Generation Command Line:
- *  ${'\n *    '.join(cmdline)}
- *
- ******************************************************************************/
-// clang-format off
-<% calc_max_knob_len(knobs) %>
-% for inc in includes:
-#include <${inc}>
-% endfor
-#include <regex>
-#include <core/utils.h>
-
-//========================================================
-// Implementation: replaces ${'${'}VAR} and %VAR% references in `text`, in place, with GetEnv(VAR)
-//========================================================
-void KnobBase::autoExpandEnvironmentVariables(std::string& text)
-{
-    size_t start;
-    while ((start = text.find("${'${'}")) != std::string::npos)
-    {
-        size_t end = text.find("}", start + 2); // search past the opener; a '}' earlier in text would give end < start
-        if (end == std::string::npos)
-            break;
-        const std::string var = GetEnv(text.substr(start + 2, end - start - 2));
-        text.replace(start, end - start + 1, var);
-    }
-    // win32 style variable replacement
-    while ((start = text.find("%")) != std::string::npos)
-    {
-        size_t end = text.find("%", start + 1);
-        if (end == std::string::npos)
-            break;
-        const std::string var = GetEnv(text.substr(start + 1, end - start - 1));
-        text.replace(start, end - start + 1, var);
-    }
-}
-
-//========================================================
-// Static Data Members
-//========================================================
-% for knob in knobs:
-% if knob[1]['type'] == 'std::string':
-${knob[1]['type']} GlobalKnobs::Knob_${knob[0]}::m_default = "${repr(knob[1]['default'])[1:-1]}";
-% else:
-${knob[1]['type']} GlobalKnobs::Knob_${knob[0]}::m_default = ${knob[1]['default']};
-% endif
-% endfor
-GlobalKnobs g_GlobalKnobs;
-
-//========================================================
-// Knob Initialization
-//========================================================
-GlobalKnobs::GlobalKnobs()
-{
-    % for knob in knobs :
-    InitKnob(${ knob[0] });
-    % endfor
-}
-
-//========================================================
-// Knob Display (Convert to String)
-//========================================================
-std::string GlobalKnobs::ToString(const char* optPerLinePrefix)
-{
-    std::basic_stringstream<char> str;
-    str << std::showbase << std::setprecision(1) << std::fixed;
-
-    if (optPerLinePrefix == nullptr)
-    {
-        optPerLinePrefix = "";
-    }
-
-    % for knob in knobs:
-    str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}";
-    % if knob[1]['type'] == 'bool':
-    str << (KNOB_${knob[0]} ? "+\n" : "-\n");
-    % elif knob[1]['type'] != 'float' and knob[1]['type'] != 'std::string':
-    str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]};
-    str << std::dec << KNOB_${knob[0]} << "\n";
-    % else:
-    str << KNOB_${knob[0]} << "\n";
-    % endif
-    % endfor
-    str << std::ends;
-
-    return str.str();
-}
-<%!
-    # Globally available python 
-    max_len = 0
-    def calc_max_knob_len(knobs):
-        global max_len
-        max_len = 0
-        for knob in knobs:
-            if len(knob[0]) > max_len: max_len = len(knob[0])
-        max_len += len('KNOB_ ')
-        if max_len % 4: max_len += 4 - (max_len % 4)
-
-    def space_knob(knob):
-        knob_len = len('KNOB_' + knob)
-        return ' '*(max_len - knob_len)
-
-    def calc_max_name_len(choices_array):
-        _max_len = 0
-        for choice in choices_array:
-            if len(choice['name']) > _max_len: _max_len = len(choice['name'])
-
-        if _max_len % 4: _max_len += 4 - (_max_len % 4)
-        return _max_len
-
-    def space_name(name, max_len):
-        name_len = len(name)
-        return ' '*(max_len - name_len)
-%>
-// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h
deleted file mode 100644
index 8b88a11706c..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2015-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file ${filename}.h
- *
- * @brief Dynamic Knobs for Core.
- *
- * ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
- *
- * Generation Command Line:
- *  ${'\n *    '.join(cmdline)}
- *
- ******************************************************************************/
-// clang-format off
-<% calc_max_knob_len(knobs) %>
-#pragma once
-#include <string>
-
-struct KnobBase
-{
-private:
-    // Update the input string.
-    static void autoExpandEnvironmentVariables(std::string& text);
-
-protected:
-    // Leave input alone and return new string.
-    static std::string expandEnvironmentVariables(std::string const& input)
-    {
-        std::string text = input;
-        autoExpandEnvironmentVariables(text);
-        return text;
-    }
-
-    template <typename T>
-    static T expandEnvironmentVariables(T const& input)
-    {
-        return input;
-    }
-};
-
-template <typename T>
-struct Knob : KnobBase
-{
-public:
-    const T& Value() const { return m_Value; }
-    const T& Value(T const& newValue)
-    {
-        m_Value = expandEnvironmentVariables(newValue);
-        return Value();
-    }
-
-private:
-    T m_Value;
-};
-
-#define DEFINE_KNOB(_name, _type)                               \\
-
-    struct Knob_##_name : Knob<_type>                           \\
-
-    {                                                           \\
-
-        static const char* Name() { return "KNOB_" #_name; }    \\
-
-        static _type DefaultValue() { return (m_default); }     \\
-
-    private:                                                    \\
-
-        static _type m_default;                                 \\
-
-    } _name;
-
-#define GET_KNOB(_name)             g_GlobalKnobs._name.Value()
-#define SET_KNOB(_name, _newValue)  g_GlobalKnobs._name.Value(_newValue)
-
-struct GlobalKnobs
-{
-    % for knob in knobs:
-    //-----------------------------------------------------------
-    // KNOB_${knob[0]}
-    //
-    % for line in knob[1]['desc']:
-    // ${line}
-    % endfor
-    % if knob[1].get('choices'):
-    <%
-    choices = knob[1].get('choices')
-    _max_len = calc_max_name_len(choices) %>//
-    % for i in range(len(choices)):
-    //     ${choices[i]['name']}${space_name(choices[i]['name'], _max_len)} = ${format(choices[i]['value'], '#010x')}
-    % endfor
-    % endif
-    //
-    DEFINE_KNOB(${knob[0]}, ${knob[1]['type']});
-
-    % endfor
-
-    std::string ToString(const char* optPerLinePrefix="");
-    GlobalKnobs();
-};
-extern GlobalKnobs g_GlobalKnobs;
-
-#undef DEFINE_KNOB
-
-% for knob in knobs:
-#define KNOB_${knob[0]}${space_knob(knob[0])} GET_KNOB(${knob[0]})
-% endfor
-
-<%!
-    # Globally available python 
-    max_len = 0
-    def calc_max_knob_len(knobs):
-        global max_len
-        max_len = 0
-        for knob in knobs:
-            if len(knob[0]) > max_len: max_len = len(knob[0])
-        max_len += len('KNOB_ ')
-        if max_len % 4: max_len += 4 - (max_len % 4)
-
-    def space_knob(knob):
-        knob_len = len('KNOB_' + knob)
-        return ' '*(max_len - knob_len)
-
-    def calc_max_name_len(choices_array):
-        _max_len = 0
-        for choice in choices_array:
-            if len(choice['name']) > _max_len: _max_len = len(choice['name'])
-
-        if _max_len % 4: _max_len += 4 - (_max_len % 4)
-        return _max_len
-
-    def space_name(name, max_len):
-        name_len = len(name)
-        return ' '*(max_len - name_len)
-%>
-// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp
deleted file mode 100644
index 99a3f300bba..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file ${filename}
- *
- * @brief auto-generated file
- *
- * DO NOT EDIT
- *
- * Generation Command Line:
- *   ${'\n *     '.join(cmdline)}
- *
- ******************************************************************************/
-// clang-format off
-
-#include <llvm/IR/DerivedTypes.h>
-
-#pragma once
-
-namespace SwrJit
-{
-    using namespace llvm;
-
-%for type in types:
-    INLINE static StructType* Gen_${type['name']}(JitManager* pJitMgr)
-    {
-        %if needs_ctx(type):
-        LLVMContext& ctx = pJitMgr->mContext;
-
-        %endif
-#if LLVM_VERSION_MAJOR >= 12
-        StructType* pRetType = StructType::getTypeByName(pJitMgr->mContext, "${type['name']}");
-#else
-        StructType* pRetType = pJitMgr->mpCurrentModule->getTypeByName("${type['name']}");
-#endif
-        if (pRetType == nullptr)
-        {
-            std::vector<Type*> members =<% (max_type_len, max_name_len) = calc_max_len(type['members']) %>
-            {
-                %for member in type['members']:
-                /* ${member['name']} ${pad(len(member['name']), max_name_len)}*/ ${member['type']},
-                %endfor
-            };
-
-            pRetType = StructType::create(members, "${type['name']}", false);
-
-            // Compute debug metadata
-            llvm::DIBuilder builder(*pJitMgr->mpCurrentModule);
-            llvm::DIFile* pFile = builder.createFile("${input_file}", "${os.path.normpath(input_dir).replace('\\', '/')}");
-
-            std::vector<std::pair<std::string, uint32_t>> dbgMembers =
-            {
-                %for member in type['members']:
-                std::make_pair("${member['name']}", ${pad(len(member['name']), max_name_len)}${member['lineNum']}),
-                %endfor
-            };
-            pJitMgr->CreateDebugStructType(pRetType, "${type['name']}", pFile, ${type['lineNum']}, dbgMembers);
-        }
-
-        return pRetType;
-    }
-
-    %for member in type['members']:
-    static const uint32_t ${type['name']}_${member['name']} ${pad(len(member['name']), max_name_len)}= ${loop.index};
-    %endfor
-
-%endfor
-} // namespace SwrJit
-
-<%! # Global function definitions
-    import os
-    def needs_ctx(struct_type):
-        for m in struct_type.get('members', []):
-            if '(ctx)' in m.get('type', ''):
-                return True
-        return False
-
-    def calc_max_len(fields):
-        max_type_len = 0
-        max_name_len = 0
-        for f in fields:
-            if len(f['type']) > max_type_len: max_type_len = len(f['type'])
-            if len(f['name']) > max_name_len: max_name_len = len(f['name'])
-        return (max_type_len, max_name_len)
-
-    def pad(cur_len, max_len):
-        pad_amt = max_len - cur_len
-        return ' '*pad_amt
-%>
-// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp
deleted file mode 100644
index 92e0f406235..00000000000
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//============================================================================
-// Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a
-// copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice (including the next
-// paragraph) shall be included in all copies or substantial portions of the
-// Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-// IN THE SOFTWARE.
-//
-// @file gen_rasterizer${fileNum}.cpp
-//
-// @brief auto-generated file
-//
-// DO NOT EDIT
-//
-// Generation Command Line:
-//  ${'\n//    '.join(cmdline)}
-//
-//============================================================================
-// clang-format off
-
-#include "core/rasterizer.h"
-#include "core/rasterizer_impl.h"
-
-void InitRasterizerFuncs${fileNum}()
-{
-    %for func in funcList:
-    ${func}
-    %endfor
-}
-// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp b/src/gallium/drivers/swr/rasterizer/common/formats.cpp
deleted file mode 100644
index e0800f5e88e..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/formats.cpp
+++ /dev/null
@@ -1,9298 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file formats.cpp
- *
- * @brief auto-generated file
- *
- * DO NOT EDIT
- *
- ******************************************************************************/
-
-#include "formats.h"
-
-// lookup table for unorm8 srgb -> float conversion
-const uint32_t srgb8Table[256] = {
-    0x00000000, 0x399f22b4, 0x3a1f22b4, 0x3a6eb40f, 0x3a9f22b4, 0x3ac6eb61, 0x3aeeb40f, 0x3b0b3e5e,
-    0x3b1f22b4, 0x3b33070b, 0x3b46eb61, 0x3b5b518d, 0x3b70f18d, 0x3b83e1c6, 0x3b8fe616, 0x3b9c87fd,
-    0x3ba9c9b5, 0x3bb7ad6f, 0x3bc63549, 0x3bd5635f, 0x3be539c1, 0x3bf5ba70, 0x3c0373b5, 0x3c0c6152,
-    0x3c15a703, 0x3c1f45be, 0x3c293e6b, 0x3c3391f7, 0x3c3e4149, 0x3c494d43, 0x3c54b6c7, 0x3c607eb1,
-    0x3c6ca5dc, 0x3c792d22, 0x3c830aa8, 0x3c89af9f, 0x3c9085db, 0x3c978dc5, 0x3c9ec7c0, 0x3ca63431,
-    0x3cadd37d, 0x3cb5a601, 0x3cbdac20, 0x3cc5e639, 0x3cce54ab, 0x3cd6f7d3, 0x3cdfd00e, 0x3ce8ddb9,
-    0x3cf22131, 0x3cfb9ac6, 0x3d02a56c, 0x3d0798df, 0x3d0ca7e7, 0x3d11d2b0, 0x3d171965, 0x3d1c7c31,
-    0x3d21fb3c, 0x3d2796b2, 0x3d2d4ebe, 0x3d332384, 0x3d39152e, 0x3d3f23e6, 0x3d454fd4, 0x3d4b991f,
-    0x3d51ffef, 0x3d58846a, 0x3d5f26b7, 0x3d65e6fe, 0x3d6cc564, 0x3d73c20f, 0x3d7add25, 0x3d810b66,
-    0x3d84b795, 0x3d887330, 0x3d8c3e4a, 0x3d9018f6, 0x3d940345, 0x3d97fd4a, 0x3d9c0716, 0x3da020bb,
-    0x3da44a4b, 0x3da883d7, 0x3daccd70, 0x3db12728, 0x3db59110, 0x3dba0b38, 0x3dbe95b5, 0x3dc33092,
-    0x3dc7dbe2, 0x3dcc97b6, 0x3dd1641f, 0x3dd6412c, 0x3ddb2eef, 0x3de02d77, 0x3de53cd5, 0x3dea5d19,
-    0x3def8e55, 0x3df4d093, 0x3dfa23e8, 0x3dff8861, 0x3e027f07, 0x3e054282, 0x3e080ea5, 0x3e0ae379,
-    0x3e0dc107, 0x3e10a755, 0x3e13966c, 0x3e168e53, 0x3e198f11, 0x3e1c98ae, 0x3e1fab32, 0x3e22c6a3,
-    0x3e25eb09, 0x3e29186c, 0x3e2c4ed2, 0x3e2f8e45, 0x3e32d6c8, 0x3e362865, 0x3e398322, 0x3e3ce706,
-    0x3e405419, 0x3e43ca62, 0x3e4749e8, 0x3e4ad2b1, 0x3e4e64c6, 0x3e52002b, 0x3e55a4e9, 0x3e595307,
-    0x3e5d0a8b, 0x3e60cb7c, 0x3e6495e0, 0x3e6869bf, 0x3e6c4720, 0x3e702e08, 0x3e741e7f, 0x3e78188c,
-    0x3e7c1c38, 0x3e8014c2, 0x3e82203c, 0x3e84308d, 0x3e8645ba, 0x3e885fc5, 0x3e8a7eb2, 0x3e8ca283,
-    0x3e8ecb3d, 0x3e90f8e1, 0x3e932b74, 0x3e9562f8, 0x3e979f71, 0x3e99e0e2, 0x3e9c274e, 0x3e9e72b7,
-    0x3ea0c322, 0x3ea31892, 0x3ea57308, 0x3ea7d289, 0x3eaa3718, 0x3eaca0b7, 0x3eaf0f69, 0x3eb18333,
-    0x3eb3fc16, 0x3eb67a15, 0x3eb8fd34, 0x3ebb8576, 0x3ebe12e1, 0x3ec0a571, 0x3ec33d2d, 0x3ec5da17,
-    0x3ec87c33, 0x3ecb2383, 0x3ecdd00b, 0x3ed081cd, 0x3ed338cc, 0x3ed5f50b, 0x3ed8b68d, 0x3edb7d54,
-    0x3ede4965, 0x3ee11ac1, 0x3ee3f16b, 0x3ee6cd67, 0x3ee9aeb6, 0x3eec955d, 0x3eef815d, 0x3ef272ba,
-    0x3ef56976, 0x3ef86594, 0x3efb6717, 0x3efe6e02, 0x3f00bd2b, 0x3f02460c, 0x3f03d1a5, 0x3f055ff8,
-    0x3f06f106, 0x3f0884cf, 0x3f0a1b57, 0x3f0bb49d, 0x3f0d50a2, 0x3f0eef69, 0x3f1090f2, 0x3f123540,
-    0x3f13dc53, 0x3f15862d, 0x3f1732cf, 0x3f18e23b, 0x3f1a9471, 0x3f1c4973, 0x3f1e0143, 0x3f1fbbe1,
-    0x3f217950, 0x3f23398f, 0x3f24fca2, 0x3f26c288, 0x3f288b43, 0x3f2a56d5, 0x3f2c253f, 0x3f2df681,
-    0x3f2fca9e, 0x3f31a197, 0x3f337b6c, 0x3f355820, 0x3f3737b3, 0x3f391a26, 0x3f3aff7e, 0x3f3ce7b7,
-    0x3f3ed2d4, 0x3f40c0d6, 0x3f42b1c0, 0x3f44a592, 0x3f469c4d, 0x3f4895f3, 0x3f4a9284, 0x3f4c9203,
-    0x3f4e9470, 0x3f5099cd, 0x3f52a21a, 0x3f54ad59, 0x3f56bb8c, 0x3f58ccb3, 0x3f5ae0cf, 0x3f5cf7e2,
-    0x3f5f11ee, 0x3f612ef2, 0x3f634eef, 0x3f6571ec, 0x3f6797e1, 0x3f69c0d8, 0x3f6beccb, 0x3f6e1bc2,
-    0x3f704db6, 0x3f7282b1, 0x3f74baae, 0x3f76f5b3, 0x3f7933b9, 0x3f7b74cb, 0x3f7db8e0, 0x3f800000,
-};
-
-// order must match SWR_FORMAT
-const SWR_FORMAT_INFO gFormatInfo[] = {
-
-    // R32G32B32A32_FLOAT (0x0)
-    {
-        "R32G32B32A32_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {32, 32, 32, 32},             // Bits per component
-        128,                          // Bits per element
-        16,                           // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32G32B32A32_SINT (0x1)
-    {
-        "R32G32B32A32_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {32, 32, 32, 32},             // Bits per component
-        128,                          // Bits per element
-        16,                           // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32G32B32A32_UINT (0x2)
-    {
-        "R32G32B32A32_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {32, 32, 32, 32},             // Bits per component
-        128,                          // Bits per element
-        16,                           // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x3)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x4)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R64G64_FLOAT (0x5)
-    {
-        "R64G64_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {64, 64, 0, 0},               // Bits per component
-        128,                          // Bits per element
-        16,                           // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32G32B32X32_FLOAT (0x6)
-    {
-        "R32G32B32X32_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {32, 32, 32, 32},             // Bits per component
-        128,                          // Bits per element
-        16,                           // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32G32B32A32_SSCALED (0x7)
-    {
-        "R32G32B32A32_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {32, 32, 32, 32},             // Bits per component
-        128,                          // Bits per element
-        16,                           // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32G32B32A32_USCALED (0x8)
-    {
-        "R32G32B32A32_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {32, 32, 32, 32},             // Bits per component
-        128,                          // Bits per element
-        16,                           // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x9)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xA)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xB)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xC)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xD)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xE)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xF)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x10)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x11)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x12)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x13)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x14)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x15)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x16)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x17)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x18)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x19)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1A)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R32G32B32A32_SFIXED (0x20)
-    {
-        "R32G32B32A32_SFIXED",
-        {SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {32, 32, 32, 32},             // Bits per component
-        128,                          // Bits per element
-        16,                           // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x21)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x22)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x23)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x24)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x25)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x26)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x27)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x28)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x29)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x2A)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x2B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x2C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x2D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x2E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x2F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x30)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x31)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x32)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x33)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x34)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x35)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x36)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x37)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x38)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x39)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x3A)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x3B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x3C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x3D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x3E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x3F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R32G32B32_FLOAT (0x40)
-    {
-        "R32G32B32_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {32, 32, 32, 0},              // Bits per component
-        96,                           // Bits per element
-        12,                           // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32G32B32_SINT (0x41)
-    {
-        "R32G32B32_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {32, 32, 32, 0},              // Bits per component
-        96,                           // Bits per element
-        12,                           // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32G32B32_UINT (0x42)
-    {
-        "R32G32B32_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {32, 32, 32, 0},              // Bits per component
-        96,                           // Bits per element
-        12,                           // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x43)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x44)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R32G32B32_SSCALED (0x45)
-    {
-        "R32G32B32_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {32, 32, 32, 0},              // Bits per component
-        96,                           // Bits per element
-        12,                           // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32G32B32_USCALED (0x46)
-    {
-        "R32G32B32_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {32, 32, 32, 0},              // Bits per component
-        96,                           // Bits per element
-        12,                           // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x47)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x48)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x49)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x4A)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x4B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x4C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x4D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x4E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x4F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R32G32B32_SFIXED (0x50)
-    {
-        "R32G32B32_SFIXED",
-        {SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {32, 32, 32, 0},              // Bits per component
-        96,                           // Bits per element
-        12,                           // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x51)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x52)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x53)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x54)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x55)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x56)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x57)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x58)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x59)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x5A)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x5B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x5C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x5D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x5E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x5F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x60)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x61)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x62)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x63)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x64)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x65)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x66)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x67)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x68)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x69)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x6A)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x6B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x6C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x6D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x6E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x6F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x70)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x71)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x72)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x73)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x74)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x75)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x76)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x77)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x78)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x79)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x7A)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x7B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x7C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x7D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x7E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x7F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R16G16B16A16_UNORM (0x80)
-    {
-        "R16G16B16A16_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},    // Defaults for missing components
-        {0, 1, 2, 3},             // Swizzle
-        {16, 16, 16, 16},         // Bits per component
-        64,                       // Bits per element
-        8,                        // Bytes per element
-        4,                        // Num components
-        false,                    // isSRGB
-        false,                    // isBC
-        false,                    // isSubsampled
-        false,                    // isLuminance
-        {true, true, true, true}, // Is normalized?
-        {1.0f / 65535.0f,
-         1.0f / 65535.0f,
-         1.0f / 65535.0f,
-         1.0f / 65535.0f}, // To float scale factor
-        1,                 // bcWidth
-        1,                 // bcHeight
-    },
-
-    // R16G16B16A16_SNORM (0x81)
-    {
-        "R16G16B16A16_SNORM",
-        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM},
-        {0, 0, 0, 0x3f800000},    // Defaults for missing components
-        {0, 1, 2, 3},             // Swizzle
-        {16, 16, 16, 16},         // Bits per component
-        64,                       // Bits per element
-        8,                        // Bytes per element
-        4,                        // Num components
-        false,                    // isSRGB
-        false,                    // isBC
-        false,                    // isSubsampled
-        false,                    // isLuminance
-        {true, true, true, true}, // Is normalized?
-        {1.0f / 32767.0f,
-         1.0f / 32767.0f,
-         1.0f / 32767.0f,
-         1.0f / 32767.0f}, // To float scale factor
-        1,                 // bcWidth
-        1,                 // bcHeight
-    },
-
-    // R16G16B16A16_SINT (0x82)
-    {
-        "R16G16B16A16_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {16, 16, 16, 16},             // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16G16B16A16_UINT (0x83)
-    {
-        "R16G16B16A16_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {16, 16, 16, 16},             // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16G16B16A16_FLOAT (0x84)
-    {
-        "R16G16B16A16_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {16, 16, 16, 16},             // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32G32_FLOAT (0x85)
-    {
-        "R32G32_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {32, 32, 0, 0},               // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32G32_SINT (0x86)
-    {
-        "R32G32_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {32, 32, 0, 0},               // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32G32_UINT (0x87)
-    {
-        "R32G32_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {32, 32, 0, 0},               // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32_FLOAT_X8X24_TYPELESS (0x88)
-    {
-        "R32_FLOAT_X8X24_TYPELESS",
-        {SWR_TYPE_FLOAT, SWR_TYPE_UNUSED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {32, 32, 0, 0},               // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // X32_TYPELESS_G8X24_UINT (0x89)
-    {
-        "X32_TYPELESS_G8X24_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UNUSED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {32, 32, 0, 0},               // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // L32A32_FLOAT (0x8A)
-    {
-        "L32A32_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 3, 0, 0},                 // Swizzle
-        {32, 32, 0, 0},               // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x8B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x8C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R64_FLOAT (0x8D)
-    {
-        "R64_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {64, 0, 0, 0},                // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16G16B16X16_UNORM (0x8E)
-    {
-        "R16G16B16X16_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
-        {0, 0, 0, 0x3f800000},     // Defaults for missing components
-        {0, 1, 2, 3},              // Swizzle
-        {16, 16, 16, 16},          // Bits per component
-        64,                        // Bits per element
-        8,                         // Bytes per element
-        4,                         // Num components
-        false,                     // isSRGB
-        false,                     // isBC
-        false,                     // isSubsampled
-        false,                     // isLuminance
-        {true, true, true, false}, // Is normalized?
-        {1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f}, // To float scale factor
-        1,                                                         // bcWidth
-        1,                                                         // bcHeight
-    },
-
-    // R16G16B16X16_FLOAT (0x8F)
-    {
-        "R16G16B16X16_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {16, 16, 16, 16},             // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x90)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // L32X32_FLOAT (0x91)
-    {
-        "L32X32_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 3, 0, 0},                 // Swizzle
-        {32, 32, 0, 0},               // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // I32X32_FLOAT (0x92)
-    {
-        "I32X32_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 3, 0, 0},                 // Swizzle
-        {32, 32, 0, 0},               // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16G16B16A16_SSCALED (0x93)
-    {
-        "R16G16B16A16_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {16, 16, 16, 16},             // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16G16B16A16_USCALED (0x94)
-    {
-        "R16G16B16A16_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {16, 16, 16, 16},             // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32G32_SSCALED (0x95)
-    {
-        "R32G32_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {32, 32, 0, 0},               // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32G32_USCALED (0x96)
-    {
-        "R32G32_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {32, 32, 0, 0},               // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x97)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x98)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x99)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x9A)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x9B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x9C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x9D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x9E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x9F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R32G32_SFIXED (0xA0)
-    {
-        "R32G32_SFIXED",
-        {SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {32, 32, 0, 0},               // Bits per component
-        64,                           // Bits per element
-        8,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0xA1)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xA2)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xA3)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xA4)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xA5)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xA6)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xA7)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xA8)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xA9)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xAA)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xAB)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xAC)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xAD)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xAE)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xAF)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xB0)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xB1)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xB2)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xB3)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xB4)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xB5)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xB6)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xB7)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xB8)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xB9)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xBA)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xBB)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xBC)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xBD)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xBE)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xBF)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // B8G8R8A8_UNORM (0xC0)
-    {
-        "B8G8R8A8_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},    // Defaults for missing components
-        {2, 1, 0, 3},             // Swizzle
-        {8, 8, 8, 8},             // Bits per component
-        32,                       // Bits per element
-        4,                        // Bytes per element
-        4,                        // Num components
-        false,                    // isSRGB
-        false,                    // isBC
-        false,                    // isSubsampled
-        false,                    // isLuminance
-        {true, true, true, true}, // Is normalized?
-        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor
-        1,                                                            // bcWidth
-        1,                                                            // bcHeight
-    },
-
-    // B8G8R8A8_UNORM_SRGB (0xC1)
-    {
-        "B8G8R8A8_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},    // Defaults for missing components
-        {2, 1, 0, 3},             // Swizzle
-        {8, 8, 8, 8},             // Bits per component
-        32,                       // Bits per element
-        4,                        // Bytes per element
-        4,                        // Num components
-        true,                     // isSRGB
-        false,                    // isBC
-        false,                    // isSubsampled
-        false,                    // isLuminance
-        {true, true, true, true}, // Is normalized?
-        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor
-        1,                                                            // bcWidth
-        1,                                                            // bcHeight
-    },
-
-    // R10G10B10A2_UNORM (0xC2)
-    {
-        "R10G10B10A2_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},    // Defaults for missing components
-        {0, 1, 2, 3},             // Swizzle
-        {10, 10, 10, 2},          // Bits per component
-        32,                       // Bits per element
-        4,                        // Bytes per element
-        4,                        // Num components
-        false,                    // isSRGB
-        false,                    // isBC
-        false,                    // isSubsampled
-        false,                    // isLuminance
-        {true, true, true, true}, // Is normalized?
-        {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor
-        1,                                                             // bcWidth
-        1,                                                             // bcHeight
-    },
-
-    // R10G10B10A2_UNORM_SRGB (0xC3)
-    {
-        "R10G10B10A2_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},    // Defaults for missing components
-        {0, 1, 2, 3},             // Swizzle
-        {10, 10, 10, 2},          // Bits per component
-        32,                       // Bits per element
-        4,                        // Bytes per element
-        4,                        // Num components
-        true,                     // isSRGB
-        false,                    // isBC
-        false,                    // isSubsampled
-        false,                    // isLuminance
-        {true, true, true, true}, // Is normalized?
-        {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor
-        1,                                                             // bcWidth
-        1,                                                             // bcHeight
-    },
-
-    // R10G10B10A2_UINT (0xC4)
-    {
-        "R10G10B10A2_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {10, 10, 10, 2},              // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0xC5)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xC6)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R8G8B8A8_UNORM (0xC7)
-    {
-        "R8G8B8A8_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},    // Defaults for missing components
-        {0, 1, 2, 3},             // Swizzle
-        {8, 8, 8, 8},             // Bits per component
-        32,                       // Bits per element
-        4,                        // Bytes per element
-        4,                        // Num components
-        false,                    // isSRGB
-        false,                    // isBC
-        false,                    // isSubsampled
-        false,                    // isLuminance
-        {true, true, true, true}, // Is normalized?
-        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor
-        1,                                                            // bcWidth
-        1,                                                            // bcHeight
-    },
-
-    // R8G8B8A8_UNORM_SRGB (0xC8)
-    {
-        "R8G8B8A8_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},    // Defaults for missing components
-        {0, 1, 2, 3},             // Swizzle
-        {8, 8, 8, 8},             // Bits per component
-        32,                       // Bits per element
-        4,                        // Bytes per element
-        4,                        // Num components
-        true,                     // isSRGB
-        false,                    // isBC
-        false,                    // isSubsampled
-        false,                    // isLuminance
-        {true, true, true, true}, // Is normalized?
-        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor
-        1,                                                            // bcWidth
-        1,                                                            // bcHeight
-    },
-
-    // R8G8B8A8_SNORM (0xC9)
-    {
-        "R8G8B8A8_SNORM",
-        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM},
-        {0, 0, 0, 0x3f800000},    // Defaults for missing components
-        {0, 1, 2, 3},             // Swizzle
-        {8, 8, 8, 8},             // Bits per component
-        32,                       // Bits per element
-        4,                        // Bytes per element
-        4,                        // Num components
-        false,                    // isSRGB
-        false,                    // isBC
-        false,                    // isSubsampled
-        false,                    // isLuminance
-        {true, true, true, true}, // Is normalized?
-        {1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f}, // To float scale factor
-        1,                                                            // bcWidth
-        1,                                                            // bcHeight
-    },
-
-    // R8G8B8A8_SINT (0xCA)
-    {
-        "R8G8B8A8_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {8, 8, 8, 8},                 // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R8G8B8A8_UINT (0xCB)
-    {
-        "R8G8B8A8_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {8, 8, 8, 8},                 // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16G16_UNORM (0xCC)
-    {
-        "R16G16_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                    // Defaults for missing components
-        {0, 1, 0, 0},                             // Swizzle
-        {16, 16, 0, 0},                           // Bits per component
-        32,                                       // Bits per element
-        4,                                        // Bytes per element
-        2,                                        // Num components
-        false,                                    // isSRGB
-        false,                                    // isBC
-        false,                                    // isSubsampled
-        false,                                    // isLuminance
-        {true, true, false, false},               // Is normalized?
-        {1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0}, // To float scale factor
-        1,                                        // bcWidth
-        1,                                        // bcHeight
-    },
-
-    // R16G16_SNORM (0xCD)
-    {
-        "R16G16_SNORM",
-        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                    // Defaults for missing components
-        {0, 1, 0, 0},                             // Swizzle
-        {16, 16, 0, 0},                           // Bits per component
-        32,                                       // Bits per element
-        4,                                        // Bytes per element
-        2,                                        // Num components
-        false,                                    // isSRGB
-        false,                                    // isBC
-        false,                                    // isSubsampled
-        false,                                    // isLuminance
-        {true, true, false, false},               // Is normalized?
-        {1.0f / 32767.0f, 1.0f / 32767.0f, 0, 0}, // To float scale factor
-        1,                                        // bcWidth
-        1,                                        // bcHeight
-    },
-
-    // R16G16_SINT (0xCE)
-    {
-        "R16G16_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {16, 16, 0, 0},               // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16G16_UINT (0xCF)
-    {
-        "R16G16_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {16, 16, 0, 0},               // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16G16_FLOAT (0xD0)
-    {
-        "R16G16_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {16, 16, 0, 0},               // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // B10G10R10A2_UNORM (0xD1)
-    {
-        "B10G10R10A2_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},    // Defaults for missing components
-        {2, 1, 0, 3},             // Swizzle
-        {10, 10, 10, 2},          // Bits per component
-        32,                       // Bits per element
-        4,                        // Bytes per element
-        4,                        // Num components
-        false,                    // isSRGB
-        false,                    // isBC
-        false,                    // isSubsampled
-        false,                    // isLuminance
-        {true, true, true, true}, // Is normalized?
-        {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor
-        1,                                                             // bcWidth
-        1,                                                             // bcHeight
-    },
-
-    // B10G10R10A2_UNORM_SRGB (0xD2)
-    {
-        "B10G10R10A2_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},    // Defaults for missing components
-        {2, 1, 0, 3},             // Swizzle
-        {10, 10, 10, 2},          // Bits per component
-        32,                       // Bits per element
-        4,                        // Bytes per element
-        4,                        // Num components
-        true,                     // isSRGB
-        false,                    // isBC
-        false,                    // isSubsampled
-        false,                    // isLuminance
-        {true, true, true, true}, // Is normalized?
-        {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor
-        1,                                                             // bcWidth
-        1,                                                             // bcHeight
-    },
-
-    // R11G11B10_FLOAT (0xD3)
-    {
-        "R11G11B10_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {11, 11, 10, 0},              // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0xD4)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-
-    // R10G10B10_FLOAT_A2_UNORM (0xD5)
-    {
-        "R10G10B10_FLOAT_A2_UNORM",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},           // Defaults for missing components
-        {0, 1, 2, 3},                    // Swizzle
-        {10, 10, 10, 2},                 // Bits per component
-        32,                              // Bits per element
-        4,                               // Bytes per element
-        4,                               // Num components
-        false,                           // isSRGB
-        false,                           // isBC
-        false,                           // isSubsampled
-        false,                           // isLuminance
-        {false, false, false, false},    // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f / 3.0f}, // To float scale factor
-        1,                               // bcWidth
-        1,                               // bcHeight
-    },
-
-    // R32_SINT (0xD6)
-    {
-        "R32_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {32, 0, 0, 0},                // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32_UINT (0xD7)
-    {
-        "R32_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {32, 0, 0, 0},                // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32_FLOAT (0xD8)
-    {
-        "R32_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {32, 0, 0, 0},                // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R24_UNORM_X8_TYPELESS (0xD9)
-    {
-        "R24_UNORM_X8_TYPELESS",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},         // Defaults for missing components
-        {0, 1, 2, 3},                  // Swizzle
-        {24, 0, 0, 0},                 // Bits per component
-        32,                            // Bits per element
-        4,                             // Bytes per element
-        1,                             // Num components
-        false,                         // isSRGB
-        false,                         // isBC
-        false,                         // isSubsampled
-        false,                         // isLuminance
-        {true, false, false, false},   // Is normalized?
-        {1.0f / 16777215.0f, 0, 0, 0}, // To float scale factor
-        1,                             // bcWidth
-        1,                             // bcHeight
-    },
-
-    // X24_TYPELESS_G8_UINT (0xDA)
-    {
-        "X24_TYPELESS_G8_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {1, 0, 0, 0},                 // Swizzle
-        {32, 0, 0, 0},                // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0xDB)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xDC)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // L32_UNORM (0xDD)
-    {
-        "L32_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},           // Defaults for missing components
-        {0, 0, 0, 0},                    // Swizzle
-        {32, 0, 0, 0},                   // Bits per component
-        32,                              // Bits per element
-        4,                               // Bytes per element
-        1,                               // Num components
-        false,                           // isSRGB
-        false,                           // isBC
-        false,                           // isSubsampled
-        true,                            // isLuminance
-        {true, false, false, false},     // Is normalized?
-        {1.0f / 4294967295.0f, 0, 0, 0}, // To float scale factor
-        1,                               // bcWidth
-        1,                               // bcHeight
-    },
-
-    // padding (0xDE)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // L16A16_UNORM (0xDF)
-    {
-        "L16A16_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                    // Defaults for missing components
-        {0, 3, 0, 0},                             // Swizzle
-        {16, 16, 0, 0},                           // Bits per component
-        32,                                       // Bits per element
-        4,                                        // Bytes per element
-        2,                                        // Num components
-        false,                                    // isSRGB
-        false,                                    // isBC
-        false,                                    // isSubsampled
-        true,                                     // isLuminance
-        {true, true, false, false},               // Is normalized?
-        {1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0}, // To float scale factor
-        1,                                        // bcWidth
-        1,                                        // bcHeight
-    },
-
-    // I24X8_UNORM (0xE0)
-    {
-        "I24X8_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                     // Defaults for missing components
-        {0, 3, 0, 0},                              // Swizzle
-        {24, 8, 0, 0},                             // Bits per component
-        32,                                        // Bits per element
-        4,                                         // Bytes per element
-        2,                                         // Num components
-        false,                                     // isSRGB
-        false,                                     // isBC
-        false,                                     // isSubsampled
-        true,                                      // isLuminance
-        {true, true, false, false},                // Is normalized?
-        {1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor
-        1,                                         // bcWidth
-        1,                                         // bcHeight
-    },
-
-    // L24X8_UNORM (0xE1)
-    {
-        "L24X8_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                     // Defaults for missing components
-        {0, 3, 0, 0},                              // Swizzle
-        {24, 8, 0, 0},                             // Bits per component
-        32,                                        // Bits per element
-        4,                                         // Bytes per element
-        2,                                         // Num components
-        false,                                     // isSRGB
-        false,                                     // isBC
-        false,                                     // isSubsampled
-        true,                                      // isLuminance
-        {true, true, false, false},                // Is normalized?
-        {1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor
-        1,                                         // bcWidth
-        1,                                         // bcHeight
-    },
-
-    // padding (0xE2)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // I32_FLOAT (0xE3)
-    {
-        "I32_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {32, 0, 0, 0},                // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // L32_FLOAT (0xE4)
-    {
-        "L32_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {32, 0, 0, 0},                // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // A32_FLOAT (0xE5)
-    {
-        "A32_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {3, 0, 0, 0},                 // Swizzle
-        {32, 0, 0, 0},                // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0xE6)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xE7)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xE8)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // B8G8R8X8_UNORM (0xE9)
-    {
-        "B8G8R8X8_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
-        {0, 0, 0, 0x3f800000},                               // Defaults for missing components
-        {2, 1, 0, 3},                                        // Swizzle
-        {8, 8, 8, 8},                                        // Bits per component
-        32,                                                  // Bits per element
-        4,                                                   // Bytes per element
-        4,                                                   // Num components
-        false,                                               // isSRGB
-        false,                                               // isBC
-        false,                                               // isSubsampled
-        false,                                               // isLuminance
-        {true, true, true, false},                           // Is normalized?
-        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor
-        1,                                                   // bcWidth
-        1,                                                   // bcHeight
-    },
-
-    // B8G8R8X8_UNORM_SRGB (0xEA)
-    {
-        "B8G8R8X8_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
-        {0, 0, 0, 0x3f800000},                               // Defaults for missing components
-        {2, 1, 0, 3},                                        // Swizzle
-        {8, 8, 8, 8},                                        // Bits per component
-        32,                                                  // Bits per element
-        4,                                                   // Bytes per element
-        4,                                                   // Num components
-        true,                                                // isSRGB
-        false,                                               // isBC
-        false,                                               // isSubsampled
-        false,                                               // isLuminance
-        {true, true, true, false},                           // Is normalized?
-        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor
-        1,                                                   // bcWidth
-        1,                                                   // bcHeight
-    },
-
-    // R8G8B8X8_UNORM (0xEB)
-    {
-        "R8G8B8X8_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
-        {0, 0, 0, 0x3f800000},                               // Defaults for missing components
-        {0, 1, 2, 3},                                        // Swizzle
-        {8, 8, 8, 8},                                        // Bits per component
-        32,                                                  // Bits per element
-        4,                                                   // Bytes per element
-        4,                                                   // Num components
-        false,                                               // isSRGB
-        false,                                               // isBC
-        false,                                               // isSubsampled
-        false,                                               // isLuminance
-        {true, true, true, false},                           // Is normalized?
-        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor
-        1,                                                   // bcWidth
-        1,                                                   // bcHeight
-    },
-
-    // R8G8B8X8_UNORM_SRGB (0xEC)
-    {
-        "R8G8B8X8_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
-        {0, 0, 0, 0x3f800000},                               // Defaults for missing components
-        {0, 1, 2, 3},                                        // Swizzle
-        {8, 8, 8, 8},                                        // Bits per component
-        32,                                                  // Bits per element
-        4,                                                   // Bytes per element
-        4,                                                   // Num components
-        true,                                                // isSRGB
-        false,                                               // isBC
-        false,                                               // isSubsampled
-        false,                                               // isLuminance
-        {true, true, true, false},                           // Is normalized?
-        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor
-        1,                                                   // bcWidth
-        1,                                                   // bcHeight
-    },
-
-    // R9G9B9E5_SHAREDEXP (0xED)
-    {
-        "R9G9B9E5_SHAREDEXP",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {9, 9, 9, 5},                 // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // B10G10R10X2_UNORM (0xEE)
-    {
-        "B10G10R10X2_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
-        {0, 0, 0, 0x3f800000},                                  // Defaults for missing components
-        {2, 1, 0, 3},                                           // Swizzle
-        {10, 10, 10, 2},                                        // Bits per component
-        32,                                                     // Bits per element
-        4,                                                      // Bytes per element
-        4,                                                      // Num components
-        false,                                                  // isSRGB
-        false,                                                  // isBC
-        false,                                                  // isSubsampled
-        false,                                                  // isLuminance
-        {true, true, true, false},                              // Is normalized?
-        {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f}, // To float scale factor
-        1,                                                      // bcWidth
-        1,                                                      // bcHeight
-    },
-
-    // padding (0xEF)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // L16A16_FLOAT (0xF0)
-    {
-        "L16A16_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 3, 0, 0},                 // Swizzle
-        {16, 16, 0, 0},               // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0xF1)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xF2)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R10G10B10X2_USCALED (0xF3)
-    {
-        "R10G10B10X2_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNUSED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {10, 10, 10, 2},              // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R8G8B8A8_SSCALED (0xF4)
-    {
-        "R8G8B8A8_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {8, 8, 8, 8},                 // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R8G8B8A8_USCALED (0xF5)
-    {
-        "R8G8B8A8_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {8, 8, 8, 8},                 // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16G16_SSCALED (0xF6)
-    {
-        "R16G16_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {16, 16, 0, 0},               // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16G16_USCALED (0xF7)
-    {
-        "R16G16_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {16, 16, 0, 0},               // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32_SSCALED (0xF8)
-    {
-        "R32_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {32, 0, 0, 0},                // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32_USCALED (0xF9)
-    {
-        "R32_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {32, 0, 0, 0},                // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0xFA)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xFB)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xFC)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xFD)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xFE)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0xFF)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // B5G6R5_UNORM (0x100)
-    {
-        "B5G6R5_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                         // Defaults for missing components
-        {2, 1, 0, 0},                                  // Swizzle
-        {5, 6, 5, 0},                                  // Bits per component
-        16,                                            // Bits per element
-        2,                                             // Bytes per element
-        3,                                             // Num components
-        false,                                         // isSRGB
-        false,                                         // isBC
-        false,                                         // isSubsampled
-        false,                                         // isLuminance
-        {true, true, true, false},                     // Is normalized?
-        {1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0}, // To float scale factor
-        1,                                             // bcWidth
-        1,                                             // bcHeight
-    },
-
-    // B5G6R5_UNORM_SRGB (0x101)
-    {
-        "B5G6R5_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                         // Defaults for missing components
-        {2, 1, 0, 0},                                  // Swizzle
-        {5, 6, 5, 0},                                  // Bits per component
-        16,                                            // Bits per element
-        2,                                             // Bytes per element
-        3,                                             // Num components
-        true,                                          // isSRGB
-        false,                                         // isBC
-        false,                                         // isSubsampled
-        false,                                         // isLuminance
-        {true, true, true, false},                     // Is normalized?
-        {1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0}, // To float scale factor
-        1,                                             // bcWidth
-        1,                                             // bcHeight
-    },
-
-    // B5G5R5A1_UNORM (0x102)
-    {
-        "B5G5R5A1_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},                                   // Defaults for missing components
-        {2, 1, 0, 3},                                            // Swizzle
-        {5, 5, 5, 1},                                            // Bits per component
-        16,                                                      // Bits per element
-        2,                                                       // Bytes per element
-        4,                                                       // Num components
-        false,                                                   // isSRGB
-        false,                                                   // isBC
-        false,                                                   // isSubsampled
-        false,                                                   // isLuminance
-        {true, true, true, true},                                // Is normalized?
-        {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f}, // To float scale factor
-        1,                                                       // bcWidth
-        1,                                                       // bcHeight
-    },
-
-    // B5G5R5A1_UNORM_SRGB (0x103)
-    {
-        "B5G5R5A1_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},                                   // Defaults for missing components
-        {2, 1, 0, 3},                                            // Swizzle
-        {5, 5, 5, 1},                                            // Bits per component
-        16,                                                      // Bits per element
-        2,                                                       // Bytes per element
-        4,                                                       // Num components
-        true,                                                    // isSRGB
-        false,                                                   // isBC
-        false,                                                   // isSubsampled
-        false,                                                   // isLuminance
-        {true, true, true, true},                                // Is normalized?
-        {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f}, // To float scale factor
-        1,                                                       // bcWidth
-        1,                                                       // bcHeight
-    },
-
-    // B4G4R4A4_UNORM (0x104)
-    {
-        "B4G4R4A4_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},                                    // Defaults for missing components
-        {2, 1, 0, 3},                                             // Swizzle
-        {4, 4, 4, 4},                                             // Bits per component
-        16,                                                       // Bits per element
-        2,                                                        // Bytes per element
-        4,                                                        // Num components
-        false,                                                    // isSRGB
-        false,                                                    // isBC
-        false,                                                    // isSubsampled
-        false,                                                    // isLuminance
-        {true, true, true, true},                                 // Is normalized?
-        {1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f}, // To float scale factor
-        1,                                                        // bcWidth
-        1,                                                        // bcHeight
-    },
-
-    // B4G4R4A4_UNORM_SRGB (0x105)
-    {
-        "B4G4R4A4_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},                                    // Defaults for missing components
-        {2, 1, 0, 3},                                             // Swizzle
-        {4, 4, 4, 4},                                             // Bits per component
-        16,                                                       // Bits per element
-        2,                                                        // Bytes per element
-        4,                                                        // Num components
-        true,                                                     // isSRGB
-        false,                                                    // isBC
-        false,                                                    // isSubsampled
-        false,                                                    // isLuminance
-        {true, true, true, true},                                 // Is normalized?
-        {1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f}, // To float scale factor
-        1,                                                        // bcWidth
-        1,                                                        // bcHeight
-    },
-
-    // R8G8_UNORM (0x106)
-    {
-        "R8G8_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                // Defaults for missing components
-        {0, 1, 0, 0},                         // Swizzle
-        {8, 8, 0, 0},                         // Bits per component
-        16,                                   // Bits per element
-        2,                                    // Bytes per element
-        2,                                    // Num components
-        false,                                // isSRGB
-        false,                                // isBC
-        false,                                // isSubsampled
-        false,                                // isLuminance
-        {true, true, false, false},           // Is normalized?
-        {1.0f / 255.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor
-        1,                                    // bcWidth
-        1,                                    // bcHeight
-    },
-
-    // R8G8_SNORM (0x107)
-    {
-        "R8G8_SNORM",
-        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                // Defaults for missing components
-        {0, 1, 0, 0},                         // Swizzle
-        {8, 8, 0, 0},                         // Bits per component
-        16,                                   // Bits per element
-        2,                                    // Bytes per element
-        2,                                    // Num components
-        false,                                // isSRGB
-        false,                                // isBC
-        false,                                // isSubsampled
-        false,                                // isLuminance
-        {true, true, false, false},           // Is normalized?
-        {1.0f / 127.0f, 1.0f / 127.0f, 0, 0}, // To float scale factor
-        1,                                    // bcWidth
-        1,                                    // bcHeight
-    },
-
-    // R8G8_SINT (0x108)
-    {
-        "R8G8_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {8, 8, 0, 0},                 // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R8G8_UINT (0x109)
-    {
-        "R8G8_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {8, 8, 0, 0},                 // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16_UNORM (0x10A)
-    {
-        "R16_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 0, 0, 0},                // Swizzle
-        {16, 0, 0, 0},               // Bits per component
-        16,                          // Bits per element
-        2,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        false,                       // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 65535.0f, 0, 0, 0},  // To float scale factor
-        1,                           // bcWidth
-        1,                           // bcHeight
-    },
-
-    // R16_SNORM (0x10B)
-    {
-        "R16_SNORM",
-        {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 0, 0, 0},                // Swizzle
-        {16, 0, 0, 0},               // Bits per component
-        16,                          // Bits per element
-        2,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        false,                       // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 32767.0f, 0, 0, 0},  // To float scale factor
-        1,                           // bcWidth
-        1,                           // bcHeight
-    },
-
-    // R16_SINT (0x10C)
-    {
-        "R16_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {16, 0, 0, 0},                // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16_UINT (0x10D)
-    {
-        "R16_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {16, 0, 0, 0},                // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16_FLOAT (0x10E)
-    {
-        "R16_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {16, 0, 0, 0},                // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x10F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x110)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // I16_UNORM (0x111)
-    {
-        "I16_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 0, 0, 0},                // Swizzle
-        {16, 0, 0, 0},               // Bits per component
-        16,                          // Bits per element
-        2,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        false,                       // isBC
-        false,                       // isSubsampled
-        true,                        // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 65535.0f, 0, 0, 0},  // To float scale factor
-        1,                           // bcWidth
-        1,                           // bcHeight
-    },
-
-    // L16_UNORM (0x112)
-    {
-        "L16_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 0, 0, 0},                // Swizzle
-        {16, 0, 0, 0},               // Bits per component
-        16,                          // Bits per element
-        2,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        false,                       // isBC
-        false,                       // isSubsampled
-        true,                        // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 65535.0f, 0, 0, 0},  // To float scale factor
-        1,                           // bcWidth
-        1,                           // bcHeight
-    },
-
-    // A16_UNORM (0x113)
-    {
-        "A16_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {3, 0, 0, 0},                // Swizzle
-        {16, 0, 0, 0},               // Bits per component
-        16,                          // Bits per element
-        2,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        false,                       // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 65535.0f, 0, 0, 0},  // To float scale factor
-        1,                           // bcWidth
-        1,                           // bcHeight
-    },
-
-    // L8A8_UNORM (0x114)
-    {
-        "L8A8_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                // Defaults for missing components
-        {0, 3, 0, 0},                         // Swizzle
-        {8, 8, 0, 0},                         // Bits per component
-        16,                                   // Bits per element
-        2,                                    // Bytes per element
-        2,                                    // Num components
-        false,                                // isSRGB
-        false,                                // isBC
-        false,                                // isSubsampled
-        true,                                 // isLuminance
-        {true, true, false, false},           // Is normalized?
-        {1.0f / 255.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor
-        1,                                    // bcWidth
-        1,                                    // bcHeight
-    },
-
-    // I16_FLOAT (0x115)
-    {
-        "I16_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {16, 0, 0, 0},                // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // L16_FLOAT (0x116)
-    {
-        "L16_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {16, 0, 0, 0},                // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // A16_FLOAT (0x117)
-    {
-        "A16_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {3, 0, 0, 0},                 // Swizzle
-        {16, 0, 0, 0},                // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // L8A8_UNORM_SRGB (0x118)
-    {
-        "L8A8_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                // Defaults for missing components
-        {0, 3, 0, 0},                         // Swizzle
-        {8, 8, 0, 0},                         // Bits per component
-        16,                                   // Bits per element
-        2,                                    // Bytes per element
-        2,                                    // Num components
-        true,                                 // isSRGB
-        false,                                // isBC
-        false,                                // isSubsampled
-        true,                                 // isLuminance
-        {true, true, false, false},           // Is normalized?
-        {1.0f / 255.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor
-        1,                                    // bcWidth
-        1,                                    // bcHeight
-    },
-
-    // padding (0x119)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // B5G5R5X1_UNORM (0x11A)
-    {
-        "B5G5R5X1_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
-        {0, 0, 0, 0x3f800000},                            // Defaults for missing components
-        {2, 1, 0, 3},                                     // Swizzle
-        {5, 5, 5, 1},                                     // Bits per component
-        16,                                               // Bits per element
-        2,                                                // Bytes per element
-        4,                                                // Num components
-        false,                                            // isSRGB
-        false,                                            // isBC
-        false,                                            // isSubsampled
-        false,                                            // isLuminance
-        {true, true, true, false},                        // Is normalized?
-        {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f}, // To float scale factor
-        1,                                                // bcWidth
-        1,                                                // bcHeight
-    },
-
-    // B5G5R5X1_UNORM_SRGB (0x11B)
-    {
-        "B5G5R5X1_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
-        {0, 0, 0, 0x3f800000},                            // Defaults for missing components
-        {2, 1, 0, 3},                                     // Swizzle
-        {5, 5, 5, 1},                                     // Bits per component
-        16,                                               // Bits per element
-        2,                                                // Bytes per element
-        4,                                                // Num components
-        true,                                             // isSRGB
-        false,                                            // isBC
-        false,                                            // isSubsampled
-        false,                                            // isLuminance
-        {true, true, true, false},                        // Is normalized?
-        {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f}, // To float scale factor
-        1,                                                // bcWidth
-        1,                                                // bcHeight
-    },
-
-    // R8G8_SSCALED (0x11C)
-    {
-        "R8G8_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {8, 8, 0, 0},                 // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R8G8_USCALED (0x11D)
-    {
-        "R8G8_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 0, 0},                 // Swizzle
-        {8, 8, 0, 0},                 // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16_SSCALED (0x11E)
-    {
-        "R16_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {16, 0, 0, 0},                // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16_USCALED (0x11F)
-    {
-        "R16_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {16, 0, 0, 0},                // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x120)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x121)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x122)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x123)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // A1B5G5R5_UNORM (0x124)
-    {
-        "A1B5G5R5_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},                                   // Defaults for missing components
-        {3, 2, 1, 0},                                            // Swizzle
-        {1, 5, 5, 5},                                            // Bits per component
-        16,                                                      // Bits per element
-        2,                                                       // Bytes per element
-        4,                                                       // Num components
-        false,                                                   // isSRGB
-        false,                                                   // isBC
-        false,                                                   // isSubsampled
-        false,                                                   // isLuminance
-        {true, true, true, true},                                // Is normalized?
-        {1.0f / 1.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f}, // To float scale factor
-        1,                                                       // bcWidth
-        1,                                                       // bcHeight
-    },
-
-    // A4B4G4R4_UNORM (0x125)
-    {
-        "A4B4G4R4_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
-        {0, 0, 0, 0x3f800000},                                    // Defaults for missing components
-        {3, 2, 1, 0},                                             // Swizzle
-        {4, 4, 4, 4},                                             // Bits per component
-        16,                                                       // Bits per element
-        2,                                                        // Bytes per element
-        4,                                                        // Num components
-        false,                                                    // isSRGB
-        false,                                                    // isBC
-        false,                                                    // isSubsampled
-        false,                                                    // isLuminance
-        {true, true, true, true},                                 // Is normalized?
-        {1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f}, // To float scale factor
-        1,                                                        // bcWidth
-        1,                                                        // bcHeight
-    },
-
-    // L8A8_UINT (0x126)
-    {
-        "L8A8_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 3, 0, 0},                 // Swizzle
-        {8, 8, 0, 0},                 // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // L8A8_SINT (0x127)
-    {
-        "L8A8_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 3, 0, 0},                 // Swizzle
-        {8, 8, 0, 0},                 // Bits per component
-        16,                           // Bits per element
-        2,                            // Bytes per element
-        2,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 0, 0},           // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x128)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x129)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x12A)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x12B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x12C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x12D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x12E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x12F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x130)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x131)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x132)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x133)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x134)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x135)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x136)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x137)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x138)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x139)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x13A)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x13B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x13C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x13D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x13E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x13F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R8_UNORM (0x140)
-    {
-        "R8_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 0, 0, 0},                // Swizzle
-        {8, 0, 0, 0},                // Bits per component
-        8,                           // Bits per element
-        1,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        false,                       // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        1,                           // bcWidth
-        1,                           // bcHeight
-    },
-
-    // R8_SNORM (0x141)
-    {
-        "R8_SNORM",
-        {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 0, 0, 0},                // Swizzle
-        {8, 0, 0, 0},                // Bits per component
-        8,                           // Bits per element
-        1,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        false,                       // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 127.0f, 0, 0, 0},    // To float scale factor
-        1,                           // bcWidth
-        1,                           // bcHeight
-    },
-
-    // R8_SINT (0x142)
-    {
-        "R8_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {8, 0, 0, 0},                 // Bits per component
-        8,                            // Bits per element
-        1,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R8_UINT (0x143)
-    {
-        "R8_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {8, 0, 0, 0},                 // Bits per component
-        8,                            // Bits per element
-        1,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // A8_UNORM (0x144)
-    {
-        "A8_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {3, 0, 0, 0},                // Swizzle
-        {8, 0, 0, 0},                // Bits per component
-        8,                           // Bits per element
-        1,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        false,                       // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        1,                           // bcWidth
-        1,                           // bcHeight
-    },
-
-    // I8_UNORM (0x145)
-    {
-        "I8_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 0, 0, 0},                // Swizzle
-        {8, 0, 0, 0},                // Bits per component
-        8,                           // Bits per element
-        1,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        false,                       // isBC
-        false,                       // isSubsampled
-        true,                        // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        1,                           // bcWidth
-        1,                           // bcHeight
-    },
-
-    // L8_UNORM (0x146)
-    {
-        "L8_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 0, 0, 0},                // Swizzle
-        {8, 0, 0, 0},                // Bits per component
-        8,                           // Bits per element
-        1,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        false,                       // isBC
-        false,                       // isSubsampled
-        true,                        // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        1,                           // bcWidth
-        1,                           // bcHeight
-    },
-
-    // padding (0x147)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x148)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R8_SSCALED (0x149)
-    {
-        "R8_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {8, 0, 0, 0},                 // Bits per component
-        8,                            // Bits per element
-        1,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R8_USCALED (0x14A)
-    {
-        "R8_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {8, 0, 0, 0},                 // Bits per component
-        8,                            // Bits per element
-        1,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x14B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // L8_UNORM_SRGB (0x14C)
-    {
-        "L8_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 0, 0, 0},                // Swizzle
-        {8, 0, 0, 0},                // Bits per component
-        8,                           // Bits per element
-        1,                           // Bytes per element
-        1,                           // Num components
-        true,                        // isSRGB
-        false,                       // isBC
-        false,                       // isSubsampled
-        true,                        // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        1,                           // bcWidth
-        1,                           // bcHeight
-    },
-
-    // padding (0x14D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x14E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x14F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x150)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x151)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // L8_UINT (0x152)
-    {
-        "L8_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {8, 0, 0, 0},                 // Bits per component
-        8,                            // Bits per element
-        1,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // L8_SINT (0x153)
-    {
-        "L8_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {8, 0, 0, 0},                 // Bits per component
-        8,                            // Bits per element
-        1,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // I8_UINT (0x154)
-    {
-        "I8_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {8, 0, 0, 0},                 // Bits per component
-        8,                            // Bits per element
-        1,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // I8_SINT (0x155)
-    {
-        "I8_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {8, 0, 0, 0},                 // Bits per component
-        8,                            // Bits per element
-        1,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        true,                         // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x156)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x157)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x158)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x159)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x15A)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x15B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x15C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x15D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x15E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x15F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x160)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x161)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x162)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x163)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x164)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x165)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x166)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x167)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x168)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x169)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x16A)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x16B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x16C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x16D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x16E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x16F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x170)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x171)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x172)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x173)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x174)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x175)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x176)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x177)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x178)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x179)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x17A)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x17B)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x17C)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x17D)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x17E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x17F)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // DXT1_RGB_SRGB (0x180)
-    {
-        "DXT1_RGB_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        64,                          // Bits per element
-        8,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // padding (0x181)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x182)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // YCRCB_SWAPUVY (0x183)
-    {
-        "YCRCB_SWAPUVY",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {8, 8, 8, 8},                 // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        true,                         // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        2,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x184)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x185)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // BC1_UNORM (0x186)
-    {
-        "BC1_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        64,                          // Bits per element
-        8,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // BC2_UNORM (0x187)
-    {
-        "BC2_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        128,                         // Bits per element
-        16,                          // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // BC3_UNORM (0x188)
-    {
-        "BC3_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        128,                         // Bits per element
-        16,                          // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // BC4_UNORM (0x189)
-    {
-        "BC4_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        64,                          // Bits per element
-        8,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // BC5_UNORM (0x18A)
-    {
-        "BC5_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        128,                         // Bits per element
-        16,                          // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // BC1_UNORM_SRGB (0x18B)
-    {
-        "BC1_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        64,                          // Bits per element
-        8,                           // Bytes per element
-        1,                           // Num components
-        true,                        // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // BC2_UNORM_SRGB (0x18C)
-    {
-        "BC2_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        128,                         // Bits per element
-        16,                          // Bytes per element
-        1,                           // Num components
-        true,                        // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // BC3_UNORM_SRGB (0x18D)
-    {
-        "BC3_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        128,                         // Bits per element
-        16,                          // Bytes per element
-        1,                           // Num components
-        true,                        // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // padding (0x18E)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // YCRCB_SWAPUV (0x18F)
-    {
-        "YCRCB_SWAPUV",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {8, 8, 8, 8},                 // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        true,                         // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        2,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x190)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // DXT1_RGB (0x191)
-    {
-        "DXT1_RGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        64,                          // Bits per element
-        8,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // padding (0x192)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R8G8B8_UNORM (0x193)
-    {
-        "R8G8B8_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                            // Defaults for missing components
-        {0, 1, 2, 0},                                     // Swizzle
-        {8, 8, 8, 0},                                     // Bits per component
-        24,                                               // Bits per element
-        3,                                                // Bytes per element
-        3,                                                // Num components
-        false,                                            // isSRGB
-        false,                                            // isBC
-        false,                                            // isSubsampled
-        false,                                            // isLuminance
-        {true, true, true, false},                        // Is normalized?
-        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0}, // To float scale factor
-        1,                                                // bcWidth
-        1,                                                // bcHeight
-    },
-
-    // R8G8B8_SNORM (0x194)
-    {
-        "R8G8B8_SNORM",
-        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                            // Defaults for missing components
-        {0, 1, 2, 0},                                     // Swizzle
-        {8, 8, 8, 0},                                     // Bits per component
-        24,                                               // Bits per element
-        3,                                                // Bytes per element
-        3,                                                // Num components
-        false,                                            // isSRGB
-        false,                                            // isBC
-        false,                                            // isSubsampled
-        false,                                            // isLuminance
-        {true, true, true, false},                        // Is normalized?
-        {1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 0}, // To float scale factor
-        1,                                                // bcWidth
-        1,                                                // bcHeight
-    },
-
-    // R8G8B8_SSCALED (0x195)
-    {
-        "R8G8B8_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {8, 8, 8, 0},                 // Bits per component
-        24,                           // Bits per element
-        3,                            // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R8G8B8_USCALED (0x196)
-    {
-        "R8G8B8_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {8, 8, 8, 0},                 // Bits per component
-        24,                           // Bits per element
-        3,                            // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R64G64B64A64_FLOAT (0x197)
-    {
-        "R64G64B64A64_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {64, 64, 64, 64},             // Bits per component
-        256,                          // Bits per element
-        32,                           // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R64G64B64_FLOAT (0x198)
-    {
-        "R64G64B64_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {64, 64, 64, 0},              // Bits per component
-        192,                          // Bits per element
-        24,                           // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // BC4_SNORM (0x199)
-    {
-        "BC4_SNORM",
-        {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        64,                          // Bits per element
-        8,                           // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 127.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // BC5_SNORM (0x19A)
-    {
-        "BC5_SNORM",
-        {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        128,                         // Bits per element
-        16,                          // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 127.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // R16G16B16_FLOAT (0x19B)
-    {
-        "R16G16B16_FLOAT",
-        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {16, 16, 16, 0},              // Bits per component
-        48,                           // Bits per element
-        6,                            // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16G16B16_UNORM (0x19C)
-    {
-        "R16G16B16_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                                  // Defaults for missing components
-        {0, 1, 2, 0},                                           // Swizzle
-        {16, 16, 16, 0},                                        // Bits per component
-        48,                                                     // Bits per element
-        6,                                                      // Bytes per element
-        3,                                                      // Num components
-        false,                                                  // isSRGB
-        false,                                                  // isBC
-        false,                                                  // isSubsampled
-        false,                                                  // isLuminance
-        {true, true, true, false},                              // Is normalized?
-        {1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 0}, // To float scale factor
-        1,                                                      // bcWidth
-        1,                                                      // bcHeight
-    },
-
-    // R16G16B16_SNORM (0x19D)
-    {
-        "R16G16B16_SNORM",
-        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                                  // Defaults for missing components
-        {0, 1, 2, 0},                                           // Swizzle
-        {16, 16, 16, 0},                                        // Bits per component
-        48,                                                     // Bits per element
-        6,                                                      // Bytes per element
-        3,                                                      // Num components
-        false,                                                  // isSRGB
-        false,                                                  // isBC
-        false,                                                  // isSubsampled
-        false,                                                  // isLuminance
-        {true, true, true, false},                              // Is normalized?
-        {1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 0}, // To float scale factor
-        1,                                                      // bcWidth
-        1,                                                      // bcHeight
-    },
-
-    // R16G16B16_SSCALED (0x19E)
-    {
-        "R16G16B16_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {16, 16, 16, 0},              // Bits per component
-        48,                           // Bits per element
-        6,                            // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16G16B16_USCALED (0x19F)
-    {
-        "R16G16B16_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {16, 16, 16, 0},              // Bits per component
-        48,                           // Bits per element
-        6,                            // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x1A0)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // BC6H_SF16 (0x1A1)
-    {
-        "BC6H_SF16",
-        {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        128,                         // Bits per element
-        16,                          // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 127.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // BC7_UNORM (0x1A2)
-    {
-        "BC7_UNORM",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        128,                         // Bits per element
-        16,                          // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // BC7_UNORM_SRGB (0x1A3)
-    {
-        "BC7_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        128,                         // Bits per element
-        16,                          // Bytes per element
-        1,                           // Num components
-        true,                        // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // BC6H_UF16 (0x1A4)
-    {
-        "BC6H_UF16",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},       // Defaults for missing components
-        {0, 1, 2, 3},                // Swizzle
-        {8, 8, 8, 8},                // Bits per component
-        128,                         // Bits per element
-        16,                          // Bytes per element
-        1,                           // Num components
-        false,                       // isSRGB
-        true,                        // isBC
-        false,                       // isSubsampled
-        false,                       // isLuminance
-        {true, false, false, false}, // Is normalized?
-        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
-        4,                           // bcWidth
-        4,                           // bcHeight
-    },
-
-    // padding (0x1A5)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1A6)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1A7)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R8G8B8_UNORM_SRGB (0x1A8)
-    {
-        "R8G8B8_UNORM_SRGB",
-        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},                            // Defaults for missing components
-        {0, 1, 2, 0},                                     // Swizzle
-        {8, 8, 8, 0},                                     // Bits per component
-        24,                                               // Bits per element
-        3,                                                // Bytes per element
-        3,                                                // Num components
-        true,                                             // isSRGB
-        false,                                            // isBC
-        false,                                            // isSubsampled
-        false,                                            // isLuminance
-        {true, true, true, false},                        // Is normalized?
-        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0}, // To float scale factor
-        1,                                                // bcWidth
-        1,                                                // bcHeight
-    },
-
-    // padding (0x1A9)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1AA)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1AB)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1AC)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1AD)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1AE)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1AF)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R16G16B16_UINT (0x1B0)
-    {
-        "R16G16B16_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {16, 16, 16, 0},              // Bits per component
-        48,                           // Bits per element
-        6,                            // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R16G16B16_SINT (0x1B1)
-    {
-        "R16G16B16_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {16, 16, 16, 0},              // Bits per component
-        48,                           // Bits per element
-        6,                            // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R32_SFIXED (0x1B2)
-    {
-        "R32_SFIXED",
-        {SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 0, 0, 0},                 // Swizzle
-        {32, 0, 0, 0},                // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R10G10B10A2_SNORM (0x1B3)
-    {
-        "R10G10B10A2_SNORM",
-        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM},
-        {0, 0, 0, 0x3f800000},    // Defaults for missing components
-        {0, 1, 2, 3},             // Swizzle
-        {10, 10, 10, 2},          // Bits per component
-        32,                       // Bits per element
-        4,                        // Bytes per element
-        4,                        // Num components
-        false,                    // isSRGB
-        false,                    // isBC
-        false,                    // isSubsampled
-        false,                    // isLuminance
-        {true, true, true, true}, // Is normalized?
-        {1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f}, // To float scale factor
-        1,                                                          // bcWidth
-        1,                                                          // bcHeight
-    },
-
-    // R10G10B10A2_USCALED (0x1B4)
-    {
-        "R10G10B10A2_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {10, 10, 10, 2},              // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R10G10B10A2_SSCALED (0x1B5)
-    {
-        "R10G10B10A2_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {10, 10, 10, 2},              // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R10G10B10A2_SINT (0x1B6)
-    {
-        "R10G10B10A2_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {10, 10, 10, 2},              // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // B10G10R10A2_SNORM (0x1B7)
-    {
-        "B10G10R10A2_SNORM",
-        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM},
-        {0, 0, 0, 0x3f800000},    // Defaults for missing components
-        {2, 1, 0, 3},             // Swizzle
-        {10, 10, 10, 2},          // Bits per component
-        32,                       // Bits per element
-        4,                        // Bytes per element
-        4,                        // Num components
-        false,                    // isSRGB
-        false,                    // isBC
-        false,                    // isSubsampled
-        false,                    // isLuminance
-        {true, true, true, true}, // Is normalized?
-        {1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f}, // To float scale factor
-        1,                                                          // bcWidth
-        1,                                                          // bcHeight
-    },
-
-    // B10G10R10A2_USCALED (0x1B8)
-    {
-        "B10G10R10A2_USCALED",
-        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {2, 1, 0, 3},                 // Swizzle
-        {10, 10, 10, 2},              // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // B10G10R10A2_SSCALED (0x1B9)
-    {
-        "B10G10R10A2_SSCALED",
-        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED},
-        {0, 0, 0, 0x3f800000},        // Defaults for missing components
-        {2, 1, 0, 3},                 // Swizzle
-        {10, 10, 10, 2},              // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // B10G10R10A2_UINT (0x1BA)
-    {
-        "B10G10R10A2_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {2, 1, 0, 3},                 // Swizzle
-        {10, 10, 10, 2},              // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // B10G10R10A2_SINT (0x1BB)
-    {
-        "B10G10R10A2_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {2, 1, 0, 3},                 // Swizzle
-        {10, 10, 10, 2},              // Bits per component
-        32,                           // Bits per element
-        4,                            // Bytes per element
-        4,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x1BC)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1BD)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1BE)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1BF)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1C0)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1C1)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1C2)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1C3)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1C4)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1C5)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1C6)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1C7)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // R8G8B8_UINT (0x1C8)
-    {
-        "R8G8B8_UINT",
-        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {8, 8, 8, 0},                 // Bits per component
-        24,                           // Bits per element
-        3,                            // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // R8G8B8_SINT (0x1C9)
-    {
-        "R8G8B8_SINT",
-        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 0},                 // Swizzle
-        {8, 8, 8, 0},                 // Bits per component
-        24,                           // Bits per element
-        3,                            // Bytes per element
-        3,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-
-    // padding (0x1CA)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1CB)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1CC)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1CD)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1CE)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1CF)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1D0)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1D1)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1D2)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1D3)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1D4)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1D5)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1D6)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1D7)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1D8)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1D9)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1DA)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1DB)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1DC)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1DD)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1DE)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1DF)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1E0)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1E1)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1E2)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1E3)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1E4)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1E5)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1E6)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1E7)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1E8)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1E9)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1EA)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1EB)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1EC)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1ED)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1EE)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1EF)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1F0)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1F1)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1F2)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1F3)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1F4)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1F5)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1F6)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1F7)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1F8)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1F9)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1FA)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1FB)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1FC)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1FD)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // padding (0x1FE)
-    {nullptr,
-     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     {0, 0, 0, 0},
-     0,
-     0,
-     0,
-     false,
-     false,
-     false,
-     false,
-     {false, false, false, false},
-     {0.0f, 0.0f, 0.0f, 0.0f},
-     1,
-     1},
-    // RAW (0x1FF)
-    {
-        "RAW",
-        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
-        {0, 0, 0, 0x1},               // Defaults for missing components
-        {0, 1, 2, 3},                 // Swizzle
-        {8, 0, 0, 0},                 // Bits per component
-        8,                            // Bits per element
-        1,                            // Bytes per element
-        1,                            // Num components
-        false,                        // isSRGB
-        false,                        // isBC
-        false,                        // isSubsampled
-        false,                        // isLuminance
-        {false, false, false, false}, // Is normalized?
-        {1.0f, 0, 0, 0},              // To float scale factor
-        1,                            // bcWidth
-        1,                            // bcHeight
-    },
-};
diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.h b/src/gallium/drivers/swr/rasterizer/common/formats.h
deleted file mode 100644
index b7a3e533d15..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/formats.h
+++ /dev/null
@@ -1,268 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file formats.h
- *
- * @brief auto-generated file
- *
- * DO NOT EDIT
- *
- ******************************************************************************/
-
-#pragma once
-
-#include "common/os.h"
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_TYPE - Format component type
-///
-/// The enumerators carry no explicit values, so they are numbered
-/// consecutively in declaration order starting at SWR_TYPE_UNKNOWN == 0.
-/// SWR_FORMAT_INFO::type stores one of these per component.
-//////////////////////////////////////////////////////////////////////////
-enum SWR_TYPE
-{
-    SWR_TYPE_UNKNOWN,
-    SWR_TYPE_UNUSED,
-    SWR_TYPE_UNORM,
-    SWR_TYPE_SNORM,
-    SWR_TYPE_UINT,
-    SWR_TYPE_SINT,
-    SWR_TYPE_FLOAT,
-    SWR_TYPE_SSCALED,
-    SWR_TYPE_USCALED,
-    SWR_TYPE_SFIXED,
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_FORMAT - Surface format identifiers
-///
-/// The values are explicit and sparse (e.g. 0x2 is followed by 0x5).
-/// NUM_SWR_FORMATS (0x200) is not a format; it is the element count of the
-/// gFormatInfo array, which GetFormatInfo() indexes directly with an
-/// SWR_FORMAT value. RAW (0x1FF) is therefore the last valid index.
-/// NOTE(review): values missing from this list presumably map to table
-/// entries whose name is nullptr - confirm against the gFormatInfo
-/// definition.
-//////////////////////////////////////////////////////////////////////////
-enum SWR_FORMAT
-{
-    R32G32B32A32_FLOAT       = 0x0,
-    R32G32B32A32_SINT        = 0x1,
-    R32G32B32A32_UINT        = 0x2,
-    R64G64_FLOAT             = 0x5,
-    R32G32B32X32_FLOAT       = 0x6,
-    R32G32B32A32_SSCALED     = 0x7,
-    R32G32B32A32_USCALED     = 0x8,
-    R32G32B32A32_SFIXED      = 0x20,
-    R32G32B32_FLOAT          = 0x40,
-    R32G32B32_SINT           = 0x41,
-    R32G32B32_UINT           = 0x42,
-    R32G32B32_SSCALED        = 0x45,
-    R32G32B32_USCALED        = 0x46,
-    R32G32B32_SFIXED         = 0x50,
-    R16G16B16A16_UNORM       = 0x80,
-    R16G16B16A16_SNORM       = 0x81,
-    R16G16B16A16_SINT        = 0x82,
-    R16G16B16A16_UINT        = 0x83,
-    R16G16B16A16_FLOAT       = 0x84,
-    R32G32_FLOAT             = 0x85,
-    R32G32_SINT              = 0x86,
-    R32G32_UINT              = 0x87,
-    R32_FLOAT_X8X24_TYPELESS = 0x88,
-    X32_TYPELESS_G8X24_UINT  = 0x89,
-    L32A32_FLOAT             = 0x8A,
-    R64_FLOAT                = 0x8D,
-    R16G16B16X16_UNORM       = 0x8E,
-    R16G16B16X16_FLOAT       = 0x8F,
-    L32X32_FLOAT             = 0x91,
-    I32X32_FLOAT             = 0x92,
-    R16G16B16A16_SSCALED     = 0x93,
-    R16G16B16A16_USCALED     = 0x94,
-    R32G32_SSCALED           = 0x95,
-    R32G32_USCALED           = 0x96,
-    R32G32_SFIXED            = 0xA0,
-    B8G8R8A8_UNORM           = 0xC0,
-    B8G8R8A8_UNORM_SRGB      = 0xC1,
-    R10G10B10A2_UNORM        = 0xC2,
-    R10G10B10A2_UNORM_SRGB   = 0xC3,
-    R10G10B10A2_UINT         = 0xC4,
-    R8G8B8A8_UNORM           = 0xC7,
-    R8G8B8A8_UNORM_SRGB      = 0xC8,
-    R8G8B8A8_SNORM           = 0xC9,
-    R8G8B8A8_SINT            = 0xCA,
-    R8G8B8A8_UINT            = 0xCB,
-    R16G16_UNORM             = 0xCC,
-    R16G16_SNORM             = 0xCD,
-    R16G16_SINT              = 0xCE,
-    R16G16_UINT              = 0xCF,
-    R16G16_FLOAT             = 0xD0,
-    B10G10R10A2_UNORM        = 0xD1,
-    B10G10R10A2_UNORM_SRGB   = 0xD2,
-    R11G11B10_FLOAT          = 0xD3,
-    R10G10B10_FLOAT_A2_UNORM = 0xD5,
-    R32_SINT                 = 0xD6,
-    R32_UINT                 = 0xD7,
-    R32_FLOAT                = 0xD8,
-    R24_UNORM_X8_TYPELESS    = 0xD9,
-    X24_TYPELESS_G8_UINT     = 0xDA,
-    L32_UNORM                = 0xDD,
-    L16A16_UNORM             = 0xDF,
-    I24X8_UNORM              = 0xE0,
-    L24X8_UNORM              = 0xE1,
-    I32_FLOAT                = 0xE3,
-    L32_FLOAT                = 0xE4,
-    A32_FLOAT                = 0xE5,
-    B8G8R8X8_UNORM           = 0xE9,
-    B8G8R8X8_UNORM_SRGB      = 0xEA,
-    R8G8B8X8_UNORM           = 0xEB,
-    R8G8B8X8_UNORM_SRGB      = 0xEC,
-    R9G9B9E5_SHAREDEXP       = 0xED,
-    B10G10R10X2_UNORM        = 0xEE,
-    L16A16_FLOAT             = 0xF0,
-    R10G10B10X2_USCALED      = 0xF3,
-    R8G8B8A8_SSCALED         = 0xF4,
-    R8G8B8A8_USCALED         = 0xF5,
-    R16G16_SSCALED           = 0xF6,
-    R16G16_USCALED           = 0xF7,
-    R32_SSCALED              = 0xF8,
-    R32_USCALED              = 0xF9,
-    B5G6R5_UNORM             = 0x100,
-    B5G6R5_UNORM_SRGB        = 0x101,
-    B5G5R5A1_UNORM           = 0x102,
-    B5G5R5A1_UNORM_SRGB      = 0x103,
-    B4G4R4A4_UNORM           = 0x104,
-    B4G4R4A4_UNORM_SRGB      = 0x105,
-    R8G8_UNORM               = 0x106,
-    R8G8_SNORM               = 0x107,
-    R8G8_SINT                = 0x108,
-    R8G8_UINT                = 0x109,
-    R16_UNORM                = 0x10A,
-    R16_SNORM                = 0x10B,
-    R16_SINT                 = 0x10C,
-    R16_UINT                 = 0x10D,
-    R16_FLOAT                = 0x10E,
-    I16_UNORM                = 0x111,
-    L16_UNORM                = 0x112,
-    A16_UNORM                = 0x113,
-    L8A8_UNORM               = 0x114,
-    I16_FLOAT                = 0x115,
-    L16_FLOAT                = 0x116,
-    A16_FLOAT                = 0x117,
-    L8A8_UNORM_SRGB          = 0x118,
-    B5G5R5X1_UNORM           = 0x11A,
-    B5G5R5X1_UNORM_SRGB      = 0x11B,
-    R8G8_SSCALED             = 0x11C,
-    R8G8_USCALED             = 0x11D,
-    R16_SSCALED              = 0x11E,
-    R16_USCALED              = 0x11F,
-    A1B5G5R5_UNORM           = 0x124,
-    A4B4G4R4_UNORM           = 0x125,
-    L8A8_UINT                = 0x126,
-    L8A8_SINT                = 0x127,
-    R8_UNORM                 = 0x140,
-    R8_SNORM                 = 0x141,
-    R8_SINT                  = 0x142,
-    R8_UINT                  = 0x143,
-    A8_UNORM                 = 0x144,
-    I8_UNORM                 = 0x145,
-    L8_UNORM                 = 0x146,
-    R8_SSCALED               = 0x149,
-    R8_USCALED               = 0x14A,
-    L8_UNORM_SRGB            = 0x14C,
-    L8_UINT                  = 0x152,
-    L8_SINT                  = 0x153,
-    I8_UINT                  = 0x154,
-    I8_SINT                  = 0x155,
-    DXT1_RGB_SRGB            = 0x180,
-    YCRCB_SWAPUVY            = 0x183,
-    BC1_UNORM                = 0x186,
-    BC2_UNORM                = 0x187,
-    BC3_UNORM                = 0x188,
-    BC4_UNORM                = 0x189,
-    BC5_UNORM                = 0x18A,
-    BC1_UNORM_SRGB           = 0x18B,
-    BC2_UNORM_SRGB           = 0x18C,
-    BC3_UNORM_SRGB           = 0x18D,
-    YCRCB_SWAPUV             = 0x18F,
-    DXT1_RGB                 = 0x191,
-    R8G8B8_UNORM             = 0x193,
-    R8G8B8_SNORM             = 0x194,
-    R8G8B8_SSCALED           = 0x195,
-    R8G8B8_USCALED           = 0x196,
-    R64G64B64A64_FLOAT       = 0x197,
-    R64G64B64_FLOAT          = 0x198,
-    BC4_SNORM                = 0x199,
-    BC5_SNORM                = 0x19A,
-    R16G16B16_FLOAT          = 0x19B,
-    R16G16B16_UNORM          = 0x19C,
-    R16G16B16_SNORM          = 0x19D,
-    R16G16B16_SSCALED        = 0x19E,
-    R16G16B16_USCALED        = 0x19F,
-    BC6H_SF16                = 0x1A1,
-    BC7_UNORM                = 0x1A2,
-    BC7_UNORM_SRGB           = 0x1A3,
-    BC6H_UF16                = 0x1A4,
-    R8G8B8_UNORM_SRGB        = 0x1A8,
-    R16G16B16_UINT           = 0x1B0,
-    R16G16B16_SINT           = 0x1B1,
-    R32_SFIXED               = 0x1B2,
-    R10G10B10A2_SNORM        = 0x1B3,
-    R10G10B10A2_USCALED      = 0x1B4,
-    R10G10B10A2_SSCALED      = 0x1B5,
-    R10G10B10A2_SINT         = 0x1B6,
-    B10G10R10A2_SNORM        = 0x1B7,
-    B10G10R10A2_USCALED      = 0x1B8,
-    B10G10R10A2_SSCALED      = 0x1B9,
-    B10G10R10A2_UINT         = 0x1BA,
-    B10G10R10A2_SINT         = 0x1BB,
-    R8G8B8_UINT              = 0x1C8,
-    R8G8B8_SINT              = 0x1C9,
-    RAW                      = 0x1FF,
-    NUM_SWR_FORMATS          = 0x200,
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_FORMAT_INFO - Format information
-///
-/// One record per SWR_FORMAT value; the records live in gFormatInfo and are
-/// looked up with GetFormatInfo().
-//////////////////////////////////////////////////////////////////////////
-struct SWR_FORMAT_INFO
-{
-    const char* name;       ///< format name; GetFormatInfo() asserts it is non-null
-    SWR_TYPE    type[4];    ///< component type per component
-    uint32_t    defaults[4]; ///< defaults for missing components
-    uint32_t    swizzle[4]; ///< swizzle per component
-    uint32_t    bpc[4];     ///< bits per component
-    uint32_t    bpp;        ///< bits per pixel
-    uint32_t    Bpp;        ///< bytes per pixel
-    uint32_t    numComps;   ///< number of components
-    bool        isSRGB;
-    bool        isBC;
-    bool        isSubsampled;
-    bool        isLuminance;
-    bool        isNormalized[4]; ///< per component: is it normalized?
-    float       toFloat[4];      ///< per component: to-float scale factor
-    uint32_t    bcWidth;         ///< NOTE(review): presumably block width for BC formats - confirm
-    uint32_t    bcHeight;        ///< NOTE(review): presumably block height for BC formats - confirm
-};
-
-extern const SWR_FORMAT_INFO gFormatInfo[NUM_SWR_FORMATS];
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Retrieves format info struct for given format.
-/// @param format - SWR format
-/// @return reference to the gFormatInfo entry at index @p format
-///
-/// Two conditions are asserted before the lookup: that @p format is below
-/// NUM_SWR_FORMATS, and that the selected entry has a non-null name.
-/// NOTE(review): whether SWR_ASSERT is active in release builds is not
-/// visible here; if it compiles out, an out-of-range format is not caught.
-INLINE const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format)
-{
-    SWR_ASSERT(format < NUM_SWR_FORMATS, "Invalid Surface Format: %d", format);
-    SWR_ASSERT(gFormatInfo[format].name != nullptr, "Invalid Surface Format: %d", format);
-    return gFormatInfo[format];
-}
-
-// lookup table for unorm8 srgb -> float conversion
-extern const uint32_t srgb8Table[256];
diff --git a/src/gallium/drivers/swr/rasterizer/common/intrin.h b/src/gallium/drivers/swr/rasterizer/common/intrin.h
deleted file mode 100644
index 95b462b1e36..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/intrin.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-
-#ifndef __SWR_INTRIN_H__
-#define __SWR_INTRIN_H__
-
-#include "os.h"
-
-#if !defined(SIMD_ARCH)
-#define SIMD_ARCH KNOB_ARCH
-#endif
-
-#include "simdlib_types.hpp"
-
-// Aliases for the SIMDImpl::SIMD128Impl types (simd4* names)
-typedef SIMDImpl::SIMD128Impl::Float   simd4scalar;
-typedef SIMDImpl::SIMD128Impl::Double  simd4scalard;
-typedef SIMDImpl::SIMD128Impl::Integer simd4scalari;
-typedef SIMDImpl::SIMD128Impl::Vec4    simd4vector;
-typedef SIMDImpl::SIMD128Impl::Mask    simd4mask;
-
-// Aliases for the SIMDImpl::SIMD256Impl types (simd8* names)
-typedef SIMDImpl::SIMD256Impl::Float   simd8scalar;
-typedef SIMDImpl::SIMD256Impl::Double  simd8scalard;
-typedef SIMDImpl::SIMD256Impl::Integer simd8scalari;
-typedef SIMDImpl::SIMD256Impl::Vec4    simd8vector;
-typedef SIMDImpl::SIMD256Impl::Mask    simd8mask;
-
-// Aliases for the SIMDImpl::SIMD512Impl types (simd16* names)
-typedef SIMDImpl::SIMD512Impl::Float   simd16scalar;
-typedef SIMDImpl::SIMD512Impl::Double  simd16scalard;
-typedef SIMDImpl::SIMD512Impl::Integer simd16scalari;
-typedef SIMDImpl::SIMD512Impl::Vec4    simd16vector;
-typedef SIMDImpl::SIMD512Impl::Mask    simd16mask;
-
-// The unsuffixed simd* names are the simd8* aliases; KNOB_SIMD_WIDTH must be
-// 8, any other value is rejected at compile time.
-#if KNOB_SIMD_WIDTH == 8
-typedef simd8scalar  simdscalar;
-typedef simd8scalard simdscalard;
-typedef simd8scalari simdscalari;
-typedef simd8vector  simdvector;
-typedef simd8mask    simdmask;
-#else
-#error Unsupported vector width
-#endif
-
-/// @brief Parallel bit deposit: scatters the low-order bits of @p a into the
-///        bit positions that are set in @p mask, lowest mask bit first.
-/// @param a    - source bits; successive low bits are consumed one per set
-///               bit of the mask
-/// @param mask - selects the destination bit positions
-/// @return the deposited bits; every bit not selected by @p mask is zero
-INLINE
-UINT pdep_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH >= KNOB_ARCH_AVX2
-    return _pdep_u32(a, mask);
-#else
-    UINT result = 0;
-
-    // software fallback based on http://wm.ite.pl/articles/pdep-soft-emu.html,
-    // using a bit scan to walk the set bits of mask from lowest to highest
-    unsigned long maskIndex = 0;
-    while (_BitScanForward(&maskIndex, mask))
-    {
-        // 1. isolate lowest set bit of mask; the one is shifted as UINT so
-        //    that maskIndex == 31 never shifts into the sign bit of an int
-        const UINT lowest = (UINT)1 << maskIndex;
-
-        // 2. deposit the current low bit of the source at that position
-        //    (plain test instead of a signed shift/sign-extension trick)
-        if (a & 1)
-        {
-            result |= lowest;
-        }
-
-        // 3. clear lowest bit so the next scan finds the following one
-        mask &= ~lowest;
-
-        // 4. advance to the next source bit
-        a >>= 1;
-    }
-
-    return result;
-#endif
-}
-
-/// @brief Parallel bit extract: gathers the bits of @p a that are selected
-///        by @p mask and packs them into the low-order bits of the result,
-///        lowest mask bit first.
-/// @param a    - source value
-/// @param mask - selects which bits of @p a are extracted
-/// @return the selected bits packed contiguously from bit 0 upwards
-INLINE
-UINT pext_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH >= KNOB_ARCH_AVX2
-    return _pext_u32(a, mask);
-#else
-    UINT          result     = 0;
-    unsigned long maskIndex  = 0;
-    uint32_t      currentBit = 0; // next output bit position to fill
-    while (_BitScanForward(&maskIndex, mask))
-    {
-        // 1. isolate lowest set bit of mask; the one is shifted as UINT so
-        //    that maskIndex == 31 never shifts into the sign bit of an int
-        const UINT lowest = (UINT)1 << maskIndex;
-
-        // 2. copy the selected source bit to the next packed output position,
-        //    again shifting an unsigned one rather than a promoted bool
-        if (a & lowest)
-        {
-            result |= (UINT)1 << currentBit;
-        }
-        ++currentBit;
-
-        // 3. clear lowest bit
-        mask &= ~lowest;
-    }
-    return result;
-#endif
-}
-
-#endif //__SWR_INTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/common/isa.hpp b/src/gallium/drivers/swr/rasterizer/common/isa.hpp
deleted file mode 100644
index 41af0055f1e..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/isa.hpp
+++ /dev/null
@@ -1,231 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-
-#pragma once
-
-#include <iostream>
-#include <vector>
-#include <bitset>
-#include <array>
-#include <string>
-#include <algorithm>
-
-// Clang for Windows does supply an intrin.h with __cpuid intrinsics, however...
-// It seems to not realize that a write to "b" (ebx) will kill the value in rbx.
-// This attempts to use the "native" clang / gcc intrinsics instead of the windows
-// compatible ones.
-#if defined(_MSC_VER) && !defined(__clang__)
-#include <intrin.h>
-#else
-#include <string.h>
-#if !defined(__cpuid)
-#include <cpuid.h>
-#endif
-#endif
-
-class InstructionSet
-{
-public:
-    InstructionSet() : CPU_Rep(){};
-
-    // getters
-    std::string Vendor(void) { return CPU_Rep.vendor_; }
-    std::string Brand(void) { return CPU_Rep.brand_; }
-
-    bool SSE3(void) { return CPU_Rep.f_1_ECX_[0]; }
-    bool PCLMULQDQ(void) { return CPU_Rep.f_1_ECX_[1]; }
-    bool MONITOR(void) { return CPU_Rep.f_1_ECX_[3]; }
-    bool SSSE3(void) { return CPU_Rep.f_1_ECX_[9]; }
-    bool FMA(void) { return CPU_Rep.f_1_ECX_[12]; }
-    bool CMPXCHG16B(void) { return CPU_Rep.f_1_ECX_[13]; }
-    bool SSE41(void) { return CPU_Rep.f_1_ECX_[19]; }
-    bool SSE42(void) { return CPU_Rep.f_1_ECX_[20]; }
-    bool MOVBE(void) { return CPU_Rep.f_1_ECX_[22]; }
-    bool POPCNT(void) { return CPU_Rep.f_1_ECX_[23]; }
-    bool AES(void) { return CPU_Rep.f_1_ECX_[25]; }
-    bool XSAVE(void) { return CPU_Rep.f_1_ECX_[26]; }
-    bool OSXSAVE(void) { return CPU_Rep.f_1_ECX_[27]; }
-    bool RDRAND(void) { return CPU_Rep.f_1_ECX_[30]; }
-
-    bool MSR(void) { return CPU_Rep.f_1_EDX_[5]; }
-    bool CX8(void) { return CPU_Rep.f_1_EDX_[8]; }
-    bool SEP(void) { return CPU_Rep.f_1_EDX_[11]; }
-    bool CMOV(void) { return CPU_Rep.f_1_EDX_[15]; }
-    bool CLFSH(void) { return CPU_Rep.f_1_EDX_[19]; }
-    bool MMX(void) { return CPU_Rep.f_1_EDX_[23]; }
-    bool FXSR(void) { return CPU_Rep.f_1_EDX_[24]; }
-    bool SSE(void) { return CPU_Rep.f_1_EDX_[25]; }
-    bool SSE2(void) { return CPU_Rep.f_1_EDX_[26]; }
-
-    bool FSGSBASE(void) { return CPU_Rep.f_7_EBX_[0]; }
-    bool BMI1(void) { return CPU_Rep.f_7_EBX_[3]; }
-    bool HLE(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[4]; }
-    bool BMI2(void) { return CPU_Rep.f_7_EBX_[8]; }
-    bool ERMS(void) { return CPU_Rep.f_7_EBX_[9]; }
-    bool INVPCID(void) { return CPU_Rep.f_7_EBX_[10]; }
-    bool RTM(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[11]; }
-    bool RDSEED(void) { return CPU_Rep.f_7_EBX_[18]; }
-    bool ADX(void) { return CPU_Rep.f_7_EBX_[19]; }
-    bool SHA(void) { return CPU_Rep.f_7_EBX_[29]; }
-
-    bool PREFETCHWT1(void) { return CPU_Rep.f_7_ECX_[0]; }
-
-    bool LAHF(void) { return CPU_Rep.f_81_ECX_[0]; }
-    bool LZCNT(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_ECX_[5]; }
-    bool ABM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[5]; }
-    bool SSE4a(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[6]; }
-    bool XOP(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[11]; }
-    bool TBM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[21]; }
-
-    bool SYSCALL(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[11]; }
-    bool MMXEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[22]; }
-    bool RDTSCP(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[27]; }
-    bool _3DNOWEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[30]; }
-    bool _3DNOW(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[31]; }
-
-    bool AVX(void) { return CPU_Rep.f_1_ECX_[28]; }
-    bool F16C(void) { return CPU_Rep.f_1_ECX_[29]; }
-    bool AVX2(void) { return CPU_Rep.f_7_EBX_[5]; }
-    bool AVX512F(void) { return CPU_Rep.f_7_EBX_[16]; }
-    bool AVX512PF(void) { return CPU_Rep.f_7_EBX_[26]; }
-    bool AVX512ER(void) { return CPU_Rep.f_7_EBX_[27]; }
-    bool AVX512CD(void) { return CPU_Rep.f_7_EBX_[28]; }
-
-private:
-    class InstructionSet_Internal
-    {
-    public:
-        InstructionSet_Internal() :
-            nIds_{0}, nExIds_{0}, isIntel_{false}, isAMD_{false}, f_1_ECX_{0}, f_1_EDX_{0},
-            f_7_EBX_{0}, f_7_ECX_{0}, f_81_ECX_{0}, f_81_EDX_{0}, data_{}, extdata_{}
-        {
-            // int cpuInfo[4] = {-1};
-            std::array<int, 4> cpui;
-
-            // Calling __cpuid with 0x0 as the function_id argument
-            // gets the number of the highest valid function ID.
-#if defined(_MSC_VER) && !defined(__clang__)
-            __cpuid(cpui.data(), 0);
-            nIds_ = cpui[0];
-#else
-            nIds_ = __get_cpuid_max(0, NULL);
-#endif
-
-            for (int i = 0; i <= nIds_; ++i)
-            {
-#if defined(_MSC_VER) && !defined(__clang__)
-                __cpuidex(cpui.data(), i, 0);
-#else
-                int* data = cpui.data();
-                __cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
-#endif
-                data_.push_back(cpui);
-            }
-
-            // Capture vendor string
-            char vendor[0x20];
-            memset(vendor, 0, sizeof(vendor));
-            *reinterpret_cast<int*>(vendor)     = data_[0][1];
-            *reinterpret_cast<int*>(vendor + 4) = data_[0][3];
-            *reinterpret_cast<int*>(vendor + 8) = data_[0][2];
-            vendor_                             = vendor;
-            if (vendor_ == "GenuineIntel")
-            {
-                isIntel_ = true;
-            }
-            else if (vendor_ == "AuthenticAMD")
-            {
-                isAMD_ = true;
-            }
-
-            // load bitset with flags for function 0x00000001
-            if (nIds_ >= 1)
-            {
-                f_1_ECX_ = data_[1][2];
-                f_1_EDX_ = data_[1][3];
-            }
-
-            // load bitset with flags for function 0x00000007
-            if (nIds_ >= 7)
-            {
-                f_7_EBX_ = data_[7][1];
-                f_7_ECX_ = data_[7][2];
-            }
-
-            // Calling __cpuid with 0x80000000 as the function_id argument
-            // gets the number of the highest valid extended ID.
-#if defined(_MSC_VER) && !defined(__clang__)
-            __cpuid(cpui.data(), 0x80000000);
-            nExIds_ = cpui[0];
-#else
-            nExIds_ = __get_cpuid_max(0x80000000, NULL);
-#endif
-
-            char brand[0x40];
-            memset(brand, 0, sizeof(brand));
-
-            for (unsigned i = 0x80000000; i <= nExIds_; ++i)
-            {
-#if defined(_MSC_VER) && !defined(__clang__)
-                __cpuidex(cpui.data(), i, 0);
-#else
-                int* data = cpui.data();
-                __cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
-#endif
-                extdata_.push_back(cpui);
-            }
-
-            // load bitset with flags for function 0x80000001
-            if (nExIds_ >= 0x80000001)
-            {
-                f_81_ECX_ = extdata_[1][2];
-                f_81_EDX_ = extdata_[1][3];
-            }
-
-            // Interpret CPU brand string if reported
-            if (nExIds_ >= 0x80000004)
-            {
-                memcpy(brand, extdata_[2].data(), sizeof(cpui));
-                memcpy(brand + 16, extdata_[3].data(), sizeof(cpui));
-                memcpy(brand + 32, extdata_[4].data(), sizeof(cpui));
-                brand_ = brand;
-            }
-        };
-
-        int                             nIds_;
-        unsigned                        nExIds_;
-        std::string                     vendor_;
-        std::string                     brand_;
-        bool                            isIntel_;
-        bool                            isAMD_;
-        std::bitset<32>                 f_1_ECX_;
-        std::bitset<32>                 f_1_EDX_;
-        std::bitset<32>                 f_7_EBX_;
-        std::bitset<32>                 f_7_ECX_;
-        std::bitset<32>                 f_81_ECX_;
-        std::bitset<32>                 f_81_EDX_;
-        std::vector<std::array<int, 4>> data_;
-        std::vector<std::array<int, 4>> extdata_;
-    };
-    const InstructionSet_Internal CPU_Rep;
-};
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.cpp b/src/gallium/drivers/swr/rasterizer/common/os.cpp
deleted file mode 100644
index 75c7161b4e2..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/os.cpp
+++ /dev/null
@@ -1,314 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-
-#include "common/os.h"
-#include <vector>
-#include <array>
-#include <sstream>
-
-#if defined(_WIN32)
-#include <shlobj.h>
-#endif // Windows
-
-#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
-#include <pthread.h>
-#endif // Linux
-
-#if defined(_MSC_VER)
-static const DWORD MS_VC_EXCEPTION = 0x406D1388;
-
-#pragma pack(push, 8)
-typedef struct tagTHREADNAME_INFO
-{
-    DWORD  dwType;     // Must be 0x1000.
-    LPCSTR szName;     // Pointer to name (in user addr space).
-    DWORD  dwThreadID; // Thread ID (-1=caller thread).
-    DWORD  dwFlags;    // Reserved for future use, must be zero.
-} THREADNAME_INFO;
-#pragma pack(pop)
-
-void LegacySetThreadName(const char* pThreadName)
-{
-    THREADNAME_INFO info;
-    info.dwType     = 0x1000;
-    info.szName     = pThreadName;
-    info.dwThreadID = GetCurrentThreadId();
-    info.dwFlags    = 0;
-
-    if (!IsDebuggerPresent())
-    {
-        // No debugger attached to interpret exception, no need to actually do it
-        return;
-    }
-
-#pragma warning(push)
-#pragma warning(disable : 6320 6322)
-    __try
-    {
-        RaiseException(MS_VC_EXCEPTION, 0, sizeof(info) / sizeof(ULONG_PTR), (ULONG_PTR*)&info);
-    }
-    __except (EXCEPTION_EXECUTE_HANDLER)
-    {
-    }
-#pragma warning(pop)
-}
-#endif // _WIN32
-
-void SWR_API SetCurrentThreadName(const char* pThreadName)
-{
-#if defined(_MSC_VER)
-    // The SetThreadDescription API was brought in version 1607 of Windows 10.
-    typedef HRESULT(WINAPI * PFNSetThreadDescription)(HANDLE hThread, PCWSTR lpThreadDescription);
-    // The SetThreadDescription API works even if no debugger is attached.
-    auto pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>(
-        GetProcAddress(GetModuleHandleA("Kernel32.dll"), "SetThreadDescription"));
-
-    if (!pfnSetThreadDescription)
-    {
-        // try KernelBase.dll
-        pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>(
-            GetProcAddress(GetModuleHandleA("KernelBase.dll"), "SetThreadDescription"));
-    }
-
-    if (pfnSetThreadDescription)
-    {
-        std::string  utf8Name = pThreadName;
-        std::wstring wideName;
-        wideName.resize(utf8Name.size() + 1);
-        swprintf_s(&(wideName.front()), wideName.size(), L"%S", utf8Name.c_str());
-        HRESULT hr = pfnSetThreadDescription(GetCurrentThread(), wideName.c_str());
-        SWR_ASSERT(SUCCEEDED(hr), "Failed to set thread name to %s", pThreadName);
-
-        // Fall through - it seems like some debuggers only recognize the exception
-    }
-
-    // Fall back to exception based hack
-    LegacySetThreadName(pThreadName);
-#endif // _WIN32
-
-#if defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
-    pthread_setname_np(pthread_self(), pThreadName);
-#endif // Linux
-}
-
-#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
-static void
-SplitString(std::vector<std::string>& out_segments, const std::string& input, char splitToken)
-{
-    out_segments.clear();
-
-    std::istringstream f(input);
-    std::string        s;
-    while (std::getline(f, s, splitToken))
-    {
-        if (s.size())
-        {
-            out_segments.push_back(s);
-        }
-    }
-}
-#endif // Unix
-
-void SWR_API CreateDirectoryPath(const std::string& path)
-{
-#if defined(_WIN32)
-    SHCreateDirectoryExA(nullptr, path.c_str(), nullptr);
-#endif // Windows
-
-#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
-    std::vector<std::string> pathSegments;
-    SplitString(pathSegments, path, '/');
-
-    std::string tmpPath;
-    for (auto const& segment : pathSegments)
-    {
-        tmpPath.push_back('/');
-        tmpPath += segment;
-
-        int result = mkdir(tmpPath.c_str(), 0777);
-        if (result == -1 && errno != EEXIST)
-        {
-            break;
-        }
-    }
-#endif // Unix
-}
-
-/// Execute Command (block until finished)
-/// @returns process exit value
-int SWR_API ExecCmd(const std::string& cmd,     ///< (In) Command line string
-                    const char* pOptEnvStrings, ///< (Optional In) Environment block for new process
-                    std::string*       pOptStdOut, ///< (Optional Out) Standard Output text
-                    std::string*       pOptStdErr, ///< (Optional Out) Standard Error text
-                    const std::string* pOptStdIn)  ///< (Optional In) Standard Input text
-{
-    int rvalue = -1;
-
-#if defined(_WIN32)
-    struct WinPipe
-    {
-        HANDLE hRead;
-        HANDLE hWrite;
-    };
-    std::array<WinPipe, 3> hPipes = {};
-
-    SECURITY_ATTRIBUTES saAttr  = {sizeof(SECURITY_ATTRIBUTES)};
-    saAttr.bInheritHandle       = TRUE; // Pipe handles are inherited by child process.
-    saAttr.lpSecurityDescriptor = NULL;
-
-    {
-        bool bFail = false;
-        for (WinPipe& p : hPipes)
-        {
-            if (!CreatePipe(&p.hRead, &p.hWrite, &saAttr, 0))
-            {
-                bFail = true;
-            }
-        }
-
-        if (bFail)
-        {
-            for (WinPipe& p : hPipes)
-            {
-                CloseHandle(p.hRead);
-                CloseHandle(p.hWrite);
-            }
-            return rvalue;
-        }
-    }
-
-    STARTUPINFOA StartupInfo{};
-    StartupInfo.cb      = sizeof(STARTUPINFOA);
-    StartupInfo.dwFlags = STARTF_USESTDHANDLES;
-    StartupInfo.dwFlags |= STARTF_USESHOWWINDOW;
-    StartupInfo.wShowWindow = SW_HIDE;
-    if (pOptStdIn)
-    {
-        StartupInfo.hStdInput = hPipes[0].hRead;
-    }
-    StartupInfo.hStdOutput = hPipes[1].hWrite;
-    StartupInfo.hStdError  = hPipes[2].hWrite;
-    PROCESS_INFORMATION procInfo{};
-
-    // CreateProcess can modify the string
-    std::string local_cmd = cmd;
-
-    BOOL ProcessValue = CreateProcessA(NULL,
-                                       (LPSTR)local_cmd.c_str(),
-                                       NULL,
-                                       NULL,
-                                       TRUE,
-                                       0,
-                                       (LPVOID)pOptEnvStrings,
-                                       NULL,
-                                       &StartupInfo,
-                                       &procInfo);
-
-    if (ProcessValue && procInfo.hProcess)
-    {
-        auto ReadFromPipe = [](HANDLE hPipe, std::string* pOutStr) {
-            char  buf[1024];
-            DWORD dwRead  = 0;
-            DWORD dwAvail = 0;
-            while (true)
-            {
-                if (!::PeekNamedPipe(hPipe, NULL, 0, NULL, &dwAvail, NULL))
-                {
-                    break;
-                }
-
-                if (!dwAvail) // no data available, return
-                {
-                    break;
-                }
-
-                if (!::ReadFile(hPipe,
-                                buf,
-                                std::min<size_t>(sizeof(buf) - 1, size_t(dwAvail)),
-                                &dwRead,
-                                NULL) ||
-                    !dwRead)
-                {
-                    // error, the child process might ended
-                    break;
-                }
-
-                buf[dwRead] = 0;
-                if (pOutStr)
-                {
-                    (*pOutStr) += buf;
-                }
-            }
-        };
-        bool   bProcessEnded = false;
-        size_t bytesWritten  = 0;
-        do
-        {
-            if (pOptStdIn && (pOptStdIn->size() > bytesWritten))
-            {
-                DWORD bytesToWrite = static_cast<DWORD>(pOptStdIn->size()) - bytesWritten;
-                if (!::WriteFile(hPipes[0].hWrite,
-                                 pOptStdIn->data() + bytesWritten,
-                                 bytesToWrite,
-                                 &bytesToWrite,
-                                 nullptr))
-                {
-                    // Failed to write to pipe
-                    break;
-                }
-                bytesWritten += bytesToWrite;
-            }
-
-            // Give some timeslice (50ms), so we won't waste 100% cpu.
-            bProcessEnded = (WaitForSingleObject(procInfo.hProcess, 50) == WAIT_OBJECT_0);
-
-            ReadFromPipe(hPipes[1].hRead, pOptStdOut);
-            ReadFromPipe(hPipes[2].hRead, pOptStdErr);
-        } while (!bProcessEnded);
-
-        DWORD exitVal = 0;
-        if (!GetExitCodeProcess(procInfo.hProcess, &exitVal))
-        {
-            exitVal = 1;
-        }
-
-        CloseHandle(procInfo.hProcess);
-        CloseHandle(procInfo.hThread);
-
-        rvalue = exitVal;
-    }
-
-    for (WinPipe& p : hPipes)
-    {
-        CloseHandle(p.hRead);
-        CloseHandle(p.hWrite);
-    }
-
-#else
-
-    // Non-Windows implementation
-
-#endif
-
-    return rvalue;
-}
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
deleted file mode 100644
index ed42e1eb79e..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ /dev/null
@@ -1,365 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-
-#ifndef __SWR_OS_H__
-#define __SWR_OS_H__
-
-#include <cstddef>
-#include "core/knobs.h"
-
-#if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX)
-
-#define SWR_API __cdecl
-#define SWR_VISIBLE __declspec(dllexport)
-
-#ifndef NOMINMAX
-#undef UNICODE
-#define NOMINMAX
-#include <windows.h>
-#undef NOMINMAX
-#define UNICODE
-#else
-#undef UNICODE
-#include <windows.h>
-#define UNICODE
-#endif
-#include <intrin.h>
-#include <cstdint>
-
-#if defined(MemoryFence)
-// Windows.h defines MemoryFence as _mm_mfence, but this conflicts with llvm::sys::MemoryFence
-#undef MemoryFence
-#endif
-
-#if defined(_MSC_VER)
-#define OSALIGN(RWORD, WIDTH) __declspec(align(WIDTH)) RWORD
-#elif defined(__GNUC__)
-#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH)))
-#endif
-
-#if defined(_DEBUG)
-// We compile Debug builds with inline function expansion enabled.  This allows
-// functions compiled with __forceinline to be inlined even in Debug builds.
-// The inline_depth(0) pragma below will disable inline function expansion for
-// normal INLINE / inline functions, but not for __forceinline functions.
-// Our SIMD function wrappers (see simdlib.hpp) use __forceinline even in
-// Debug builds.
-#define INLINE inline
-#pragma inline_depth(0)
-#else
-// Use of __forceinline increases compile time dramatically in release builds
-// and provides almost 0 measurable benefit.  Disable until we have a compelling
-// use-case
-// #define INLINE __forceinline
-#define INLINE inline
-#endif
-#ifndef FORCEINLINE
-#define FORCEINLINE __forceinline
-#endif
-
-#define DEBUGBREAK __debugbreak()
-
-#define PRAGMA_WARNING_PUSH_DISABLE(...) \
-    __pragma(warning(push));             \
-    __pragma(warning(disable : __VA_ARGS__));
-
-#define PRAGMA_WARNING_POP() __pragma(warning(pop))
-
-static inline void* AlignedMalloc(size_t _Size, size_t _Alignment)
-{
-    return _aligned_malloc(_Size, _Alignment);
-}
-
-static inline void AlignedFree(void* p)
-{
-    return _aligned_free(p);
-}
-
-#if defined(_WIN64)
-#define BitScanReverseSizeT BitScanReverse64
-#define BitScanForwardSizeT BitScanForward64
-#define _mm_popcount_sizeT _mm_popcnt_u64
-#else
-#define BitScanReverseSizeT BitScanReverse
-#define BitScanForwardSizeT BitScanForward
-#define _mm_popcount_sizeT _mm_popcnt_u32
-#endif
-
-#if !defined(_WIN64)
-extern "C" {
-inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
-{
-    if (Mask == 0)
-      return 0;
-#ifdef __GNUC__
-    *Index = __builtin_ctzll(Mask);
-#else
-    *Index = 0;
-    for (int i = 0; i < 64; ++ i)
-      if ((1ULL << i) & Mask)
-        *Index = i;
-#endif
-    return 1;
-}
-
-inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
-{
-    if (Mask == 0)
-      return 0;
-#ifdef __GNUC__
-    *Index = 63 - __builtin_clzll(Mask);
-#else
-    *Index = 0;
-    for (int i = 63; i >= 0; -- i)
-      if ((1ULL << i) & Mask)
-        *Index = i;
-#endif
-    return 1;
-}
-}
-#endif
-
-#elif defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
-
-#define SWR_API
-#define SWR_VISIBLE __attribute__((visibility("default")))
-
-#include <stdlib.h>
-#include <string.h>
-#include <x86intrin.h>
-#include <stdint.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <sys/stat.h>
-#include <stdio.h>
-#include <limits.h>
-
-typedef void         VOID;
-typedef void*        LPVOID;
-typedef int          INT;
-typedef unsigned int UINT;
-typedef void*        HANDLE;
-typedef int          LONG;
-typedef unsigned int DWORD;
-
-#undef FALSE
-#define FALSE 0
-
-#undef TRUE
-#define TRUE 1
-
-#define MAX_PATH PATH_MAX
-
-#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH)))
-#ifndef INLINE
-#define INLINE __inline
-#endif
-#ifndef FORCEINLINE
-#define FORCEINLINE INLINE
-#endif
-#define DEBUGBREAK asm("int $3")
-
-#if !defined(__CYGWIN__)
-
-#ifndef __cdecl
-#define __cdecl
-#endif
-#ifndef __stdcall
-#define __stdcall
-#endif
-
-#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
-#define __declspec(x) __declspec_##x
-#define __declspec_align(y) __attribute__((aligned(y)))
-#define __declspec_deprecated __attribute__((deprecated))
-#define __declspec_dllexport
-#define __declspec_dllimport
-#define __declspec_noinline __attribute__((__noinline__))
-#define __declspec_nothrow __attribute__((nothrow))
-#define __declspec_novtable
-#define __declspec_thread __thread
-#else
-#define __declspec(X)
-#endif
-
-#endif
-
-#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-
-#if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500)
-inline uint64_t      __rdtsc()
-{
-    long low, high;
-    asm volatile("rdtsc" : "=a"(low), "=d"(high));
-    return (low | ((uint64_t)high << 32));
-}
-#endif
-
-#if !defined(__clang__) && !defined(__INTEL_COMPILER)
-// Intrinsic not defined in gcc < 10
-#if (__GNUC__) && (GCC_VERSION < 100000)
-static INLINE void _mm256_storeu2_m128i(__m128i* hi, __m128i* lo, __m256i a)
-{
-    _mm_storeu_si128((__m128i*)lo, _mm256_castsi256_si128(a));
-    _mm_storeu_si128((__m128i*)hi, _mm256_extractf128_si256(a, 0x1));
-}
-#endif
-
-// gcc prior to 4.9 doesn't have _mm*_undefined_*
-#if (__GNUC__) && (GCC_VERSION < 40900)
-#define _mm_undefined_si128 _mm_setzero_si128
-#define _mm256_undefined_ps _mm256_setzero_ps
-#endif
-#endif
-
-inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
-{
-    if (Mask == 0)
-      return 0;
-    *Index = __builtin_ctzll(Mask);
-    return 1;
-}
-
-inline unsigned char _BitScanForward(unsigned long* Index, uint32_t Mask)
-{
-    if (Mask == 0)
-      return 0;
-    *Index = __builtin_ctz(Mask);
-    return 1;
-}
-
-inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
-{
-    if (Mask == 0)
-      return 0;
-    *Index = 63 - __builtin_clzll(Mask);
-    return 1;
-}
-
-inline unsigned char _BitScanReverse(unsigned long* Index, uint32_t Mask)
-{
-    if (Mask == 0)
-      return 0;
-    *Index = 31 - __builtin_clz(Mask);
-    return 1;
-}
-
-inline void* AlignedMalloc(size_t size, size_t alignment)
-{
-    void* ret;
-    if (posix_memalign(&ret, alignment, size))
-    {
-        return NULL;
-    }
-    return ret;
-}
-
-static inline void AlignedFree(void* p)
-{
-    free(p);
-}
-
-#define _countof(a) (sizeof(a) / sizeof(*(a)))
-
-#define sprintf_s sprintf
-#define strcpy_s(dst, size, src) strncpy(dst, src, size)
-#define GetCurrentProcessId getpid
-
-#define InterlockedCompareExchange(Dest, Exchange, Comparand) \
-    __sync_val_compare_and_swap(Dest, Comparand, Exchange)
-#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
-#define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
-#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1)
-#define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1)
-#define InterlockedAdd(Addend, Value) __sync_add_and_fetch(Addend, Value)
-#define InterlockedAdd64(Addend, Value) __sync_add_and_fetch(Addend, Value)
-#define _ReadWriteBarrier() asm volatile("" ::: "memory")
-
-#define PRAGMA_WARNING_PUSH_DISABLE(...)
-#define PRAGMA_WARNING_POP()
-
-#define ZeroMemory(dst, size) memset(dst, 0, size)
-#else
-
-#error Unsupported OS/system.
-
-#endif
-
-#define THREAD thread_local
-
-// Universal types
-typedef uint8_t  KILOBYTE[1024];
-typedef KILOBYTE MEGABYTE[1024];
-typedef MEGABYTE GIGABYTE[1024];
-
-#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64)
-#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES)
-#define OSALIGNSIMD16(RWORD) OSALIGN(RWORD, KNOB_SIMD16_BYTES)
-
-#include "common/swr_assert.h"
-
-#ifdef __GNUC__
-#define ATTR_UNUSED __attribute__((unused))
-#else
-#define ATTR_UNUSED
-#endif
-
-#define SWR_FUNC(_retType, _funcName, /* args */...)        \
-    typedef _retType(SWR_API* PFN##_funcName)(__VA_ARGS__); \
-    _retType SWR_API _funcName(__VA_ARGS__);
-
-// Defined in os.cpp
-void SWR_API SetCurrentThreadName(const char* pThreadName);
-void SWR_API CreateDirectoryPath(const std::string& path);
-
-/// Execute Command (block until finished)
-/// @returns process exit value
-int SWR_API
-    ExecCmd(const std::string& cmd,                ///< (In) Command line string
-            const char*  pOptEnvStrings = nullptr, ///< (Optional In) Environment block for new process
-            std::string* pOptStdOut     = nullptr,   ///< (Optional Out) Standard Output text
-            std::string* pOptStdErr     = nullptr,   ///< (Optional Out) Standard Error text
-            const std::string* pOptStdIn = nullptr); ///< (Optional In) Standard Input text
-
-
-/// Helper for setting up FP state
-/// @returns old csr state
-static INLINE uint32_t SetOptimalVectorCSR()
-{
-    uint32_t oldCSR = _mm_getcsr();
-
-    uint32_t newCSR = (oldCSR & ~(_MM_ROUND_MASK | _MM_DENORMALS_ZERO_MASK | _MM_FLUSH_ZERO_MASK));
-    newCSR |= (_MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
-    _mm_setcsr(newCSR);
-
-    return oldCSR;
-}
-
-/// Set Vector CSR state.
-/// @param csrState - should be value returned from SetOptimalVectorCSR()
-static INLINE void RestoreVectorCSR(uint32_t csrState)
-{
-    _mm_setcsr(csrState);
-}
-
-#endif //__SWR_OS_H__
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
deleted file mode 100644
index e2076e8fc44..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file rdtsc_buckets.cpp
- *
- * @brief implementation of rdtsc buckets.
- *
- * Notes:
- *
- ******************************************************************************/
-#include "rdtsc_buckets.h"
-#include <inttypes.h>
-
-#if defined(_WIN32)
-#define PATH_SEPARATOR "\\"
-#elif defined(__unix__) || defined(__APPLE__)
-#define PATH_SEPARATOR "/"
-#else
-#error "Unsupported platform"
-#endif
-
-THREAD UINT tlsThreadId = 0;
-
-BucketManager::~BucketManager()
-{
-}
-
-void BucketManager::RegisterThread(const std::string& name)
-{
-
-    BUCKET_THREAD newThread;
-    newThread.name = name;
-    newThread.root.children.reserve(mBuckets.size());
-    newThread.root.id      = 0;
-    newThread.root.pParent = nullptr;
-    newThread.pCurrent     = &newThread.root;
-
-    mThreadMutex.lock();
-
-    // assign unique thread id for this thread
-    size_t id    = mThreads.size();
-    newThread.id = (UINT)id;
-    tlsThreadId  = (UINT)id;
-
-    // store new thread
-    mThreads.push_back(newThread);
-
-    mThreadMutex.unlock();
-}
-
-UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc)
-{
-    mThreadMutex.lock();
-    size_t id = mBuckets.size();
-    mBuckets.push_back(desc);
-    mThreadMutex.unlock();
-    return (UINT)id;
-}
-
-void BucketManager::PrintBucket(
-    FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket)
-{
-    const char* arrows[] = {
-        "",
-        "|-> ",
-        "    |-> ",
-        "        |-> ",
-        "            |-> ",
-        "                |-> ",
-        "                    |-> ",
-        "                        |-> ",
-        "                            |-> ",
-    };
-
-    // compute percent of total cycles used by this bucket
-    float percentTotal = (float)((double)bucket.elapsed / (double)threadCycles * 100.0);
-
-    // compute percent of parent cycles used by this bucket
-    float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0);
-
-    // compute average cycle count per invocation
-    uint64_t CPE = bucket.elapsed / bucket.count;
-
-    BUCKET_DESC& desc = mBuckets[bucket.id];
-
-    // construct hierarchy visualization
-    std::string str = arrows[level];
-    str += desc.name;
-    char hier[80];
-    strcpy_s(hier, sizeof(hier)-1, str.c_str());
-
-    // print out
-    fprintf(f,
-            "%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n",
-            percentTotal,
-            percentParent,
-            bucket.elapsed,
-            CPE,
-            bucket.count,
-            (unsigned long)0,
-            (uint32_t)0,
-            hier);
-
-    // dump all children of this bucket
-    for (const BUCKET& child : bucket.children)
-    {
-        if (child.count)
-        {
-            PrintBucket(f, level + 1, threadCycles, bucket.elapsed, child);
-        }
-    }
-}
-
-void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread)
-{
-    // print header
-    fprintf(f, "\nThread %u (%s)\n", thread.id, thread.name.c_str());
-    fprintf(f, " %%Tot   %%Par  Cycles     CPE        NumEvent   CPE2       NumEvent2  Bucket\n");
-
-    // compute thread level total cycle counts across all buckets from root
-    const BUCKET& root        = thread.root;
-    uint64_t      totalCycles = 0;
-    for (const BUCKET& child : root.children)
-    {
-        totalCycles += child.elapsed;
-    }
-
-    for (const BUCKET& child : root.children)
-    {
-        if (child.count)
-        {
-            PrintBucket(f, 0, totalCycles, totalCycles, child);
-        }
-    }
-}
-
-void BucketManager::PrintReport(const std::string& filename)
-{
-    {
-        FILE* f = fopen(filename.c_str(), "w");
-        assert(f);
-
-        mThreadMutex.lock();
-        for (const BUCKET_THREAD& thread : mThreads)
-        {
-            PrintThread(f, thread);
-            fprintf(f, "\n");
-        }
-
-        mThreadMutex.unlock();
-
-        fclose(f);
-    }
-}
-
-
-void BucketManager::StartCapture()
-{
-
-    printf("Capture Starting\n");
-
-    mCapturing = true;
-}
-
-void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id)
-{
-    pBucketMgr->StartBucket(id);
-}
-
-void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id)
-{
-    pBucketMgr->StopBucket(id);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
deleted file mode 100644
index b00cbf63eba..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
+++ /dev/null
@@ -1,227 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file rdtsc_buckets.h
- *
- * @brief declaration for rdtsc buckets.
- *
- * Notes:
- *
- ******************************************************************************/
-#pragma once
-
-#include "os.h"
-#include <vector>
-#include <mutex>
-#include <sstream>
-
-#include "rdtsc_buckets_shared.h"
-
-
-// unique thread id stored in thread local storage
-extern THREAD UINT tlsThreadId;
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief BucketManager encapsulates a single instance of the buckets
-///        functionality. There can be one or many bucket managers active
-///        at any time.  The manager owns all the threads and
-///        bucket information that have been registered to it.
-class BucketManager
-{
-public:
-
-    uint32_t mCurrentFrame;
-    std::vector<uint32_t> mBucketMap;
-    bool                  mBucketsInitialized;
-    std::string           mBucketMgrName;
-
-
-    BucketManager(std::string name) : mCurrentFrame(0), mBucketsInitialized(false), mBucketMgrName(name) 
-    {
-        mBucketMap.clear();
-    }
-    ~BucketManager();
-
-    // removes all registered thread data
-    void ClearThreads()
-    {
-        mThreadMutex.lock();
-        mThreads.clear();
-        mThreadMutex.unlock();
-    }
-
-    // removes all registered buckets
-    void ClearBuckets()
-    {
-        mThreadMutex.lock();
-        mBuckets.clear();
-        mThreadMutex.unlock();
-    }
-
-    /// Registers a new thread with the manager.
-    /// @param name - name of thread, used for labels in reports and threadviz
-    void RegisterThread(const std::string& name);
-
-    /// Registers a new bucket type with the manager.  Returns a unique
-    /// id which should be used in subsequent calls to start/stop the bucket
-    /// @param desc - description of the bucket
-    /// @return unique id
-    UINT RegisterBucket(const BUCKET_DESC& desc);
-
-    // print report
-    void PrintReport(const std::string& filename);
-
-
-    // start capturing
-    void StartCapture();
-
-    // stop capturing
-    INLINE void StopCapture()
-    {
-        mCapturing = false;
-
-        // wait for all threads to pop back to root bucket
-        bool stillCapturing = true;
-        while (stillCapturing)
-        {
-            stillCapturing = false;
-            for (const BUCKET_THREAD& t : mThreads)
-            {
-                if (t.level > 0)
-                {
-                    stillCapturing = true;
-                    continue;
-                }
-            }
-        }
-
-        mDoneCapturing = true;
-        printf("Capture Stopped\n");
-    }
-
-    // start a bucket
-    // @param id generated by RegisterBucket
-    INLINE void StartBucket(UINT id)
-    {
-        if (!mCapturing)
-            return;
-
-        SWR_ASSERT(tlsThreadId < mThreads.size());
-
-        BUCKET_THREAD& bt = mThreads[tlsThreadId];
-
-        uint64_t tsc = __rdtsc();
-
-        {
-            if (bt.pCurrent->children.size() < mBuckets.size())
-            {
-                bt.pCurrent->children.resize(mBuckets.size());
-            }
-            BUCKET& child = bt.pCurrent->children[id];
-            child.pParent = bt.pCurrent;
-            child.id      = id;
-            child.start   = tsc;
-
-            // update thread's currently executing bucket
-            bt.pCurrent = &child;
-        }
-
-
-        bt.level++;
-    }
-
-    // stop the currently executing bucket
-    INLINE void StopBucket(UINT id)
-    {
-        SWR_ASSERT(tlsThreadId < mThreads.size());
-        BUCKET_THREAD& bt = mThreads[tlsThreadId];
-
-        if (bt.level == 0)
-        {
-            return;
-        }
-
-        uint64_t tsc = __rdtsc();
-
-        {
-            if (bt.pCurrent->start == 0)
-                return;
-            SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected");
-
-            bt.pCurrent->elapsed += (tsc - bt.pCurrent->start);
-            bt.pCurrent->count++;
-
-            // pop to parent
-            bt.pCurrent = bt.pCurrent->pParent;
-        }
-
-        bt.level--;
-    }
-
-    INLINE void AddEvent(uint32_t id, uint32_t count)
-    {
-        if (!mCapturing)
-            return;
-
-        SWR_ASSERT(tlsThreadId < mThreads.size());
-
-        BUCKET_THREAD& bt = mThreads[tlsThreadId];
-
-        // don't record events for threadviz
-        {
-            if (bt.pCurrent->children.size() < mBuckets.size())
-            {
-                bt.pCurrent->children.resize(mBuckets.size());
-            }
-            BUCKET& child = bt.pCurrent->children[id];
-            child.pParent = bt.pCurrent;
-            child.id      = id;
-            child.count += count;
-        }
-    }
-
-private:
-    void PrintBucket(
-        FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket);
-    void PrintThread(FILE* f, const BUCKET_THREAD& thread);
-
-    // list of active threads that have registered with this manager
-    std::vector<BUCKET_THREAD> mThreads;
-
-    // list of buckets registered with this manager
-    std::vector<BUCKET_DESC> mBuckets;
-
-    // is capturing currently enabled
-    volatile bool mCapturing{false};
-
-    // has capturing completed
-    volatile bool mDoneCapturing{false};
-
-    std::mutex mThreadMutex;
-
-    std::string mThreadVizDir;
-
-};
-
-// C helpers for jitter
-void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id);
-void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id);
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
deleted file mode 100644
index fd3b1df746a..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file rdtsc_buckets.h
- *
- * @brief declaration for rdtsc buckets.
- *
- * Notes:
- *
- ******************************************************************************/
-#pragma once
-
-#include <vector>
-#include <cassert>
-
-struct BUCKET
-{
-    uint32_t id{0};
-    uint64_t start{0};
-    uint64_t elapsed{0};
-    uint32_t count{0};
-
-    BUCKET*             pParent{nullptr};
-    std::vector<BUCKET> children;
-};
-
-struct BUCKET_DESC
-{
-    // name of bucket, used in reports
-    std::string name;
-
-    // description of bucket, used in threadviz
-    std::string description;
-
-    // enable for threadviz dumping
-    bool enableThreadViz;
-
-    // threadviz color of bucket, in RGBA8_UNORM format
-    uint32_t color;
-};
-
-
-struct BUCKET_THREAD
-{
-    // name of thread, used in reports
-    std::string name;
-
-    // id for this thread, assigned by the thread manager
-    uint32_t id{0};
-
-    // root of the bucket hierarchy for this thread
-    BUCKET root;
-
-    // currently executing bucket somewhere in the hierarchy
-    BUCKET* pCurrent{nullptr};
-
-    // currently executing hierarchy level
-    uint32_t level{0};
-
-    // threadviz file object
-    FILE* vizFile{nullptr};
-
-
-    BUCKET_THREAD() {}
-    BUCKET_THREAD(const BUCKET_THREAD& that)
-    {
-        name     = that.name;
-        id       = that.id;
-        root     = that.root;
-        pCurrent = &root;
-        vizFile  = that.vizFile;
-    }
-};
-
-enum VIZ_TYPE
-{
-    VIZ_START = 0,
-    VIZ_STOP  = 1,
-    VIZ_DATA  = 2
-};
-
-struct VIZ_START_DATA
-{
-    uint8_t  type;
-    uint32_t bucketId;
-    uint64_t timestamp;
-};
-
-struct VIZ_STOP_DATA
-{
-    uint8_t  type;
-    uint64_t timestamp;
-};
-
-inline void Serialize(FILE* f, const VIZ_START_DATA& data)
-{
-    fwrite(&data, sizeof(VIZ_START_DATA), 1, f);
-}
-
-inline void Deserialize(FILE* f, VIZ_START_DATA& data)
-{
-    fread(&data, sizeof(VIZ_START_DATA), 1, f);
-    assert(data.type == VIZ_START);
-}
-
-inline void Serialize(FILE* f, const VIZ_STOP_DATA& data)
-{
-    fwrite(&data, sizeof(VIZ_STOP_DATA), 1, f);
-}
-
-inline void Deserialize(FILE* f, VIZ_STOP_DATA& data)
-{
-    fread(&data, sizeof(VIZ_STOP_DATA), 1, f);
-    assert(data.type == VIZ_STOP);
-}
-
-inline void Serialize(FILE* f, const std::string& string)
-{
-    assert(string.size() <= 256);
-
-    uint8_t length = (uint8_t)string.size();
-    fwrite(&length, sizeof(length), 1, f);
-    fwrite(string.c_str(), string.size(), 1, f);
-}
-
-inline void Deserialize(FILE* f, std::string& string)
-{
-    char    cstr[256];
-    uint8_t length;
-    fread(&length, sizeof(length), 1, f);
-    fread(cstr, length, 1, f);
-    cstr[length] = 0;
-    string.assign(cstr);
-}
-
-inline void Serialize(FILE* f, const BUCKET_DESC& desc)
-{
-    Serialize(f, desc.name);
-    Serialize(f, desc.description);
-    fwrite(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
-    fwrite(&desc.color, sizeof(desc.color), 1, f);
-}
-
-inline void Deserialize(FILE* f, BUCKET_DESC& desc)
-{
-    Deserialize(f, desc.name);
-    Deserialize(f, desc.description);
-    fread(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
-    fread(&desc.color, sizeof(desc.color), 1, f);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
deleted file mode 100644
index 5964edff4d3..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-
-#ifndef __SWR_SIMD16INTRIN_H__
-#define __SWR_SIMD16INTRIN_H__
-
-#if KNOB_SIMD16_WIDTH == 16
-typedef SIMD512 SIMD16;
-#else
-#error Unsupported vector width
-#endif // KNOB_SIMD16_WIDTH == 16
-
-#define _simd16_setzero_ps SIMD16::setzero_ps
-#define _simd16_setzero_si SIMD16::setzero_si
-#define _simd16_set1_ps SIMD16::set1_ps
-#define _simd16_set1_epi8 SIMD16::set1_epi8
-#define _simd16_set1_epi32 SIMD16::set1_epi32
-#define _simd16_set_ps SIMD16::set_ps
-#define _simd16_set_epi32 SIMD16::set_epi32
-#define _simd16_load_ps SIMD16::load_ps
-#define _simd16_loadu_ps SIMD16::loadu_ps
-#if 1
-#define _simd16_load1_ps SIMD16::broadcast_ss
-#endif
-#define _simd16_load_si SIMD16::load_si
-#define _simd16_loadu_si SIMD16::loadu_si
-#define _simd16_broadcast_ss(m) SIMD16::broadcast_ss((float const*)m)
-#define _simd16_store_ps SIMD16::store_ps
-#define _simd16_store_si SIMD16::store_si
-#define _simd16_extract_ps(a, imm8) SIMD16::extract_ps<imm8>(a)
-#define _simd16_extract_si(a, imm8) SIMD16::extract_si<imm8>(a)
-#define _simd16_insert_ps(a, b, imm8) SIMD16::insert_ps<imm8>(a, b)
-#define _simd16_insert_si(a, b, imm8) SIMD16::insert_si<imm8>(a, b)
-#define _simd16_maskstore_ps SIMD16::maskstore_ps
-#define _simd16_blend_ps(a, b, mask) SIMD16::blend_ps<mask>(a, b)
-#define _simd16_blendv_ps SIMD16::blendv_ps
-#define _simd16_blendv_epi32 SIMD16::blendv_epi32
-#define _simd16_mul_ps SIMD16::mul_ps
-#define _simd16_div_ps SIMD16::div_ps
-#define _simd16_add_ps SIMD16::add_ps
-#define _simd16_sub_ps SIMD16::sub_ps
-#define _simd16_rsqrt_ps SIMD16::rsqrt_ps
-#define _simd16_min_ps SIMD16::min_ps
-#define _simd16_max_ps SIMD16::max_ps
-#define _simd16_movemask_ps SIMD16::movemask_ps
-#define _simd16_movemask_pd SIMD16::movemask_pd
-#define _simd16_cvtps_epi32 SIMD16::cvtps_epi32
-#define _simd16_cvttps_epi32 SIMD16::cvttps_epi32
-#define _simd16_cvtepi32_ps SIMD16::cvtepi32_ps
-#define _simd16_cmp_ps(a, b, comp) SIMD16::cmp_ps<SIMD16::CompareType(comp)>(a, b)
-#define _simd16_cmplt_ps SIMD16::cmplt_ps
-#define _simd16_cmpgt_ps SIMD16::cmpgt_ps
-#define _simd16_cmpneq_ps SIMD16::cmpneq_ps
-#define _simd16_cmpeq_ps SIMD16::cmpeq_ps
-#define _simd16_cmpge_ps SIMD16::cmpge_ps
-#define _simd16_cmple_ps SIMD16::cmple_ps
-#define _simd16_castsi_ps SIMD16::castsi_ps
-#define _simd16_castps_si SIMD16::castps_si
-#define _simd16_castsi_pd SIMD16::castsi_pd
-#define _simd16_castpd_si SIMD16::castpd_si
-#define _simd16_castpd_ps SIMD16::castpd_ps
-#define _simd16_castps_pd SIMD16::castps_pd
-#define _simd16_and_ps SIMD16::and_ps
-#define _simd16_andnot_ps SIMD16::andnot_ps
-#define _simd16_or_ps SIMD16::or_ps
-#define _simd16_xor_ps SIMD16::xor_ps
-#define _simd16_round_ps(a, mode) SIMD16::round_ps<SIMD16::RoundMode(mode)>(a)
-#define _simd16_mul_epi32 SIMD16::mul_epi32
-#define _simd16_mullo_epi32 SIMD16::mullo_epi32
-#define _simd16_sub_epi32 SIMD16::sub_epi32
-#define _simd16_sub_epi64 SIMD16::sub_epi64
-#define _simd16_min_epi32 SIMD16::min_epi32
-#define _simd16_max_epi32 SIMD16::max_epi32
-#define _simd16_min_epu32 SIMD16::min_epu32
-#define _simd16_max_epu32 SIMD16::max_epu32
-#define _simd16_add_epi32 SIMD16::add_epi32
-#define _simd16_and_si SIMD16::and_si
-#define _simd16_andnot_si SIMD16::andnot_si
-#define _simd16_or_si SIMD16::or_si
-#define _simd16_xor_si SIMD16::xor_si
-#define _simd16_cmpeq_epi32 SIMD16::cmpeq_epi32
-#define _simd16_cmpgt_epi32 SIMD16::cmpgt_epi32
-#define _simd16_cmplt_epi32 SIMD16::cmplt_epi32
-#define _simd16_testz_ps SIMD16::testz_ps
-#define _simd16_unpacklo_ps SIMD16::unpacklo_ps
-#define _simd16_unpackhi_ps SIMD16::unpackhi_ps
-#define _simd16_unpacklo_pd SIMD16::unpacklo_pd
-#define _simd16_unpackhi_pd SIMD16::unpackhi_pd
-#define _simd16_unpacklo_epi8 SIMD16::unpacklo_epi8
-#define _simd16_unpackhi_epi8 SIMD16::unpackhi_epi8
-#define _simd16_unpacklo_epi16 SIMD16::unpacklo_epi16
-#define _simd16_unpackhi_epi16 SIMD16::unpackhi_epi16
-#define _simd16_unpacklo_epi32 SIMD16::unpacklo_epi32
-#define _simd16_unpackhi_epi32 SIMD16::unpackhi_epi32
-#define _simd16_unpacklo_epi64 SIMD16::unpacklo_epi64
-#define _simd16_unpackhi_epi64 SIMD16::unpackhi_epi64
-#define _simd16_slli_epi32(a, i) SIMD16::slli_epi32<i>(a)
-#define _simd16_srli_epi32(a, i) SIMD16::srli_epi32<i>(a)
-#define _simd16_srai_epi32(a, i) SIMD16::srai_epi32<i>(a)
-#define _simd16_fmadd_ps SIMD16::fmadd_ps
-#define _simd16_fmsub_ps SIMD16::fmsub_ps
-#define _simd16_adds_epu8 SIMD16::adds_epu8
-#define _simd16_subs_epu8 SIMD16::subs_epu8
-#define _simd16_add_epi8 SIMD16::add_epi8
-#define _simd16_shuffle_epi8 SIMD16::shuffle_epi8
-
-#define _simd16_i32gather_ps(m, index, scale) \
-    SIMD16::i32gather_ps<SIMD16::ScaleFactor(scale)>(m, index)
-#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) \
-    SIMD16::mask_i32gather_ps<SIMD16::ScaleFactor(scale)>(a, m, index, mask)
-
-#define _simd16_abs_epi32 SIMD16::abs_epi32
-
-#define _simd16_cmpeq_epi64 SIMD16::cmpeq_epi64
-#define _simd16_cmpgt_epi64 SIMD16::cmpgt_epi64
-#define _simd16_cmpeq_epi16 SIMD16::cmpeq_epi16
-#define _simd16_cmpgt_epi16 SIMD16::cmpgt_epi16
-#define _simd16_cmpeq_epi8 SIMD16::cmpeq_epi8
-#define _simd16_cmpgt_epi8 SIMD16::cmpgt_epi8
-
-#define _simd16_permute_ps_i(a, i) SIMD16::permute_ps<i>(a)
-#define _simd16_permute_ps SIMD16::permute_ps
-#define _simd16_permute_epi32 SIMD16::permute_epi32
-#define _simd16_sllv_epi32 SIMD16::sllv_epi32
-#define _simd16_srlv_epi32 SIMD16::sllv_epi32
-#define _simd16_permute2f128_ps(a, b, i) SIMD16::permute2f128_ps<i>(a, b)
-#define _simd16_permute2f128_pd(a, b, i) SIMD16::permute2f128_pd<i>(a, b)
-#define _simd16_permute2f128_si(a, b, i) SIMD16::permute2f128_si<i>(a, b)
-#define _simd16_shuffle_ps(a, b, i) SIMD16::shuffle_ps<i>(a, b)
-#define _simd16_shuffle_pd(a, b, i) SIMD16::shuffle_pd<i>(a, b)
-#define _simd16_shuffle_epi32(a, b, imm8) SIMD16::shuffle_epi32<imm8>(a, b)
-#define _simd16_shuffle_epi64(a, b, imm8) SIMD16::shuffle_epi64<imm8>(a, b)
-#define _simd16_cvtepu8_epi16 SIMD16::cvtepu8_epi16
-#define _simd16_cvtepu8_epi32 SIMD16::cvtepu8_epi32
-#define _simd16_cvtepu16_epi32 SIMD16::cvtepu16_epi32
-#define _simd16_cvtepu16_epi64 SIMD16::cvtepu16_epi64
-#define _simd16_cvtepu32_epi64 SIMD16::cvtepu32_epi64
-#define _simd16_packus_epi16 SIMD16::packus_epi16
-#define _simd16_packs_epi16 SIMD16::packs_epi16
-#define _simd16_packus_epi32 SIMD16::packus_epi32
-#define _simd16_packs_epi32 SIMD16::packs_epi32
-#define _simd16_cmplt_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ>
-#define _simd16_cmpeq_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>
-#define _simd16_int2mask(mask) simd16mask(mask)
-#define _simd16_mask2int(mask) int(mask)
-#define _simd16_vmask_ps SIMD16::vmask_ps
-
-#endif //__SWR_SIMD16INTRIN_H_
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
deleted file mode 100644
index ebb4f4b7f11..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ /dev/null
@@ -1,322 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-
-#ifndef __SWR_SIMDINTRIN_H__
-#define __SWR_SIMDINTRIN_H__
-
-#include "common/intrin.h"
-#include "common/simdlib.hpp"
-
-// Only KNOB_SIMD_WIDTH == 8 is supported; it selects SIMD256 as the native SIMD type.
-#if KNOB_SIMD_WIDTH == 8
-typedef SIMD256 SIMD;
-#else
-#error Unsupported vector width
-#endif // KNOB_SIMD_WIDTH == 8
-
-#define _simd128_maskstore_ps SIMD128::maskstore_ps
-#define _simd128_fmadd_ps SIMD128::fmadd_ps
-
-#define _simd_load_ps SIMD::load_ps
-#define _simd_load1_ps SIMD::broadcast_ss
-#define _simd_loadu_ps SIMD::loadu_ps
-#define _simd_setzero_ps SIMD::setzero_ps
-#define _simd_set1_ps SIMD::set1_ps
-#define _simd_blend_ps(a, b, i) SIMD::blend_ps<i>(a, b)
-#define _simd_blend_epi32(a, b, i) SIMD::blend_epi32<i>(a, b)
-#define _simd_blendv_ps SIMD::blendv_ps
-#define _simd_store_ps SIMD::store_ps
-#define _simd_mul_ps SIMD::mul_ps
-#define _simd_add_ps SIMD::add_ps
-#define _simd_sub_ps SIMD::sub_ps
-#define _simd_rsqrt_ps SIMD::rsqrt_ps
-#define _simd_min_ps SIMD::min_ps
-#define _simd_max_ps SIMD::max_ps
-#define _simd_movemask_ps SIMD::movemask_ps
-#define _simd_cvtps_epi32 SIMD::cvtps_epi32
-#define _simd_cvttps_epi32 SIMD::cvttps_epi32
-#define _simd_cvtepi32_ps SIMD::cvtepi32_ps
-#define _simd_cmplt_ps SIMD::cmplt_ps
-#define _simd_cmpgt_ps SIMD::cmpgt_ps
-#define _simd_cmpneq_ps SIMD::cmpneq_ps
-#define _simd_cmpeq_ps SIMD::cmpeq_ps
-#define _simd_cmpge_ps SIMD::cmpge_ps
-#define _simd_cmple_ps SIMD::cmple_ps
-#define _simd_cmp_ps(a, b, imm) SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b)
-#define _simd_and_ps SIMD::and_ps
-#define _simd_or_ps SIMD::or_ps
-#define _simd_rcp_ps SIMD::rcp_ps
-#define _simd_div_ps SIMD::div_ps
-// Bit-cast, rounding, broadcast and streaming-store aliases (each name defined exactly once).
-#define _simd_castsi_ps SIMD::castsi_ps
-#define _simd_castps_pd SIMD::castps_pd
-#define _simd_castpd_ps SIMD::castpd_ps
-#define _simd_andnot_ps SIMD::andnot_ps
-#define _simd_round_ps(a, i) SIMD::round_ps<SIMD::RoundMode(i)>(a)
-#define _simd_broadcast_ps(a) SIMD::broadcast_ps((SIMD128::Float const*)(a))
-#define _simd_stream_ps SIMD::stream_ps
-
-#define _simd_movemask_pd SIMD::movemask_pd
-#define _simd_castsi_pd SIMD::castsi_pd
-
-#define _simd_mul_epi32 SIMD::mul_epi32
-#define _simd_mullo_epi32 SIMD::mullo_epi32
-#define _simd_sub_epi32 SIMD::sub_epi32
-#define _simd_sub_epi64 SIMD::sub_epi64
-#define _simd_min_epi32 SIMD::min_epi32
-#define _simd_min_epu32 SIMD::min_epu32
-#define _simd_max_epi32 SIMD::max_epi32
-#define _simd_max_epu32 SIMD::max_epu32
-#define _simd_add_epi32 SIMD::add_epi32
-#define _simd_and_si SIMD::and_si
-#define _simd_andnot_si SIMD::andnot_si
-#define _simd_cmpeq_epi32 SIMD::cmpeq_epi32
-#define _simd_cmplt_epi32 SIMD::cmplt_epi32
-#define _simd_cmpgt_epi32 SIMD::cmpgt_epi32
-#define _simd_or_si SIMD::or_si
-#define _simd_xor_si SIMD::xor_si
-#define _simd_castps_si SIMD::castps_si
-#define _simd_adds_epu8 SIMD::adds_epu8
-#define _simd_subs_epu8 SIMD::subs_epu8
-#define _simd_add_epi8 SIMD::add_epi8
-#define _simd_cmpeq_epi64 SIMD::cmpeq_epi64
-#define _simd_cmpgt_epi64 SIMD::cmpgt_epi64
-#define _simd_cmpgt_epi8 SIMD::cmpgt_epi8
-#define _simd_cmpeq_epi8 SIMD::cmpeq_epi8
-#define _simd_cmpgt_epi16 SIMD::cmpgt_epi16
-#define _simd_cmpeq_epi16 SIMD::cmpeq_epi16
-#define _simd_movemask_epi8 SIMD::movemask_epi8
-#define _simd_permute_ps_i(a, i) SIMD::permute_ps<i>(a)
-#define _simd_permute_ps SIMD::permute_ps
-#define _simd_permute_epi32 SIMD::permute_epi32
-#define _simd_srlv_epi32 SIMD::srlv_epi32
-#define _simd_sllv_epi32 SIMD::sllv_epi32
-
-#define _simd_unpacklo_epi8 SIMD::unpacklo_epi8
-#define _simd_unpackhi_epi8 SIMD::unpackhi_epi8
-#define _simd_unpacklo_epi16 SIMD::unpacklo_epi16
-#define _simd_unpackhi_epi16 SIMD::unpackhi_epi16
-#define _simd_unpacklo_epi32 SIMD::unpacklo_epi32
-#define _simd_unpackhi_epi32 SIMD::unpackhi_epi32
-#define _simd_unpacklo_epi64 SIMD::unpacklo_epi64
-#define _simd_unpackhi_epi64 SIMD::unpackhi_epi64
-
-#define _simd_slli_epi32(a, i) SIMD::slli_epi32<i>(a)
-#define _simd_srai_epi32(a, i) SIMD::srai_epi32<i>(a)
-#define _simd_srli_epi32(a, i) SIMD::srli_epi32<i>(a)
-#define _simd_srlisi_ps(a, i) SIMD::srlisi_ps<i>(a)
-
-#define _simd_fmadd_ps SIMD::fmadd_ps
-#define _simd_fmsub_ps SIMD::fmsub_ps
-#define _simd_shuffle_epi8 SIMD::shuffle_epi8
-
-#define _simd_i32gather_ps(p, o, s) SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o)
-#define _simd_mask_i32gather_ps(r, p, o, m, s) \
-    SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m)
-#define _simd_abs_epi32 SIMD::abs_epi32
-
-#define _simd_cvtepu8_epi16 SIMD::cvtepu8_epi16
-#define _simd_cvtepu8_epi32 SIMD::cvtepu8_epi32
-#define _simd_cvtepu16_epi32 SIMD::cvtepu16_epi32
-#define _simd_cvtepu16_epi64 SIMD::cvtepu16_epi64
-#define _simd_cvtepu32_epi64 SIMD::cvtepu32_epi64
-
-#define _simd_packus_epi16 SIMD::packus_epi16
-#define _simd_packs_epi16 SIMD::packs_epi16
-#define _simd_packus_epi32 SIMD::packus_epi32
-#define _simd_packs_epi32 SIMD::packs_epi32
-
-#define _simd_unpacklo_ps SIMD::unpacklo_ps
-#define _simd_unpackhi_ps SIMD::unpackhi_ps
-#define _simd_unpacklo_pd SIMD::unpacklo_pd
-#define _simd_unpackhi_pd SIMD::unpackhi_pd
-#define _simd_insertf128_ps SIMD::insertf128_ps
-#define _simd_insertf128_pd SIMD::insertf128_pd
-#define _simd_insertf128_si(a, b, i) SIMD::insertf128_si<i>(a, b)
-#define _simd_extractf128_ps(a, i) SIMD::extractf128_ps<i>(a)
-#define _simd_extractf128_pd(a, i) SIMD::extractf128_pd<i>(a)
-#define _simd_extractf128_si(a, i) SIMD::extractf128_si<i>(a)
-#define _simd_permute2f128_ps(a, b, i) SIMD::permute2f128_ps<i>(a, b)
-#define _simd_permute2f128_pd(a, b, i) SIMD::permute2f128_pd<i>(a, b)
-#define _simd_permute2f128_si(a, b, i) SIMD::permute2f128_si<i>(a, b)
-#define _simd_shuffle_ps(a, b, i) SIMD::shuffle_ps<i>(a, b)
-#define _simd_shuffle_pd(a, b, i) SIMD::shuffle_pd<i>(a, b)
-#define _simd_shuffle_epi32(a, b, imm8) SIMD::shuffle_epi32<imm8>(a, b)
-#define _simd_shuffle_epi64(a, b, imm8) SIMD::shuffle_epi64<imm8>(a, b)
-// Set / load / store / test aliases.
-// _simd_cvttps_epi32 and _simd_sub_ps are defined with the float operations earlier in this
-// header and are intentionally not repeated here.
-#define _simd_set1_epi32 SIMD::set1_epi32
-#define _simd_set_epi32 SIMD::set_epi32
-#define _simd_set_ps SIMD::set_ps
-#define _simd_set1_epi8 SIMD::set1_epi8
-#define _simd_setzero_si SIMD::setzero_si
-#define _simd_store_si SIMD::store_si
-#define _simd_broadcast_ss SIMD::broadcast_ss
-#define _simd_maskstore_ps SIMD::maskstore_ps
-#define _simd_load_si SIMD::load_si
-#define _simd_loadu_si SIMD::loadu_si
-#define _simd_testz_ps SIMD::testz_ps
-#define _simd_testz_si SIMD::testz_si
-#define _simd_xor_ps SIMD::xor_ps
-
-#define _simd_loadu2_si SIMD::loadu2_si
-#define _simd_storeu2_si SIMD::storeu2_si
-
-#define _simd_blendv_epi32 SIMD::blendv_epi32
-#define _simd_vmask_ps SIMD::vmask_ps
-
-/// @brief Blends two 128-bit integer vectors by reinterpreting them as floats,
-///        applying SIMD128::blend_ps<mask>, and reinterpreting the result back.
-/// @tparam mask - compile-time selector forwarded unchanged to SIMD128::blend_ps
-template <int mask>
-SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const& a, SIMD128::Integer const& b)
-{
-    SIMD128::Float const aBits   = SIMD128::castsi_ps(a);
-    SIMD128::Float const bBits   = SIMD128::castsi_ps(b);
-    SIMD128::Float const blended = SIMD128::blend_ps<mask>(aBits, bBits);
-    return SIMD128::castps_si(blended);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Evaluate the plane equation vA * vX + vB * vY + vC per lane,
-///        as two chained multiply-adds.
-SIMDINLINE simdscalar vplaneps(simdscalar const& vA,
-                               simdscalar const& vB,
-                               simdscalar const& vC,
-                               simdscalar const& vX,
-                               simdscalar const& vY)
-{
-    // First fold the X term into the constant, then add the Y term on top.
-    simdscalar const vAxPlusC = _simd_fmadd_ps(vA, vX, vC);
-    return _simd_fmadd_ps(vB, vY, vAxPlusC);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Evaluate the plane equation vA * vX + vB * vY + vC per lane
-///        (128-bit overload), as two chained multiply-adds.
-SIMDINLINE simd4scalar vplaneps(simd4scalar const& vA,
-                                simd4scalar const& vB,
-                                simd4scalar const& vC,
-                                simd4scalar const& vX,
-                                simd4scalar const& vY)
-{
-    // First fold the X term into the constant, then add the Y term on top.
-    simd4scalar const vAxPlusC = _simd128_fmadd_ps(vA, vX, vC);
-    return _simd128_fmadd_ps(vB, vY, vAxPlusC);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Interpolates a single component.
-///        Computes A * vI + B * vJ + C * (1 - vI - vJ) from the three coefficients
-///        read for the component, or broadcasts A unchanged when all three are equal.
-/// @param vI - barycentric I
-/// @param vJ - barycentric J
-/// @param pInterpBuffer - pointer to attribute barycentric coeffs
-template <UINT Attrib, UINT Comp, UINT numComponents = 4>
-static SIMDINLINE simdscalar InterpolateComponent(simdscalar const& vI,
-                                                  simdscalar const& vJ,
-                                                  const float*      pInterpBuffer)
-{
-    // Each attribute occupies three consecutive groups of numComponents floats;
-    // pick this component's entry out of each group.
-    const UINT   firstIndex = Attrib * 3 * numComponents + Comp;
-    const float* pCoeffA    = pInterpBuffer + firstIndex;
-    const float* pCoeffB    = pInterpBuffer + (firstIndex + numComponents);
-    const float* pCoeffC    = pInterpBuffer + (firstIndex + numComponents * 2);
-
-    const bool isConstant = (*pCoeffA == *pCoeffB) && (*pCoeffA == *pCoeffC);
-    if (isConstant)
-    {
-        // Ensure constant attribs are constant.  Required for proper
-        // 3D resource copies.
-        return _simd_broadcast_ss(pCoeffA);
-    }
-
-    simdscalar const vA = _simd_broadcast_ss(pCoeffA);
-    simdscalar const vB = _simd_broadcast_ss(pCoeffB);
-
-    // The third weight is k = 1 - I - J; fold it into the C term up front.
-    simdscalar const vOne     = _simd_set1_ps(1.0f);
-    simdscalar const vK       = _simd_sub_ps(_simd_sub_ps(vOne, vI), vJ);
-    simdscalar const vScaledC = _simd_mul_ps(vK, _simd_broadcast_ss(pCoeffC));
-
-    return vplaneps(vA, vB, vScaledC, vI, vJ);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Interpolates a single component (flat shade): broadcasts the
-///        component's first stored coefficient to every lane.
-/// @param pInterpBuffer - pointer to attribute barycentric coeffs
-template <UINT Attrib, UINT Comp, UINT numComponents = 4>
-static SIMDINLINE simdscalar InterpolateComponentFlat(const float* pInterpBuffer)
-{
-    const UINT coeffIndex = Attrib * 3 * numComponents + 0 + Comp;
-    return _simd_broadcast_ss(pInterpBuffer + coeffIndex);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Interpolates a single integer component (flat shade): replicates
-///        the component's first stored 32-bit value into every lane.
-/// @param pInterpBuffer - pointer to attribute barycentric coeffs
-template <UINT Attrib, UINT Comp, UINT numComponents = 4>
-static SIMDINLINE simdscalari InterpolateComponentFlatInt(const uint32_t* pInterpBuffer)
-{
-    const UINT     coeffIndex = Attrib * 3 * numComponents + 0 + Comp;
-    const uint32_t rawValue   = pInterpBuffer[coeffIndex];
-    return _simd_set1_epi32(rawValue);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Interpolates a single component (128-bit overload).
-///        Computes A * vI + B * vJ + C * (1 - vI - vJ) from the three coefficients
-///        read for the component, or broadcasts A unchanged when all three are equal.
-/// @param vI - barycentric I
-/// @param vJ - barycentric J
-/// @param pInterpBuffer - pointer to attribute barycentric coeffs
-template <UINT Attrib, UINT Comp, UINT numComponents = 4>
-static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const& vI,
-                                                   simd4scalar const& vJ,
-                                                   const float*       pInterpBuffer)
-{
-    // Each attribute occupies three consecutive groups of numComponents floats;
-    // pick this component's entry out of each group.
-    const UINT   firstIndex = Attrib * 3 * numComponents + Comp;
-    const float* pCoeffA    = pInterpBuffer + firstIndex;
-    const float* pCoeffB    = pInterpBuffer + (firstIndex + numComponents);
-    const float* pCoeffC    = pInterpBuffer + (firstIndex + numComponents * 2);
-
-    const bool isConstant = (*pCoeffA == *pCoeffB) && (*pCoeffA == *pCoeffC);
-    if (isConstant)
-    {
-        // Ensure constant attribs are constant.  Required for proper
-        // 3D resource copies.
-        return SIMD128::broadcast_ss(pCoeffA);
-    }
-
-    simd4scalar const vA = SIMD128::broadcast_ss(pCoeffA);
-    simd4scalar const vB = SIMD128::broadcast_ss(pCoeffB);
-
-    // The third weight is k = 1 - I - J; fold it into the C term up front.
-    simd4scalar const vOne     = SIMD128::set1_ps(1.0f);
-    simd4scalar const vK       = SIMD128::sub_ps(SIMD128::sub_ps(vOne, vI), vJ);
-    simd4scalar const vScaledC = SIMD128::mul_ps(vK, SIMD128::broadcast_ss(pCoeffC));
-
-    return vplaneps(vA, vB, vScaledC, vI, vJ);
-}
-
-/// @brief Per-lane absolute value of a 128-bit float vector, obtained by masking
-///        each 32-bit lane with 0x7fffffff (clears the top bit).
-static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const& a)
-{
-    simd4scalari const keepAllButTopBit = SIMD128::set1_epi32(0x7fffffff);
-    simd4scalari const maskedBits       = SIMD128::and_si(SIMD128::castps_si(a), keepAllButTopBit);
-    return SIMD128::castsi_ps(maskedBits);
-}
-
-/// @brief Per-lane absolute value of a native-width float vector, obtained by
-///        masking each 32-bit lane with 0x7fffffff (clears the top bit).
-static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const& a)
-{
-    simdscalari const keepAllButTopBit = _simd_set1_epi32(0x7fffffff);
-    simdscalari const maskedBits       = _simd_and_si(_simd_castps_si(a), keepAllButTopBit);
-    return _simd_castsi_ps(maskedBits);
-}
-
-#include "simd16intrin.h"
-
-#endif //__SWR_SIMDINTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
deleted file mode 100644
index 53793ba101c..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
+++ /dev/null
@@ -1,234 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#pragma once
-
-#include "simdlib_types.hpp"
-
-// For documentation, please see the following include...
-// #include "simdlib_interface.hpp"
-
-namespace SIMDImpl
-{
-    namespace SIMD128Impl
-    {
-#if SIMD_ARCH >= SIMD_ARCH_AVX
-        struct AVXImpl
-        {
-#define __SIMD_LIB_AVX_HPP__
-#include "simdlib_128_avx.inl"
-#undef __SIMD_LIB_AVX_HPP__
-        }; // struct AVXImpl
-#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX
-
-#if SIMD_ARCH >= SIMD_ARCH_AVX2
-        struct AVX2Impl : AVXImpl
-        {
-#define __SIMD_LIB_AVX2_HPP__
-#include "simdlib_128_avx2.inl"
-#undef __SIMD_LIB_AVX2_HPP__
-        }; // struct AVX2Impl
-#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX2
-
-#if SIMD_ARCH >= SIMD_ARCH_AVX512
-        // 128-bit AVX-512 implementation. Inherits everything from AVX2Impl; the
-        // AVX-512 specific definitions are only included when SIMD_OPT_128_AVX512
-        // is defined, otherwise the struct body is empty.
-        struct AVX512Impl : AVX2Impl
-        {
-#if defined(SIMD_OPT_128_AVX512)
-#define __SIMD_LIB_AVX512_HPP__
-#include "simdlib_128_avx512.inl"
-#if defined(SIMD_ARCH_KNIGHTS)
-#include "simdlib_128_avx512_knights.inl"
-#else // optimize for core
-#include "simdlib_128_avx512_core.inl"
-#endif // defined(SIMD_ARCH_KNIGHTS)
-#undef __SIMD_LIB_AVX512_HPP__
-#endif     // SIMD_OPT_128_AVX512
-        }; // struct AVX512Impl
-#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX512
-
-        struct Traits : SIMDImpl::Traits
-        {
-#if SIMD_ARCH == SIMD_ARCH_AVX
-            using IsaImpl = AVXImpl;
-#elif SIMD_ARCH == SIMD_ARCH_AVX2
-            using IsaImpl = AVX2Impl;
-#elif SIMD_ARCH == SIMD_ARCH_AVX512
-            using IsaImpl = AVX512Impl;
-#else
-#error Invalid value for SIMD_ARCH
-#endif
-
-            using Float   = SIMD128Impl::Float;
-            using Double  = SIMD128Impl::Double;
-            using Integer = SIMD128Impl::Integer;
-            using Vec4    = SIMD128Impl::Vec4;
-            using Mask    = SIMD128Impl::Mask;
-        };
-    } // namespace SIMD128Impl
-
-    namespace SIMD256Impl
-    {
-#if SIMD_ARCH >= SIMD_ARCH_AVX
-        struct AVXImpl
-        {
-#define __SIMD_LIB_AVX_HPP__
-#include "simdlib_256_avx.inl"
-#undef __SIMD_LIB_AVX_HPP__
-        }; // struct AVXImpl
-#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX
-
-#if SIMD_ARCH >= SIMD_ARCH_AVX2
-        struct AVX2Impl : AVXImpl
-        {
-#define __SIMD_LIB_AVX2_HPP__
-#include "simdlib_256_avx2.inl"
-#undef __SIMD_LIB_AVX2_HPP__
-        }; // struct AVX2Impl
-#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX2
-
-#if SIMD_ARCH >= SIMD_ARCH_AVX512
-        // 256-bit AVX-512 implementation. Inherits everything from AVX2Impl; the
-        // AVX-512 specific definitions are only included when SIMD_OPT_256_AVX512
-        // is defined, otherwise the struct body is empty.
-        struct AVX512Impl : AVX2Impl
-        {
-#if defined(SIMD_OPT_256_AVX512)
-#define __SIMD_LIB_AVX512_HPP__
-#include "simdlib_256_avx512.inl"
-#if defined(SIMD_ARCH_KNIGHTS)
-#include "simdlib_256_avx512_knights.inl"
-#else // optimize for core
-#include "simdlib_256_avx512_core.inl"
-#endif // defined(SIMD_ARCH_KNIGHTS)
-#undef __SIMD_LIB_AVX512_HPP__
-#endif     // SIMD_OPT_256_AVX512
-        }; // struct AVX512Impl
-#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX512
-
-        struct Traits : SIMDImpl::Traits
-        {
-#if SIMD_ARCH == SIMD_ARCH_AVX
-            using IsaImpl = AVXImpl;
-#elif SIMD_ARCH == SIMD_ARCH_AVX2
-            using IsaImpl = AVX2Impl;
-#elif SIMD_ARCH == SIMD_ARCH_AVX512
-            using IsaImpl = AVX512Impl;
-#else
-#error Invalid value for SIMD_ARCH
-#endif
-
-            using Float   = SIMD256Impl::Float;
-            using Double  = SIMD256Impl::Double;
-            using Integer = SIMD256Impl::Integer;
-            using Vec4    = SIMD256Impl::Vec4;
-            using Mask    = SIMD256Impl::Mask;
-        };
-    } // namespace SIMD256Impl
-
-    namespace SIMD512Impl
-    {
-#if SIMD_ARCH >= SIMD_ARCH_AVX
-        template <typename SIMD256T>
-        struct AVXImplBase
-        {
-#define __SIMD_LIB_AVX_HPP__
-#include "simdlib_512_emu.inl"
-#include "simdlib_512_emu_masks.inl"
-#undef __SIMD_LIB_AVX_HPP__
-        }; // struct AVXImplBase
-        using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
-#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
-
-#if SIMD_ARCH >= SIMD_ARCH_AVX2
-        using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
-#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
-
-#if SIMD_ARCH >= SIMD_ARCH_AVX512
-        // 512-bit AVX-512 implementation. Derives from the emulation layer built on
-        // SIMD256Impl::AVX512Impl and adds the definitions from the AVX-512 .inl
-        // files, choosing the Knights or core variants via SIMD_ARCH_KNIGHTS.
-        struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
-        {
-#define __SIMD_LIB_AVX512_HPP__
-#include "simdlib_512_avx512.inl"
-#include "simdlib_512_avx512_masks.inl"
-#if defined(SIMD_ARCH_KNIGHTS)
-#include "simdlib_512_avx512_knights.inl"
-#include "simdlib_512_avx512_masks_knights.inl"
-#else // optimize for core
-#include "simdlib_512_avx512_core.inl"
-#include "simdlib_512_avx512_masks_core.inl"
-#endif // defined(SIMD_ARCH_KNIGHTS)
-#undef __SIMD_LIB_AVX512_HPP__
-        }; // struct AVX512Impl
-#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX512
-
-        struct Traits : SIMDImpl::Traits
-        {
-#if SIMD_ARCH == SIMD_ARCH_AVX
-            using IsaImpl = AVXImpl;
-#elif SIMD_ARCH == SIMD_ARCH_AVX2
-            using IsaImpl = AVX2Impl;
-#elif SIMD_ARCH == SIMD_ARCH_AVX512
-            using IsaImpl = AVX512Impl;
-#else
-#error Invalid value for SIMD_ARCH
-#endif
-
-            using Float   = SIMD512Impl::Float;
-            using Double  = SIMD512Impl::Double;
-            using Integer = SIMD512Impl::Integer;
-            using Vec4    = SIMD512Impl::Vec4;
-            using Mask    = SIMD512Impl::Mask;
-        };
-    } // namespace SIMD512Impl
-} // namespace SIMDImpl
-
-// Facade for one SIMD width. Derives from the ISA-specific implementation named
-// by Traits::IsaImpl and re-exports the enumeration and vector/mask types that
-// Traits provides, so users can write e.g. SIMD256::Float or SIMD256::CompareType.
-template <typename Traits>
-struct SIMDBase : Traits::IsaImpl
-{
-    using CompareType = typename Traits::CompareType;
-    using ScaleFactor = typename Traits::ScaleFactor;
-    using RoundMode   = typename Traits::RoundMode;
-    using SIMD        = typename Traits::IsaImpl; // the implementation this facade derives from
-    using Float       = typename Traits::Float;
-    using Double      = typename Traits::Double;
-    using Integer     = typename Traits::Integer;
-    using Vec4        = typename Traits::Vec4;
-    using Mask        = typename Traits::Mask;
-}; // struct SIMDBase
-
-using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
-using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
-using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
-
-// Alias templates that pull a nested type out of a SIMD facade, so width-generic
-// code can write Float<SIMD_T> instead of typename SIMD_T::Float.
-template <typename SIMD_T>
-using CompareType = typename SIMD_T::CompareType;
-template <typename SIMD_T>
-using ScaleFactor = typename SIMD_T::ScaleFactor;
-template <typename SIMD_T>
-using RoundMode = typename SIMD_T::RoundMode;
-template <typename SIMD_T>
-using Float = typename SIMD_T::Float;
-template <typename SIMD_T>
-using Double = typename SIMD_T::Double;
-template <typename SIMD_T>
-using Integer = typename SIMD_T::Integer;
-template <typename SIMD_T>
-using Vec4 = typename SIMD_T::Vec4;
-template <typename SIMD_T>
-using Mask = typename SIMD_T::Mask;
-
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
deleted file mode 100644
index 83ce967373c..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
+++ /dev/null
@@ -1,593 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-//============================================================================
-// SIMD128 AVX (1) implementation
-//============================================================================
-
-#define SIMD_WRAPPER_1(op) \
-    static SIMDINLINE Float SIMDCALL op(Float a) { return _mm_##op(a); }
-
-#define SIMD_WRAPPER_2(op) \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm_##op(a, b); }
-
-#define SIMD_DWRAPPER_2(op) \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm_##op(a, b); }
-
-#define SIMD_WRAPPER_2I(op)                               \
-    template <int ImmT>                                   \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
-    {                                                     \
-        return _mm_##op(a, b, ImmT);                      \
-    }
-
-#define SIMD_DWRAPPER_2I(op)                                 \
-    template <int ImmT>                                      \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
-    {                                                        \
-        return _mm_##op(a, b, ImmT);                         \
-    }
-
-#define SIMD_WRAPPER_3(op) \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); }
-
-#define SIMD_IWRAPPER_1(op) \
-    static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm_##op(a); }
-
-#define SIMD_IWRAPPER_1I_(op, intrin)                \
-    template <int ImmT>                              \
-    static SIMDINLINE Integer SIMDCALL op(Integer a) \
-    {                                                \
-        return intrin(a, ImmT);                      \
-    }
-#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)
-
-#define SIMD_IWRAPPER_2_(op, intrin) \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return intrin(a, b); }
-
-#define SIMD_IWRAPPER_2(op) \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm_##op(a, b); }
-
-#define SIMD_IFWRAPPER_2(op, intrin)                            \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
-    {                                                           \
-        return castps_si(intrin(castsi_ps(a), castsi_ps(b)));   \
-    }
-
-#define SIMD_IWRAPPER_2I(op)                                    \
-    template <int ImmT>                                         \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
-    {                                                           \
-        return _mm_##op(a, b, ImmT);                            \
-    }
-
-//-----------------------------------------------------------------------
-// Single precision floating point arithmetic operations
-//-----------------------------------------------------------------------
-// Each line expands to a static wrapper forwarding to the _mm_<op> intrinsic.
-SIMD_WRAPPER_2(add_ps);   // return a + b
-SIMD_WRAPPER_2(div_ps);   // return a / b
-SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);   // return a * b
-SIMD_WRAPPER_1(rcp_ps);   // return approximately 1.0f / a       (hardware estimate, not exact)
-SIMD_WRAPPER_1(rsqrt_ps); // return approximately 1.0f / sqrt(a) (hardware estimate, not exact)
-SIMD_WRAPPER_2(sub_ps);   // return a - b
-
-static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
-{
-    // This implementation has no fused instruction: multiply first, then add.
-    Float const product = mul_ps(a, b);
-    return add_ps(product, c);
-}
-static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
-{
-    // This implementation has no fused instruction: multiply first, then subtract.
-    Float const product = mul_ps(a, b);
-    return sub_ps(product, c);
-}
-
-// Rounds every lane using the rounding mode selected at compile time by RMT.
-template <RoundMode RMT>
-static SIMDINLINE Float SIMDCALL round_ps(Float a)
-{
-    // The intrinsic takes the rounding control as an integer immediate.
-    constexpr int roundingControl = static_cast<int>(RMT);
-    return _mm_round_ps(a, roundingControl);
-}
-
-// Per-lane ceiling: round_ps with RoundMode::CEIL_NOEXC.
-static SIMDINLINE Float SIMDCALL ceil_ps(Float a)
-{
-    return round_ps<RoundMode::CEIL_NOEXC>(a);
-}
-// Per-lane floor: round_ps with RoundMode::FLOOR_NOEXC.
-static SIMDINLINE Float SIMDCALL floor_ps(Float a)
-{
-    return round_ps<RoundMode::FLOOR_NOEXC>(a);
-}
-
-//-----------------------------------------------------------------------
-// Integer (various width) arithmetic operations
-//-----------------------------------------------------------------------
-// Each line expands to a static wrapper forwarding to the _mm_<op> intrinsic.
-SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
-SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
-SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
-SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
-SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
-SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
-SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
-SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
-// Forwards to _mm_mul_epi32, which multiplies only the low signed 32 bits of each
-// 64-bit element and returns the full signed 64-bit products (not a per-int32 multiply).
-SIMD_IWRAPPER_2(mul_epi32); // return (int64)lo32(a) * (int64)lo32(b) per 64-bit element
-
-// return (a * b) & 0xFFFFFFFF
-//
-// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
-// and store the low 32 bits of the intermediate integers in dst.
-SIMD_IWRAPPER_2(mullo_epi32);
-SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
-SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
-SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
-
-//-----------------------------------------------------------------------
-// Logical operations
-//-----------------------------------------------------------------------
-SIMD_WRAPPER_2(and_ps);                        // return a & b       (float treated as int)
-SIMD_IWRAPPER_2_(and_si, _mm_and_si128);       // return a & b       (int)
-SIMD_WRAPPER_2(andnot_ps);                     // return (~a) & b    (float treated as int)
-SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b    (int)
-SIMD_WRAPPER_2(or_ps);                         // return a | b       (float treated as int)
-SIMD_IWRAPPER_2_(or_si, _mm_or_si128);         // return a | b       (int)
-SIMD_WRAPPER_2(xor_ps);                        // return a ^ b       (float treated as int)
-SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128);       // return a ^ b       (int)
-
-//-----------------------------------------------------------------------
-// Shift operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
-SIMD_IWRAPPER_1I(slli_epi64); // return a << ImmT
-
-// Scalar emulation of the per-lane variable left shift, one 32-bit lane at a time.
-// Lanes are processed as uint32_t so the shift is well defined for values with the
-// top bit set, and a count of 32 or more produces 0 (the documented _mm_sllv_epi32
-// result) instead of an undefined C++ shift.
-static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
-{
-    uint32_t a, count;
-    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 0));
-    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 0));
-    a     = (count < 32) ? (a << count) : 0;
-    vA    = _mm_insert_epi32(vA, static_cast<int>(a), 0);
-
-    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 1));
-    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 1));
-    a     = (count < 32) ? (a << count) : 0;
-    vA    = _mm_insert_epi32(vA, static_cast<int>(a), 1);
-
-    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 2));
-    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 2));
-    a     = (count < 32) ? (a << count) : 0;
-    vA    = _mm_insert_epi32(vA, static_cast<int>(a), 2);
-
-    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 3));
-    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 3));
-    a     = (count < 32) ? (a << count) : 0;
-    vA    = _mm_insert_epi32(vA, static_cast<int>(a), 3);
-
-    return vA;
-}
-
-SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
-SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
-SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint)
-
-// Forwards to _mm_srl_epi64: right-shifts the 64-bit elements of a by a single
-// count taken from n (per Intel's documentation the count is the low 64 bits of n
-// and zeros are shifted in).
-static SIMDINLINE Integer SIMDCALL srl_epi64(Integer a, Integer n)
-{
-    return _mm_srl_epi64(a, n);
-}
-
-template <int ImmT> // same as srli_si, but with Float cast to int
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
-{
-    // Reinterpret as integer bits, shift the whole register, reinterpret back.
-    Integer const rawBits = castps_si(a);
-    Integer const shifted = srli_si<ImmT>(rawBits);
-    return castsi_ps(shifted);
-}
-
-// Scalar emulation of the per-lane variable logical right shift, one 32-bit lane at
-// a time. Lanes are processed as uint32_t so zeros (not copies of the sign bit) are
-// shifted in, and a count of 32 or more produces 0 (the documented _mm_srlv_epi32
-// result) instead of an undefined C++ shift.
-static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
-{
-    uint32_t a, count;
-    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 0));
-    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 0));
-    a     = (count < 32) ? (a >> count) : 0;
-    vA    = _mm_insert_epi32(vA, static_cast<int>(a), 0);
-
-    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 1));
-    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 1));
-    a     = (count < 32) ? (a >> count) : 0;
-    vA    = _mm_insert_epi32(vA, static_cast<int>(a), 1);
-
-    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 2));
-    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 2));
-    a     = (count < 32) ? (a >> count) : 0;
-    vA    = _mm_insert_epi32(vA, static_cast<int>(a), 2);
-
-    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 3));
-    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 3));
-    a     = (count < 32) ? (a >> count) : 0;
-    vA    = _mm_insert_epi32(vA, static_cast<int>(a), 3);
-
-    return vA;
-}
-
-//-----------------------------------------------------------------------
-// Conversion operations
-//-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
-{
-    return _mm_castpd_ps(a);
-}
-
-static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
-{
-    return _mm_castps_si128(a);
-}
-
-static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
-{
-    return _mm_castsi128_pd(a);
-}
-
-static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
-{
-    return _mm_castps_pd(a);
-}
-
-static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
-{
-    return _mm_castsi128_ps(a);
-}
-
-static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
-{
-    return _mm_cvtepi32_ps(a);
-}
-
-static SIMDINLINE int32_t SIMDCALL cvtsi128_si32(Integer a) // return a.v[0]
-{
-    return _mm_cvtsi128_si32(a);
-}
-
-static SIMDINLINE Integer SIMDCALL cvtsi32_si128(int32_t n) // return a[0] = n, a[1]...a[3] = 0
-{
-    return _mm_cvtsi32_si128(n);
-}
-
-SIMD_IWRAPPER_1(cvtepu8_epi16);  // return (int16)a    (uint8 --> int16)
-SIMD_IWRAPPER_1(cvtepu8_epi32);  // return (int32)a    (uint8 --> int32)
-SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a    (uint16 --> int32)
-SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a    (uint16 --> int64)
-SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a    (uint32 --> int64)
-
-static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a    (float --> int32)
-{
-    return _mm_cvtps_epi32(a);
-}
-
-static SIMDINLINE Integer SIMDCALL
-                          cvttps_epi32(Float a) // return (int32)a    (rnd_to_zero(float) --> int32)
-{
-    return _mm_cvttps_epi32(a);
-}
-
-//-----------------------------------------------------------------------
-// Comparison operations
-//-----------------------------------------------------------------------
-template <CompareType CmpTypeT>
-static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
-{
-    return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
-}
-static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)
-{
-    return cmp_ps<CompareType::LT_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)
-{
-    return cmp_ps<CompareType::GT_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b)
-{
-    return cmp_ps<CompareType::NEQ_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)
-{
-    return cmp_ps<CompareType::EQ_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)
-{
-    return cmp_ps<CompareType::GE_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)
-{
-    return cmp_ps<CompareType::LE_OQ>(a, b);
-}
-
-SIMD_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
-SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
-SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
-SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
-SIMD_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
-SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
-SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
-SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
-SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
-
-static SIMDINLINE bool SIMDCALL testz_ps(Float a,
-                                         Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
-{
-    return 0 != _mm_testz_ps(a, b);
-}
-
-static SIMDINLINE bool SIMDCALL testz_si(Integer a,
-                                         Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
-{
-    return 0 != _mm_testz_si128(a, b);
-}
-
-//-----------------------------------------------------------------------
-// Blend / shuffle / permute operations
-//-----------------------------------------------------------------------
-SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a  (float)
-SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a  (float)
-
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
-                                                Integer b,
-                                                Float   mask) // return mask ? b : a (int)
-{
-    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
-}
-
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
-                                                Integer b,
-                                                Integer mask) // return mask ? b : a (int)
-{
-    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
-}
-
-static SIMDINLINE Float SIMDCALL
-                        broadcast_ss(float const* p) // return *p (all elements in vector get same value)
-{
-    return _mm_broadcast_ss(p);
-}
-
-SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
-
-static SIMDINLINE Integer SIMDCALL
-                          permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
-{
-    return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
-}
-
-static SIMDINLINE Float SIMDCALL
-                        permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
-{
-    return _mm_permutevar_ps(a, swiz);
-}
-
-SIMD_IWRAPPER_1I(shuffle_epi32);
-
-template <int ImmT>
-static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;
-
-SIMD_IWRAPPER_2(shuffle_epi8);
-SIMD_DWRAPPER_2I(shuffle_pd);
-SIMD_WRAPPER_2I(shuffle_ps);
-SIMD_IWRAPPER_2(unpackhi_epi16);
-
-// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
-static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
-{
-    return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
-}
-
-SIMD_IWRAPPER_2(unpackhi_epi64);
-SIMD_IWRAPPER_2(unpackhi_epi8);
-SIMD_DWRAPPER_2(unpackhi_pd);
-SIMD_WRAPPER_2(unpackhi_ps);
-SIMD_IWRAPPER_2(unpacklo_epi16);
-SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);
-SIMD_IWRAPPER_2(unpacklo_epi64);
-SIMD_IWRAPPER_2(unpacklo_epi8);
-SIMD_DWRAPPER_2(unpacklo_pd);
-SIMD_WRAPPER_2(unpacklo_ps);
-
-//-----------------------------------------------------------------------
-// Load / store operations
-//-----------------------------------------------------------------------
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
-{
-    uint32_t* pOffsets = (uint32_t*)&idx;
-    Float     vResult;
-    float*    pResult = (float*)&vResult;
-    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
-    {
-        uint32_t offset = pOffsets[i];
-        offset          = offset * static_cast<uint32_t>(ScaleT);
-        pResult[i]      = *(float const*)(((uint8_t const*)p + offset));
-    }
-
-    return vResult;
-}
-
-static SIMDINLINE Float SIMDCALL
-                        load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
-{
-    return broadcast_ss(p);
-}
-
-static SIMDINLINE Float SIMDCALL
-                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
-{
-    return _mm_load_ps(p);
-}
-
-static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
-{
-    return _mm_load_si128(&p->v);
-}
-
-static SIMDINLINE Float SIMDCALL
-                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
-{
-    return _mm_loadu_ps(p);
-}
-
-static SIMDINLINE Integer SIMDCALL
-                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
-{
-    return _mm_lddqu_si128(&p->v);
-}
-
-// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
-{
-    uint32_t* pOffsets = (uint32_t*)&idx;
-    Float     vResult  = old;
-    float*    pResult  = (float*)&vResult;
-    unsigned long index;
-    uint32_t  umask = movemask_ps(mask);
-    while (_BitScanForward(&index, umask))
-    {
-        umask &= ~(1 << index);
-        uint32_t offset = pOffsets[index];
-        offset          = offset * static_cast<uint32_t>(ScaleT);
-        pResult[index]  = *(float const*)(((uint8_t const*)p + offset));
-    }
-
-    return vResult;
-}
-
-static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
-{
-    _mm_maskstore_ps(p, mask, src);
-}
-
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
-{
-    return static_cast<uint32_t>(_mm_movemask_epi8(a));
-}
-
-static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
-{
-    return static_cast<uint32_t>(_mm_movemask_pd(a));
-}
-static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
-{
-    return static_cast<uint32_t>(_mm_movemask_ps(a));
-}
-
-static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
-{
-    return _mm_set1_epi32(i);
-}
-
-static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
-{
-    return _mm_set1_epi8(i);
-}
-
-static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
-{
-    return _mm_set1_ps(f);
-}
-
-static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
-{
-    return _mm_setzero_ps();
-}
-
-static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
-{
-    return _mm_setzero_si128();
-}
-
-static SIMDINLINE void SIMDCALL
-                       store_ps(float* p, Float a) // *p = a   (stores all elements contiguously in memory)
-{
-    _mm_store_ps(p, a);
-}
-
-static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
-{
-    _mm_store_si128(&p->v, a);
-}
-
-static SIMDINLINE void SIMDCALL
-                       storeu_si(Integer* p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
-{
-    _mm_storeu_si128(&p->v, a);
-}
-
-static SIMDINLINE void SIMDCALL
-                       stream_ps(float* p, Float a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
-{
-    _mm_stream_ps(p, a);
-}
-
-static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)
-{
-    return _mm_set_ps(in3, in2, in1, in0);
-}
-
-static SIMDINLINE Integer SIMDCALL set_epi32(int in3, int in2, int in1, int in0)
-{
-    return _mm_set_epi32(in3, in2, in1, in0);
-}
-
-template <int ImmT>
-static SIMDINLINE float SIMDCALL extract_ps(Float a)
-{
-    int tmp = _mm_extract_ps(a, ImmT);
-    return *reinterpret_cast<float*>(&tmp);
-}
-
-static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
-{
-    Integer       vec = set1_epi32(mask);
-    const Integer bit = set_epi32(0x08, 0x04, 0x02, 0x01);
-    vec               = and_si(vec, bit);
-    vec               = cmplt_epi32(setzero_si(), vec);
-    return castsi_ps(vec);
-}
-
-#undef SIMD_WRAPPER_1
-#undef SIMD_WRAPPER_2
-#undef SIMD_DWRAPPER_2
-#undef SIMD_DWRAPPER_2I
-#undef SIMD_WRAPPER_2I
-#undef SIMD_WRAPPER_3
-#undef SIMD_IWRAPPER_1
-#undef SIMD_IWRAPPER_2
-#undef SIMD_IFWRAPPER_2
-#undef SIMD_IWRAPPER_2I
-#undef SIMD_IWRAPPER_1
-#undef SIMD_IWRAPPER_1I
-#undef SIMD_IWRAPPER_1I_
-#undef SIMD_IWRAPPER_2
-#undef SIMD_IWRAPPER_2_
-#undef SIMD_IWRAPPER_2I
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
deleted file mode 100644
index 0da66ebb56c..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
+++ /dev/null
@@ -1,66 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX2_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-//============================================================================
-// SIMD4 AVX (2) implementation
-//
-// Since this implementation inherits from the AVX (1) implementation,
-// the only operations below ones that replace AVX (1) operations.
-// Only 2 shifts and 2 gathers were introduced with AVX 2
-// Also, add native support for FMA operations
-//============================================================================
-#define SIMD_WRAPPER_3(op) \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); }
-
-SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
-SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
-
-static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
-{
-    return _mm_sllv_epi32(vA, vB);
-}
-
-static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
-{
-    return _mm_srlv_epi32(vA, vB);
-}
-
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
-{
-    return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT));
-}
-
-// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
-{
-    return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT));
-}
-
-#undef SIMD_WRAPPER_3
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
deleted file mode 100644
index b076daa080a..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
+++ /dev/null
@@ -1,368 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX512_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-//============================================================================
-// SIMD128 AVX (512) implementation
-//
-// Since this implementation inherits from the AVX (2) implementation,
-// the only operations below ones that replace AVX (2) operations.
-// These use native AVX512 instructions with masking to enable a larger
-// register set.
-//============================================================================
-
-private:
-static SIMDINLINE __m512 __conv(Float r)
-{
-    return _mm512_castps128_ps512(r.v);
-}
-static SIMDINLINE __m512d __conv(Double r)
-{
-    return _mm512_castpd128_pd512(r.v);
-}
-static SIMDINLINE __m512i __conv(Integer r)
-{
-    return _mm512_castsi128_si512(r.v);
-}
-static SIMDINLINE Float __conv(__m512 r)
-{
-    return _mm512_castps512_ps128(r);
-}
-static SIMDINLINE Double __conv(__m512d r)
-{
-    return _mm512_castpd512_pd128(r);
-}
-static SIMDINLINE Integer __conv(__m512i r)
-{
-    return _mm512_castsi512_si128(r);
-}
-
-public:
-#define SIMD_WRAPPER_1_(op, intrin, mask)                        \
-    static SIMDINLINE Float SIMDCALL op(Float a)                 \
-    {                                                            \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
-    }
-#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
-
-#define SIMD_WRAPPER_1I_(op, intrin, mask)                             \
-    template <int ImmT>                                                \
-    static SIMDINLINE Float SIMDCALL op(Float a)                       \
-    {                                                                  \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
-    }
-#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
-
-#define SIMD_WRAPPER_2_(op, intrin, mask)                                   \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                   \
-    {                                                                       \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
-    }
-#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
-
-#define SIMD_WRAPPER_2I(op)                                                \
-    template <int ImmT>                                                    \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                  \
-    {                                                                      \
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
-    }
-
-#define SIMD_WRAPPER_3_(op, intrin, mask)                                              \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)                     \
-    {                                                                                  \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
-    }
-#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
-
-#define SIMD_DWRAPPER_2I(op)                                               \
-    template <int ImmT>                                                    \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)               \
-    {                                                                      \
-        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT)); \
-    }
-
-#define SIMD_IWRAPPER_1_(op, intrin, mask)                       \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)             \
-    {                                                            \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
-    }
-#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
-
-#define SIMD_IWRAPPER_1I_(op, intrin, mask)                            \
-    template <int ImmT>                                                \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)                   \
-    {                                                                  \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
-    }
-#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
-
-#define SIMD_IWRAPPER_2_(op, intrin, mask)                                  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)             \
-    {                                                                       \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
-    }
-#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
-
-#define SIMD_IWRAPPER_2I(op)                                               \
-    template <int ImmT>                                                    \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)            \
-    {                                                                      \
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
-    }
-
-//-----------------------------------------------------------------------
-// Single precision floating point arithmetic operations
-//-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps);                                // return a + b
-SIMD_WRAPPER_2(div_ps);                                // return a / b
-SIMD_WRAPPER_3(fmadd_ps);                              // return (a * b) + c
-SIMD_WRAPPER_3(fmsub_ps);                              // return (a * b) - c
-SIMD_WRAPPER_2(max_ps);                                // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);                                // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);                                // return a * b
-SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xf));     // return 1.0f / a
-SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xf)); // return 1.0f / sqrt(a)
-SIMD_WRAPPER_2(sub_ps);                                // return a - b
-
-//-----------------------------------------------------------------------
-// Integer (various width) arithmetic operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
-SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
-SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
-SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
-SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
-SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
-SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
-
-// SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
-// SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
-
-// return (a * b) & 0xFFFFFFFF
-//
-// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
-// and store the low 32 bits of the intermediate integers in dst.
-SIMD_IWRAPPER_2_32(mullo_epi32);
-SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
-
-// SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
-// SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
-
-//-----------------------------------------------------------------------
-// Logical operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xf));       // return a & b       (int)
-SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b    (int)
-SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xf));         // return a | b       (int)
-SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xf));       // return a ^ b       (int)
-
-//-----------------------------------------------------------------------
-// Shift operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
-SIMD_IWRAPPER_2_32(sllv_epi32);  // return a << b      (uint32)
-SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT   (int32)
-SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT   (uint32)
-SIMD_IWRAPPER_2_32(srlv_epi32);  // return a >> b      (uint32)
-
-// use AVX2 version
-// SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
-
-//-----------------------------------------------------------------------
-// Conversion operations (Use AVX2 versions)
-//-----------------------------------------------------------------------
-// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff);    // return (int16)a    (uint8 --> int16)
-// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff);      // return (int32)a    (uint8 --> int32)
-// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff);     // return (int32)a    (uint16 --> int32)
-// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf);      // return (int64)a    (uint16 --> int64)
-// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf);      // return (int64)a    (uint32 --> int64)
-
-//-----------------------------------------------------------------------
-// Comparison operations (Use AVX2 versions
-//-----------------------------------------------------------------------
-// SIMD_IWRAPPER_2_CMP(cmpeq_epi8);    // return a == b (int8)
-// SIMD_IWRAPPER_2_CMP(cmpeq_epi16);   // return a == b (int16)
-// SIMD_IWRAPPER_2_CMP(cmpeq_epi32);   // return a == b (int32)
-// SIMD_IWRAPPER_2_CMP(cmpeq_epi64);   // return a == b (int64)
-// SIMD_IWRAPPER_2_CMP(cmpgt_epi8,);   // return a > b (int8)
-// SIMD_IWRAPPER_2_CMP(cmpgt_epi16);   // return a > b (int16)
-// SIMD_IWRAPPER_2_CMP(cmpgt_epi32);   // return a > b (int32)
-// SIMD_IWRAPPER_2_CMP(cmpgt_epi64);   // return a > b (int64)
-//
-// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
-//{
-//    return cmpgt_epi32(b, a);
-//}
-
-//-----------------------------------------------------------------------
-// Blend / shuffle / permute operations
-//-----------------------------------------------------------------------
-// SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16
-// and _mm512_packs_epi16 SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation
-// for _mm256_packs_epi32 and _mm512_packs_epi32 SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 -->
-// uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-// SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for
-// _mm256_packus_epi32 and _mm512_packus_epi32 SIMD_IWRAPPER_2_(permute_epi32,
-// permutevar8x32_epi32);
-
-// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for
-// each 32-bit lane i (float)
-//{
-//    return _mm256_permutevar8x32_ps(a, swiz);
-//}
-
-SIMD_IWRAPPER_1I_32(shuffle_epi32);
-// template<int ImmT>
-// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
-//{
-//    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
-//}
-// SIMD_IWRAPPER_2(shuffle_epi8);
-SIMD_IWRAPPER_2_32(unpackhi_epi32);
-SIMD_IWRAPPER_2_32(unpacklo_epi32);
-
-// SIMD_IWRAPPER_2_16(unpackhi_epi16);
-// SIMD_IWRAPPER_2_64(unpackhi_epi64);
-// SIMD_IWRAPPER_2_8(unpackhi_epi8);
-// SIMD_IWRAPPER_2_16(unpacklo_epi16);
-// SIMD_IWRAPPER_2_64(unpacklo_epi64);
-// SIMD_IWRAPPER_2_8(unpacklo_epi8);
-
-//-----------------------------------------------------------------------
-// Load / store operations
-//-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL
-                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
-{
-    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
-}
-
-static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
-{
-    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
-}
-
-static SIMDINLINE Float SIMDCALL
-                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
-{
-    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
-}
-
-static SIMDINLINE Integer SIMDCALL
-                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
-{
-    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
-}
-
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
-{
-    return __conv(_mm512_mask_i32gather_ps(
-        _mm512_setzero_ps(), __mmask16(0xf), __conv(idx), p, static_cast<int>(ScaleT)));
-}
-
-// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
-{
-    __mmask16 m = 0xf;
-    m           = _mm512_mask_test_epi32_mask(
-        m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000));
-    return __conv(
-        _mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT)));
-}
-
-// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
-// {
-//     __mmask64 m = 0xffffull;
-//     return static_cast<uint32_t>(
-//         _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
-// }
-
-static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
-{
-    __mmask16 m = 0xf;
-    m           = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
-    _mm512_mask_storeu_ps(p, m, __conv(src));
-}
-
-static SIMDINLINE void SIMDCALL
-                       store_ps(float* p, Float a) // *p = a   (stores all elements contiguously in memory)
-{
-    _mm512_mask_storeu_ps(p, __mmask16(0xf), __conv(a));
-}
-
-static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
-{
-    _mm512_mask_storeu_epi32(p, __mmask16(0xf), __conv(a));
-}
-
-static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
-{
-    return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xf), -1)));
-}
-
-//=======================================================================
-// Legacy interface (available only in SIMD256 width)
-//=======================================================================
-
-#undef SIMD_WRAPPER_1_
-#undef SIMD_WRAPPER_1
-#undef SIMD_WRAPPER_1I_
-#undef SIMD_WRAPPER_1I
-#undef SIMD_WRAPPER_2_
-#undef SIMD_WRAPPER_2
-#undef SIMD_WRAPPER_2I
-#undef SIMD_WRAPPER_3_
-#undef SIMD_WRAPPER_3
-#undef SIMD_DWRAPPER_1_
-#undef SIMD_DWRAPPER_1
-#undef SIMD_DWRAPPER_1I_
-#undef SIMD_DWRAPPER_1I
-#undef SIMD_DWRAPPER_2_
-#undef SIMD_DWRAPPER_2
-#undef SIMD_DWRAPPER_2I
-#undef SIMD_IWRAPPER_1_
-#undef SIMD_IWRAPPER_1_8
-#undef SIMD_IWRAPPER_1_16
-#undef SIMD_IWRAPPER_1_32
-#undef SIMD_IWRAPPER_1_64
-#undef SIMD_IWRAPPER_1I_
-#undef SIMD_IWRAPPER_1I_8
-#undef SIMD_IWRAPPER_1I_16
-#undef SIMD_IWRAPPER_1I_32
-#undef SIMD_IWRAPPER_1I_64
-#undef SIMD_IWRAPPER_2_
-#undef SIMD_IWRAPPER_2_8
-#undef SIMD_IWRAPPER_2_16
-#undef SIMD_IWRAPPER_2_32
-#undef SIMD_IWRAPPER_2_64
-#undef SIMD_IWRAPPER_2I
-//#undef SIMD_IWRAPPER_2I_8
-//#undef SIMD_IWRAPPER_2I_16
-//#undef SIMD_IWRAPPER_2I_32
-//#undef SIMD_IWRAPPER_2I_64
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl
deleted file mode 100644
index 16e59c4decb..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl
+++ /dev/null
@@ -1,196 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX512_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-//============================================================================
-// SIMD128 AVX (512) implementation
-//
-// Since this implementation inherits from the AVX (2) implementation,
-// the only operations below ones that replace AVX (2) operations.
-// These use native AVX512 instructions with masking to enable a larger
-// register set.
-//============================================================================
-
-#define SIMD_WRAPPER_1_(op, intrin, mask)                        \
-    static SIMDINLINE Float SIMDCALL op(Float a)                 \
-    {                                                            \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
-    }
-#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
-
-#define SIMD_WRAPPER_1I_(op, intrin, mask)                             \
-    template <int ImmT>                                                \
-    static SIMDINLINE Float SIMDCALL op(Float a)                       \
-    {                                                                  \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
-    }
-#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
-
-#define SIMD_WRAPPER_2_(op, intrin, mask)                                   \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                   \
-    {                                                                       \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
-    }
-#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
-
-#define SIMD_WRAPPER_2I(op)                                                \
-    template <int ImmT>                                                    \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                  \
-    {                                                                      \
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
-    }
-
-#define SIMD_WRAPPER_3_(op, intrin, mask)                                              \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)                     \
-    {                                                                                  \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
-    }
-#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
-
-#define SIMD_DWRAPPER_1_(op, intrin, mask)                       \
-    static SIMDINLINE Double SIMDCALL op(Double a)               \
-    {                                                            \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
-    }
-#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
-
-#define SIMD_DWRAPPER_1I_(op, intrin, mask)                            \
-    template <int ImmT>                                                \
-    static SIMDINLINE Double SIMDCALL op(Double a)                     \
-    {                                                                  \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
-    }
-#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
-
-#define SIMD_DWRAPPER_2_(op, intrin, mask)                                  \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)                \
-    {                                                                       \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
-    }
-#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
-
-#define SIMD_DWRAPPER_2I(op)                                               \
-    template <int ImmT>                                                    \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)               \
-    {                                                                      \
-        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT)); \
-    }
-
-#define SIMD_IWRAPPER_1_(op, intrin, mask)                       \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)             \
-    {                                                            \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
-    }
-#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
-
-#define SIMD_IWRAPPER_1I_(op, intrin, mask)                            \
-    template <int ImmT>                                                \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)                   \
-    {                                                                  \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
-    }
-#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
-
-#define SIMD_IWRAPPER_2_(op, intrin, mask)                                  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)             \
-    {                                                                       \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
-    }
-#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
-
-#define SIMD_IWRAPPER_2I(op)                                               \
-    template <int ImmT>                                                    \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)            \
-    {                                                                      \
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
-    }
-
-SIMD_IWRAPPER_2_8(add_epi8);      // return a + b (int8)
-SIMD_IWRAPPER_2_8(adds_epu8);     // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
-SIMD_IWRAPPER_2_64(sub_epi64);    // return a - b (int64)
-SIMD_IWRAPPER_2_8(subs_epu8);     // return (b > a) ? 0 : (a - b) (uint8)
-SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16 and
-                                  // _mm512_packs_epi16
-SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation for _mm256_packs_epi32 and
-                                  // _mm512_packs_epi32
-SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  See documentation for _mm256_packus_epi16 and
-                                  // _mm512_packus_epi16
-SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and
-                                  // _mm512_packus_epi32
-SIMD_IWRAPPER_2_16(unpackhi_epi16);
-SIMD_IWRAPPER_2_64(unpackhi_epi64);
-SIMD_IWRAPPER_2_8(unpackhi_epi8);
-SIMD_IWRAPPER_2_16(unpacklo_epi16);
-SIMD_IWRAPPER_2_64(unpacklo_epi64);
-SIMD_IWRAPPER_2_8(unpacklo_epi8);
-
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
-{
-    __mmask64 m = 0xffffull;
-    return static_cast<uint32_t>(_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
-}
-
-#undef SIMD_WRAPPER_1_
-#undef SIMD_WRAPPER_1
-#undef SIMD_WRAPPER_1I_
-#undef SIMD_WRAPPER_1I
-#undef SIMD_WRAPPER_2_
-#undef SIMD_WRAPPER_2
-#undef SIMD_WRAPPER_2I
-#undef SIMD_WRAPPER_3_
-#undef SIMD_WRAPPER_3
-#undef SIMD_DWRAPPER_1_
-#undef SIMD_DWRAPPER_1
-#undef SIMD_DWRAPPER_1I_
-#undef SIMD_DWRAPPER_1I
-#undef SIMD_DWRAPPER_2_
-#undef SIMD_DWRAPPER_2
-#undef SIMD_DWRAPPER_2I
-#undef SIMD_IWRAPPER_1_
-#undef SIMD_IWRAPPER_1_8
-#undef SIMD_IWRAPPER_1_16
-#undef SIMD_IWRAPPER_1_32
-#undef SIMD_IWRAPPER_1_64
-#undef SIMD_IWRAPPER_1I_
-#undef SIMD_IWRAPPER_1I_8
-#undef SIMD_IWRAPPER_1I_16
-#undef SIMD_IWRAPPER_1I_32
-#undef SIMD_IWRAPPER_1I_64
-#undef SIMD_IWRAPPER_2_
-#undef SIMD_IWRAPPER_2_8
-#undef SIMD_IWRAPPER_2_16
-#undef SIMD_IWRAPPER_2_32
-#undef SIMD_IWRAPPER_2_64
-#undef SIMD_IWRAPPER_2I
-//#undef SIMD_IWRAPPER_2I_8
-//#undef SIMD_IWRAPPER_2I_16
-//#undef SIMD_IWRAPPER_2I_32
-//#undef SIMD_IWRAPPER_2I_64
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl
deleted file mode 100644
index 1b6592e2003..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl
+++ /dev/null
@@ -1,34 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX512_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-//============================================================================
-// SIMD128 AVX (512) implementation for Knights Family
-//
-// Since this implementation inherits from the AVX512Base implementation,
-// the only operations below ones that replace AVX512F / AVX512CD operations
-// These use native AVX512 instructions with masking to enable a larger
-// register set.
-//============================================================================
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
deleted file mode 100644
index d0c3ecd4cf3..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
+++ /dev/null
@@ -1,826 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-using SIMD128T = SIMD128Impl::AVXImpl;
-
-//============================================================================
-// SIMD256 AVX (1) implementation
-//============================================================================
-
-#define SIMD_WRAPPER_1(op) \
-    static SIMDINLINE Float SIMDCALL op(Float const& a) { return _mm256_##op(a); }
-
-#define SIMD_WRAPPER_2(op)                                              \
-    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
-    {                                                                   \
-        return _mm256_##op(a, b);                                       \
-    }
-
-#define SIMD_DWRAPPER_2(op)                                                \
-    static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
-    {                                                                      \
-        return _mm256_##op(a, b);                                          \
-    }
-
-#define SIMD_WRAPPER_2I(op)                                             \
-    template <int ImmT>                                                 \
-    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
-    {                                                                   \
-        return _mm256_##op(a, b, ImmT);                                 \
-    }
-
-#define SIMD_DWRAPPER_2I(op)                                               \
-    template <int ImmT>                                                    \
-    static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
-    {                                                                      \
-        return _mm256_##op(a, b, ImmT);                                    \
-    }
-
-#define SIMD_WRAPPER_3(op)                                                              \
-    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
-    {                                                                                   \
-        return _mm256_##op(a, b, c);                                                    \
-    }
-
-#define SIMD_IWRAPPER_1(op) \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); }
-
-#define SIMD_IWRAPPER_2(op)                                                   \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
-    {                                                                         \
-        return _mm256_##op(a, b);                                             \
-    }
-
-#define SIMD_IFWRAPPER_2(op, intrin)                                          \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
-    {                                                                         \
-        return castps_si(intrin(castsi_ps(a), castsi_ps(b)));                 \
-    }
-
-#define SIMD_IFWRAPPER_2I(op, intrin)                                         \
-    template <int ImmT>                                                       \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
-    {                                                                         \
-        return castps_si(intrin(castsi_ps(a), castsi_ps(b), ImmT));           \
-    }
-
-#define SIMD_IWRAPPER_2I_(op, intrin)                                         \
-    template <int ImmT>                                                       \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
-    {                                                                         \
-        return _mm256_##intrin(a, b, ImmT);                                   \
-    }
-#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
-
-#define SIMD_IWRAPPER_3(op)                                                                     \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
-    {                                                                                           \
-        return _mm256_##op(a, b, c);                                                            \
-    }
-
-// emulated integer simd
-#define SIMD_EMU_IWRAPPER_1(op)                             \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
-    {                                                       \
-        return Integer{                                     \
-            SIMD128T::op(a.v4[0]),                          \
-            SIMD128T::op(a.v4[1]),                          \
-        };                                                  \
-    }
-#define SIMD_EMU_IWRAPPER_1L(op, shift)                                  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a)              \
-    {                                                                    \
-        return Integer{                                                  \
-            SIMD128T::op(a.v4[0]),                                       \
-            SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])),    \
-        };                                                               \
-    }                                                                    \
-    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer const& a) \
-    {                                                                    \
-        return Integer{                                                  \
-            SIMD128T::op(a),                                             \
-            SIMD128T::op(SIMD128T::template srli_si<shift>(a)),          \
-        };                                                               \
-    }
-
-#define SIMD_EMU_IWRAPPER_1I(op)                            \
-    template <int ImmT>                                     \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
-    {                                                       \
-        return Integer{                                     \
-            SIMD128T::template op<ImmT>(a.v4[0]),           \
-            SIMD128T::template op<ImmT>(a.v4[1]),           \
-        };                                                  \
-    }
-
-#define SIMD_EMU_IWRAPPER_2(op)                                               \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
-    {                                                                         \
-        return Integer{                                                       \
-            SIMD128T::op(a.v4[0], b.v4[0]),                                   \
-            SIMD128T::op(a.v4[1], b.v4[1]),                                   \
-        };                                                                    \
-    }
-
-#define SIMD_EMU_IWRAPPER_2I(op)                                              \
-    template <int ImmT>                                                       \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
-    {                                                                         \
-        return Integer{                                                       \
-            SIMD128T::template op<ImmT>(a.v4[0], b.v[0]),                     \
-            SIMD128T::template op<ImmT>(a.v4[1], b.v[1]),                     \
-        };                                                                    \
-    }
-
-//-----------------------------------------------------------------------
-// Single precision floating point arithmetic operations
-//-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps); // return a + b
-SIMD_WRAPPER_2(div_ps); // return a / b
-
-static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a,
-                                          Float const& b,
-                                          Float const& c) // return (a * b) + c
-{
-    return add_ps(mul_ps(a, b), c);
-}
-
-static SIMDINLINE Float SIMDCALL fmsub_ps(Float const& a,
-                                          Float const& b,
-                                          Float const& c) // return (a * b) - c
-{
-    return sub_ps(mul_ps(a, b), c);
-}
-
-SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);   // return a * b
-SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a
-SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
-SIMD_WRAPPER_2(sub_ps);   // return a - b
-
-template <RoundMode RMT>
-static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
-{
-    return _mm256_round_ps(a, static_cast<int>(RMT));
-}
-
-static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
-{
-    return round_ps<RoundMode::CEIL_NOEXC>(a);
-}
-static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
-{
-    return round_ps<RoundMode::FLOOR_NOEXC>(a);
-}
-
-//-----------------------------------------------------------------------
-// Integer (various width) arithmetic operations
-//-----------------------------------------------------------------------
-SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
-SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
-SIMD_EMU_IWRAPPER_2(add_epi8);  // return a + b (int8)
-SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
-SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
-SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
-SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
-SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
-SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32)
-
-// return (a * b) & 0xFFFFFFFF
-//
-// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
-// and store the low 32 bits of the intermediate integers in dst.
-SIMD_EMU_IWRAPPER_2(mullo_epi32);
-SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32)
-SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64)
-SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
-
-//-----------------------------------------------------------------------
-// Logical operations
-//-----------------------------------------------------------------------
-SIMD_WRAPPER_2(and_ps);                         // return a & b       (float treated as int)
-SIMD_IFWRAPPER_2(and_si, _mm256_and_ps);        // return a & b       (int)
-SIMD_WRAPPER_2(andnot_ps);                      // return (~a) & b    (float treated as int)
-SIMD_IFWRAPPER_2(andnot_si, _mm256_andnot_ps);  // return (~a) & b    (int)
-SIMD_WRAPPER_2(or_ps);                          // return a | b       (float treated as int)
-SIMD_IFWRAPPER_2(or_si, _mm256_or_ps);          // return a | b       (int)
-SIMD_WRAPPER_2(xor_ps);                         // return a ^ b       (float treated as int)
-SIMD_IFWRAPPER_2(xor_si, _mm256_xor_ps);        // return a ^ b       (int)
-
-//-----------------------------------------------------------------------
-// Shift operations
-//-----------------------------------------------------------------------
-SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT
-
-static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const& vA,
-                                              Integer const& vCount) // return a << b      (uint32)
-{
-    int32_t aHi, aLow, countHi, countLow;
-    __m128i vAHi      = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
-    __m128i vALow     = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
-    __m128i vCountHi  = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
-    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
-
-    aHi     = _mm_extract_epi32(vAHi, 0);
-    countHi = _mm_extract_epi32(vCountHi, 0);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
-
-    aLow     = _mm_extract_epi32(vALow, 0);
-    countLow = _mm_extract_epi32(vCountLow, 0);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 0);
-
-    aHi     = _mm_extract_epi32(vAHi, 1);
-    countHi = _mm_extract_epi32(vCountHi, 1);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
-
-    aLow     = _mm_extract_epi32(vALow, 1);
-    countLow = _mm_extract_epi32(vCountLow, 1);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 1);
-
-    aHi     = _mm_extract_epi32(vAHi, 2);
-    countHi = _mm_extract_epi32(vCountHi, 2);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
-
-    aLow     = _mm_extract_epi32(vALow, 2);
-    countLow = _mm_extract_epi32(vCountLow, 2);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 2);
-
-    aHi     = _mm_extract_epi32(vAHi, 3);
-    countHi = _mm_extract_epi32(vCountHi, 3);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
-
-    aLow     = _mm_extract_epi32(vALow, 3);
-    countLow = _mm_extract_epi32(vCountLow, 3);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 3);
-
-    __m256i ret = _mm256_set1_epi32(0);
-    ret         = _mm256_insertf128_si256(ret, vAHi, 1);
-    ret         = _mm256_insertf128_si256(ret, vALow, 0);
-    return ret;
-}
-
-SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT   (int32)
-SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT   (uint32)
-SIMD_EMU_IWRAPPER_1I(srli_si);    // return a >> (ImmT*8) (uint)
-
-template <int ImmT> // same as srli_si, but with Float cast to int
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
-{
-    return castsi_ps(srli_si<ImmT>(castps_si(a)));
-}
-
-static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const& vA,
-                                              Integer const& vCount) // return a >> b      (uint32)
-{
-    int32_t aHi, aLow, countHi, countLow;
-    __m128i vAHi      = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
-    __m128i vALow     = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
-    __m128i vCountHi  = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
-    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
-
-    aHi     = _mm_extract_epi32(vAHi, 0);
-    countHi = _mm_extract_epi32(vCountHi, 0);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
-
-    aLow     = _mm_extract_epi32(vALow, 0);
-    countLow = _mm_extract_epi32(vCountLow, 0);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 0);
-
-    aHi     = _mm_extract_epi32(vAHi, 1);
-    countHi = _mm_extract_epi32(vCountHi, 1);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
-
-    aLow     = _mm_extract_epi32(vALow, 1);
-    countLow = _mm_extract_epi32(vCountLow, 1);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 1);
-
-    aHi     = _mm_extract_epi32(vAHi, 2);
-    countHi = _mm_extract_epi32(vCountHi, 2);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
-
-    aLow     = _mm_extract_epi32(vALow, 2);
-    countLow = _mm_extract_epi32(vCountLow, 2);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 2);
-
-    aHi     = _mm_extract_epi32(vAHi, 3);
-    countHi = _mm_extract_epi32(vCountHi, 3);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
-
-    aLow     = _mm_extract_epi32(vALow, 3);
-    countLow = _mm_extract_epi32(vCountLow, 3);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 3);
-
-    __m256i ret = _mm256_set1_epi32(0);
-    ret         = _mm256_insertf128_si256(ret, vAHi, 1);
-    ret         = _mm256_insertf128_si256(ret, vALow, 0);
-    return ret;
-}
-
-//-----------------------------------------------------------------------
-// Conversion operations
-//-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
-{
-    return _mm256_castpd_ps(a);
-}
-
-static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
-{
-    return _mm256_castps_si256(a);
-}
-
-static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
-{
-    return _mm256_castsi256_pd(a);
-}
-
-static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
-{
-    return _mm256_castps_pd(a);
-}
-
-static SIMDINLINE Integer SIMDCALL castpd_si(Double const& a) // return *(Integer*)(&a)
-{
-    return _mm256_castpd_si256(a);
-}
-
-static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
-{
-    return _mm256_castsi256_ps(a);
-}
-
-static SIMDINLINE Float SIMDCALL
-                        cvtepi32_ps(Integer const& a) // return (float)a    (int32 --> float)
-{
-    return _mm256_cvtepi32_ps(a);
-}
-
-SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8);  // return (int16)a    (uint8 --> int16)
-SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4);  // return (int32)a    (uint8 --> int32)
-SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a    (uint16 --> int32)
-SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a    (uint16 --> int64)
-SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a    (uint32 --> int64)
-
-static SIMDINLINE Integer SIMDCALL
-                          cvtps_epi32(Float const& a) // return (int32)a    (float --> int32)
-{
-    return _mm256_cvtps_epi32(a);
-}
-
-static SIMDINLINE Integer SIMDCALL
-                          cvttps_epi32(Float const& a) // return (int32)a    (rnd_to_zero(float) --> int32)
-{
-    return _mm256_cvttps_epi32(a);
-}
-
-//-----------------------------------------------------------------------
-// Comparison operations
-//-----------------------------------------------------------------------
-template <CompareType CmpTypeT>
-static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
-{
-    return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
-}
-static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
-{
-    return cmp_ps<CompareType::LT_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
-{
-    return cmp_ps<CompareType::GT_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
-{
-    return cmp_ps<CompareType::NEQ_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
-{
-    return cmp_ps<CompareType::EQ_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
-{
-    return cmp_ps<CompareType::GE_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
-{
-    return cmp_ps<CompareType::LE_OQ>(a, b);
-}
-
-SIMD_EMU_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
-SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
-SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
-SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
-SIMD_EMU_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
-SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
-SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
-SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
-SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
-
-static SIMDINLINE bool SIMDCALL
-                       testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
-{
-    return 0 != _mm256_testz_ps(a, b);
-}
-
-static SIMDINLINE bool SIMDCALL
-                       testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
-{
-    return 0 != _mm256_testz_si256(a, b);
-}
-
-//-----------------------------------------------------------------------
-// Blend / shuffle / permute operations
-//-----------------------------------------------------------------------
-SIMD_WRAPPER_2I(blend_ps);                       // return ImmT ? b : a  (float)
-SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a  (int32)
-SIMD_WRAPPER_3(blendv_ps);                       // return mask ? b : a  (float)
-
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
-                                                Integer const& b,
-                                                Float const&   mask) // return mask ? b : a (int)
-{
-    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
-}
-
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
-                                                Integer const& b,
-                                                Integer const& mask) // return mask ? b : a (int)
-{
-    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
-}
-
-static SIMDINLINE Float SIMDCALL
-                        broadcast_ss(float const* p) // return *p (all elements in vector get same value)
-{
-    return _mm256_broadcast_ss(p);
-}
-
-SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_EMU_IWRAPPER_2(
-    packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_EMU_IWRAPPER_2(
-    packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
-
-template <int ImmT>
-static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
-{
-    return _mm256_permute_ps(a, ImmT);
-}
-
-static SIMDINLINE Integer SIMDCALL permute_epi32(
-    Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
-{
-    Integer result;
-
-    // Ugly slow implementation
-    uint32_t const* pA      = reinterpret_cast<uint32_t const*>(&a);
-    uint32_t const* pSwiz   = reinterpret_cast<uint32_t const*>(&swiz);
-    uint32_t*       pResult = reinterpret_cast<uint32_t*>(&result);
-
-    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
-    {
-        pResult[i] = pA[0xF & pSwiz[i]];
-    }
-
-    return result;
-}
-
-static SIMDINLINE Float SIMDCALL
-                        permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
-{
-    Float result;
-
-    // Ugly slow implementation
-    float const*    pA      = reinterpret_cast<float const*>(&a);
-    uint32_t const* pSwiz   = reinterpret_cast<uint32_t const*>(&swiz);
-    float*          pResult = reinterpret_cast<float*>(&result);
-
-    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
-    {
-        pResult[i] = pA[0xF & pSwiz[i]];
-    }
-
-    return result;
-}
-
-SIMD_WRAPPER_2I(permute2f128_ps);
-SIMD_DWRAPPER_2I(permute2f128_pd);
-SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);
-
-SIMD_EMU_IWRAPPER_1I(shuffle_epi32);
-
-template <int ImmT>
-static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b)
-{
-    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
-}
-SIMD_EMU_IWRAPPER_2(shuffle_epi8);
-SIMD_DWRAPPER_2I(shuffle_pd);
-SIMD_WRAPPER_2I(shuffle_ps);
-SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
-SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps);
-SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
-SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
-SIMD_DWRAPPER_2(unpackhi_pd);
-SIMD_WRAPPER_2(unpackhi_ps);
-SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
-SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps);
-SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
-SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
-SIMD_DWRAPPER_2(unpacklo_pd);
-SIMD_WRAPPER_2(unpacklo_ps);
-
-//-----------------------------------------------------------------------
-// Load / store operations
-//-----------------------------------------------------------------------
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
-{
-    uint32_t* pOffsets = (uint32_t*)&idx;
-    Float     vResult;
-    float*    pResult = (float*)&vResult;
-    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
-    {
-        uint32_t offset = pOffsets[i];
-        offset          = offset * static_cast<uint32_t>(ScaleT);
-        pResult[i]      = *(float const*)(((uint8_t const*)p + offset));
-    }
-
-    return vResult;
-}
-
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
-{
-    return i32gather_ps<ScaleT>(p, idx);
-}
-
-static SIMDINLINE Float SIMDCALL
-                        load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
-{
-    return broadcast_ss(p);
-}
-
-static SIMDINLINE Float SIMDCALL
-                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
-{
-    return _mm256_load_ps(p);
-}
-
-static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
-{
-    return _mm256_load_si256(&p->v);
-}
-
-static SIMDINLINE Float SIMDCALL
-                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
-{
-    return _mm256_loadu_ps(p);
-}
-
-static SIMDINLINE Integer SIMDCALL
-                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
-{
-    return _mm256_lddqu_si256(&p->v);
-}
-
-// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
-{
-    uint32_t* pOffsets = (uint32_t*)&idx;
-    Float     vResult  = old;
-    float*    pResult  = (float*)&vResult;
-    unsigned long index = 0;
-    uint32_t  umask = movemask_ps(mask);
-    while (_BitScanForward(&index, umask))
-    {
-        umask &= ~(1 << index);
-        uint32_t offset = pOffsets[index];
-        offset          = offset * static_cast<uint32_t>(ScaleT);
-        pResult[index]  = *(float const*)(((uint8_t const*)p + offset));
-    }
-
-    return vResult;
-}
-
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
-{
-    return mask_i32gather_ps<ScaleT>(old, p, idx, mask);
-}
-
-static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
-{
-    _mm256_maskstore_ps(p, mask, src);
-}
-
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
-{
-    return SIMD128T::movemask_epi8(a.v4[0]) | (SIMD128T::movemask_epi8(a.v4[1]) << 16);
-}
-
-static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
-{
-    return static_cast<uint32_t>(_mm256_movemask_pd(a));
-}
-static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
-{
-    return static_cast<uint32_t>(_mm256_movemask_ps(a));
-}
-
-static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
-{
-    return _mm256_set1_epi32(i);
-}
-
-static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
-{
-    return _mm256_set1_epi8(i);
-}
-
-static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
-{
-    return _mm256_set1_ps(f);
-}
-
-static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
-{
-    return _mm256_setzero_ps();
-}
-
-static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
-{
-    return _mm256_setzero_si256();
-}
-
-static SIMDINLINE void SIMDCALL
-                       store_ps(float* p, Float const& a) // *p = a   (stores all elements contiguously in memory)
-{
-    _mm256_store_ps(p, a);
-}
-
-static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
-{
-    _mm256_store_si256(&p->v, a);
-}
-
-static SIMDINLINE void SIMDCALL
-                       stream_ps(float* p, Float const& a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
-{
-    _mm256_stream_ps(p, a);
-}
-
-//=======================================================================
-// Legacy interface (available only in SIMD256 width)
-//=======================================================================
-
-static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const* p)
-{
-    return _mm256_broadcast_ps(&p->v);
-}
-
-template <int ImmT>
-static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const& a)
-{
-    return _mm256_extractf128_pd(a, ImmT);
-}
-
-template <int ImmT>
-static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float const& a)
-{
-    return _mm256_extractf128_ps(a, ImmT);
-}
-
-template <int ImmT>
-static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const& a)
-{
-    return _mm256_extractf128_si256(a, ImmT);
-}
-
-template <int ImmT>
-static SIMDINLINE Double SIMDCALL insertf128_pd(Double const& a, SIMD128Impl::Double const& b)
-{
-    return _mm256_insertf128_pd(a, b, ImmT);
-}
-
-template <int ImmT>
-static SIMDINLINE Float SIMDCALL insertf128_ps(Float const& a, SIMD128Impl::Float const& b)
-{
-    return _mm256_insertf128_ps(a, b, ImmT);
-}
-
-template <int ImmT>
-static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const& a, SIMD128Impl::Integer const& b)
-{
-    return _mm256_insertf128_si256(a, b, ImmT);
-}
-
-#ifndef _mm256_set_m128i
-#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \
-    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
-#endif
-
-#ifndef _mm256_loadu2_m128i
-#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \
-                            /* SIMD128Impl::Integer const* */ loaddr) \
-    _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
-#endif
-
-static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi,
-                                             SIMD128Impl::Integer const* plo)
-{
-    return _mm256_loadu2_m128i(&phi->v, &plo->v);
-}
-
-static SIMDINLINE Integer SIMDCALL
-                          set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
-{
-    return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
-}
-
-static SIMDINLINE Float SIMDCALL
-                        set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
-{
-    return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
-}
-
-static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer* phi,
-                                           SIMD128Impl::Integer* plo,
-                                           Integer const&        src)
-{
-    _mm256_storeu2_m128i(&phi->v, &plo->v, src);
-}
-
-static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
-{
-    Integer       vec = set1_epi32(mask);
-    const Integer bit = set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-    vec               = and_si(vec, bit);
-    vec               = cmplt_epi32(setzero_si(), vec);
-    return castsi_ps(vec);
-}
-
-#undef SIMD_WRAPPER_1
-#undef SIMD_WRAPPER_2
-#undef SIMD_DWRAPPER_2
-#undef SIMD_DWRAPPER_2I
-#undef SIMD_WRAPPER_2I
-#undef SIMD_WRAPPER_3
-#undef SIMD_IWRAPPER_1
-#undef SIMD_IWRAPPER_2
-#undef SIMD_IFWRAPPER_2
-#undef SIMD_IFWRAPPER_2I
-#undef SIMD_IWRAPPER_2I
-#undef SIMD_IWRAPPER_2I_
-#undef SIMD_IWRAPPER_2_
-#undef SIMD_IWRAPPER_3
-#undef SIMD_EMU_IWRAPPER_1
-#undef SIMD_EMU_IWRAPPER_1I
-#undef SIMD_EMU_IWRAPPER_2
-#undef SIMD_EMU_IWRAPPER_2I
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
deleted file mode 100644
index 8fce96dcea4..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
+++ /dev/null
@@ -1,255 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX2_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-//============================================================================
-// SIMD256 AVX (2) implementation
-//
-// Since this implementation inherits from the AVX (1) implementation,
-// the only operations below ones that replace AVX (1) operations.
-// Mostly these are integer operations that are no longer emulated with SSE
-//============================================================================
-
-#define SIMD_IWRAPPER_1(op) \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); }
-
-#define SIMD_IWRAPPER_1L(op)                                \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
-    {                                                       \
-        return _mm256_##op(_mm256_castsi256_si128(a));      \
-    }
-
-#define SIMD_IWRAPPER_1I(op)                                \
-    template <int ImmT>                                     \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
-    {                                                       \
-        return _mm256_##op(a, ImmT);                        \
-    }
-
-#define SIMD_IWRAPPER_1I_(op, intrin)                       \
-    template <int ImmT>                                     \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
-    {                                                       \
-        return _mm256_##intrin(a, ImmT);                    \
-    }
-
-#define SIMD_IWRAPPER_2_(op, intrin)                                          \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
-    {                                                                         \
-        return _mm256_##intrin(a, b);                                         \
-    }
-
-#define SIMD_IWRAPPER_2(op)                                                   \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
-    {                                                                         \
-        return _mm256_##op(a, b);                                             \
-    }
-
-#define SIMD_IWRAPPER_2I(op)                                                  \
-    template <int ImmT>                                                       \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
-    {                                                                         \
-        return _mm256_##op(a, b, ImmT);                                       \
-    }
-
-#define SIMD_IWRAPPER_2I(op)                                                  \
-    template <int ImmT>                                                       \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
-    {                                                                         \
-        return _mm256_##op(a, b, ImmT);                                       \
-    }
-
-
-//-----------------------------------------------------------------------
-// Floating point arithmetic operations
-//-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a,
-                                          Float const& b,
-                                          Float const& c) // return (a * b) + c
-{
-    return _mm256_fmadd_ps(a, b, c);
-}
-
-//-----------------------------------------------------------------------
-// Integer (various width) arithmetic operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
-SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
-SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
-SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
-SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
-SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
-SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
-SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
-SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
-
-// return (a * b) & 0xFFFFFFFF
-//
-// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
-// and store the low 32 bits of the intermediate integers in dst.
-SIMD_IWRAPPER_2(mullo_epi32);
-SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
-SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
-SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
-
-//-----------------------------------------------------------------------
-// Logical operations
-//-----------------------------------------------------------------------
-#if _MSC_VER >= 1920 // && _MSC_FULL_VER < [some_fixed_version]
-// Some versions of MSVC 2019 don't handle constant folding of and_si() correctly.
-// Using and_ps instead inhibits the compiler's constant folding and actually issues
-// the and intrinsic even though both inputs are constant values.
-#else
-// Use native integer and intrinsic
-SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b       (int)
-#endif
-SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b    (int)
-SIMD_IWRAPPER_2_(or_si, or_si256);         // return a | b       (int)
-SIMD_IWRAPPER_2_(xor_si, xor_si256);       // return a ^ b       (int)
-
-//-----------------------------------------------------------------------
-// Shift operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_1I(slli_epi32);           // return a << ImmT
-SIMD_IWRAPPER_2(sllv_epi32);            // return a << b      (uint32)
-SIMD_IWRAPPER_1I(srai_epi32);           // return a >> ImmT   (int32)
-SIMD_IWRAPPER_1I(srli_epi32);           // return a >> ImmT   (uint32)
-SIMD_IWRAPPER_2(srlv_epi32);            // return a >> b      (uint32)
-SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
-
-template <int ImmT> // same as srli_si, but with Float cast to int
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
-{
-    return castsi_ps(srli_si<ImmT>(castps_si(a)));
-}
-
-//-----------------------------------------------------------------------
-// Conversion operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_1L(cvtepu8_epi16);  // return (int16)a    (uint8 --> int16)
-SIMD_IWRAPPER_1L(cvtepu8_epi32);  // return (int32)a    (uint8 --> int32)
-SIMD_IWRAPPER_1L(cvtepu16_epi32); // return (int32)a    (uint16 --> int32)
-SIMD_IWRAPPER_1L(cvtepu16_epi64); // return (int64)a    (uint16 --> int64)
-SIMD_IWRAPPER_1L(cvtepu32_epi64); // return (int64)a    (uint32 --> int64)
-
-//-----------------------------------------------------------------------
-// Comparison operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
-SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
-SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
-SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
-SIMD_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
-SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
-SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
-SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
-
-static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer const& a,
-                                               Integer const& b) // return a < b (int32)
-{
-    return cmpgt_epi32(b, a);
-}
-
-//-----------------------------------------------------------------------
-// Blend / shuffle / permute operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a  (int32)
-SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
-
-template <int ImmT>
-static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
-{
-    return _mm256_permute_ps(a, ImmT);
-}
-
-SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
-
-static SIMDINLINE Float SIMDCALL
-                        permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
-{
-    return _mm256_permutevar8x32_ps(a, swiz);
-}
-
-SIMD_IWRAPPER_1I(shuffle_epi32);
-template <int ImmT>
-static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b)
-{
-    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
-}
-SIMD_IWRAPPER_2(shuffle_epi8);
-SIMD_IWRAPPER_2(unpackhi_epi16);
-SIMD_IWRAPPER_2(unpackhi_epi32);
-SIMD_IWRAPPER_2(unpackhi_epi64);
-SIMD_IWRAPPER_2(unpackhi_epi8);
-SIMD_IWRAPPER_2(unpacklo_epi16);
-SIMD_IWRAPPER_2(unpacklo_epi32);
-SIMD_IWRAPPER_2(unpacklo_epi64);
-SIMD_IWRAPPER_2(unpacklo_epi8);
-
-//-----------------------------------------------------------------------
-// Load / store operations
-//-----------------------------------------------------------------------
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
-{
-    return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
-}
-
-#if _MSC_VER == 1920 // && _MSC_FULL_VER < [some_fixed_version]
-// Don't use _mm256_mask_i32gather_ps(), the compiler doesn't preserve the mask register
-// correctly in early versions of MSVC 2019
-#else
-// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
-{
-    // g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
-    // Only for this intrinsic - not sure why. :(
-    return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
-}
-#endif
-
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
-{
-    return static_cast<uint32_t>(_mm256_movemask_epi8(a));
-}
-
-//=======================================================================
-// Legacy interface (available only in SIMD256 width)
-//=======================================================================
-
-#undef SIMD_IWRAPPER_1
-#undef SIMD_IWRAPPER_1L
-#undef SIMD_IWRAPPER_1I
-#undef SIMD_IWRAPPER_1I_
-#undef SIMD_IWRAPPER_2_
-#undef SIMD_IWRAPPER_2
-#undef SIMD_IWRAPPER_2I
-#undef SIMD_IWRAPPER_2I
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
deleted file mode 100644
index 4c883b11a25..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
+++ /dev/null
@@ -1,349 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX512_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-//============================================================================
-// SIMD256 AVX (512) implementation
-//
-// Since this implementation inherits from the AVX (2) implementation,
-// the only operations below are ones that replace AVX (2) operations.
-// These use native AVX512 instructions with masking to enable a larger
-// register set.
-//============================================================================
-
-private:
-static SIMDINLINE __m512 __conv(Float r)
-{
-    return _mm512_castps256_ps512(r.v);
-}
-static SIMDINLINE __m512d __conv(Double r)
-{
-    return _mm512_castpd256_pd512(r.v);
-}
-static SIMDINLINE __m512i __conv(Integer r)
-{
-    return _mm512_castsi256_si512(r.v);
-}
-static SIMDINLINE Float __conv(__m512 r)
-{
-    return _mm512_castps512_ps256(r);
-}
-static SIMDINLINE Double __conv(__m512d r)
-{
-    return _mm512_castpd512_pd256(r);
-}
-static SIMDINLINE Integer __conv(__m512i r)
-{
-    return _mm512_castsi512_si256(r);
-}
-
-public:
-#define SIMD_WRAPPER_1_(op, intrin, mask)                        \
-    static SIMDINLINE Float SIMDCALL op(Float a)                 \
-    {                                                            \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
-    }
-#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xff))
-
-#define SIMD_WRAPPER_1I_(op, intrin, mask)                             \
-    template <int ImmT>                                                \
-    static SIMDINLINE Float SIMDCALL op(Float a)                       \
-    {                                                                  \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
-    }
-#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xff))
-
-#define SIMD_WRAPPER_2_(op, intrin, mask)                                   \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                   \
-    {                                                                       \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
-    }
-#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xff))
-
-#define SIMD_WRAPPER_2I(op)                                                 \
-    template <int ImmT>                                                     \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                   \
-    {                                                                       \
-        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT)); \
-    }
-
-#define SIMD_WRAPPER_3_(op, intrin, mask)                                              \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)                     \
-    {                                                                                  \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
-    }
-#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
-
-#define SIMD_DWRAPPER_2I(op)                                               \
-    template <int ImmT>                                                    \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)               \
-    {                                                                      \
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
-    }
-
-#define SIMD_IWRAPPER_1_(op, intrin, mask)                       \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)             \
-    {                                                            \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
-    }
-#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
-
-#define SIMD_IWRAPPER_1I_(op, intrin, mask)                            \
-    template <int ImmT>                                                \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)                   \
-    {                                                                  \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
-    }
-#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
-
-#define SIMD_IWRAPPER_2_(op, intrin, mask)                                  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)             \
-    {                                                                       \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
-    }
-#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
-
-#define SIMD_IWRAPPER_2I(op)                                                \
-    template <int ImmT>                                                     \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)             \
-    {                                                                       \
-        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT)); \
-    }
-
-//-----------------------------------------------------------------------
-// Single precision floating point arithmetic operations
-//-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps);                                 // return a + b
-SIMD_WRAPPER_2(div_ps);                                 // return a / b
-SIMD_WRAPPER_3(fmadd_ps);                               // return (a * b) + c
-SIMD_WRAPPER_3(fmsub_ps);                               // return (a * b) - c
-SIMD_WRAPPER_2(max_ps);                                 // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);                                 // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);                                 // return a * b
-SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xff));     // return 1.0f / a
-SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xff)); // return 1.0f / sqrt(a)
-SIMD_WRAPPER_2(sub_ps);                                 // return a - b
-
-//-----------------------------------------------------------------------
-// Integer (various width) arithmetic operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
-SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
-SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
-SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
-SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
-SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
-SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
-
-// SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
-// SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
-
-// return (a * b) & 0xFFFFFFFF
-//
-// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
-// and store the low 32 bits of the intermediate integers in dst.
-SIMD_IWRAPPER_2_32(mullo_epi32);
-SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
-
-// SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
-// SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
-
-//-----------------------------------------------------------------------
-// Logical operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xff));       // return a & b       (int)
-SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // return (~a) & b    (int)
-SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xff));         // return a | b       (int)
-SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xff));       // return a ^ b       (int)
-
-//-----------------------------------------------------------------------
-// Shift operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
-SIMD_IWRAPPER_2_32(sllv_epi32);  // return a << b      (uint32)
-SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT   (int32)
-SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT   (uint32)
-SIMD_IWRAPPER_2_32(srlv_epi32);  // return a >> b      (uint32)
-
-// use AVX2 version
-// SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
-
-//-----------------------------------------------------------------------
-// Conversion operations (Use AVX2 versions)
-//-----------------------------------------------------------------------
-// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff);    // return (int16)a    (uint8 --> int16)
-// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff);      // return (int32)a    (uint8 --> int32)
-// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff);     // return (int32)a    (uint16 --> int32)
-// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf);      // return (int64)a    (uint16 --> int64)
-// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf);      // return (int64)a    (uint32 --> int64)
-
-//-----------------------------------------------------------------------
-// Comparison operations (Use AVX2 versions)
-//-----------------------------------------------------------------------
-// SIMD_IWRAPPER_2_CMP(cmpeq_epi8);    // return a == b (int8)
-// SIMD_IWRAPPER_2_CMP(cmpeq_epi16);   // return a == b (int16)
-// SIMD_IWRAPPER_2_CMP(cmpeq_epi32);   // return a == b (int32)
-// SIMD_IWRAPPER_2_CMP(cmpeq_epi64);   // return a == b (int64)
-// SIMD_IWRAPPER_2_CMP(cmpgt_epi8);    // return a > b (int8)
-// SIMD_IWRAPPER_2_CMP(cmpgt_epi16);   // return a > b (int16)
-// SIMD_IWRAPPER_2_CMP(cmpgt_epi32);   // return a > b (int32)
-// SIMD_IWRAPPER_2_CMP(cmpgt_epi64);   // return a > b (int64)
-//
-// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
-//{
-//    return cmpgt_epi32(b, a);
-//}
-
-//-----------------------------------------------------------------------
-// Blend / shuffle / permute operations
-//-----------------------------------------------------------------------
-// SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for
-//                                     // _mm256_packs_epi16 and _mm512_packs_epi16
-// SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for
-//                                     // _mm256_packs_epi32 and _mm512_packs_epi32
-// SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for
-//                                     // _mm256_packus_epi16 and _mm512_packus_epi16
-// SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for
-//                                     // _mm256_packus_epi32 and _mm512_packus_epi32
-
-// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
-
-// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for
-// each 32-bit lane i (float)
-//{
-//    return _mm256_permutevar8x32_ps(a, swiz);
-//}
-
-SIMD_IWRAPPER_1I_32(shuffle_epi32);
-// template<int ImmT>
-// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
-//{
-//    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
-//}
-// SIMD_IWRAPPER_2(shuffle_epi8);
-SIMD_IWRAPPER_2_32(unpackhi_epi32);
-SIMD_IWRAPPER_2_32(unpacklo_epi32);
-
-// SIMD_IWRAPPER_2_16(unpackhi_epi16);
-// SIMD_IWRAPPER_2_64(unpackhi_epi64);
-// SIMD_IWRAPPER_2_8(unpackhi_epi8);
-// SIMD_IWRAPPER_2_16(unpacklo_epi16);
-// SIMD_IWRAPPER_2_64(unpacklo_epi64);
-// SIMD_IWRAPPER_2_8(unpacklo_epi8);
-
-//-----------------------------------------------------------------------
-// Load / store operations
-//-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL
-                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
-{
-    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
-}
-
-static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
-{
-    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
-}
-
-static SIMDINLINE Float SIMDCALL
-                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
-{
-    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
-}
-
-static SIMDINLINE Integer SIMDCALL
-                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
-{
-    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
-}
-
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
-{
-    return __conv(_mm512_mask_i32gather_ps(
-        _mm512_setzero_ps(), __mmask16(0xff), __conv(idx), p, static_cast<int>(ScaleT)));
-}
-
-// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
-{
-    __mmask16 m = 0xff;
-    m           = _mm512_mask_test_epi32_mask(
-        m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000));
-    return __conv(
-        _mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT)));
-}
-
-// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
-// {
-//     __mmask64 m = 0xffffffffull;
-//     return static_cast<uint32_t>(
-//         _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
-// }
-
-static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
-{
-    __mmask16 m = 0xff;
-    m           = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
-    _mm512_mask_storeu_ps(p, m, __conv(src));
-}
-
-static SIMDINLINE void SIMDCALL
-                       store_ps(float* p, Float a) // *p = a   (stores all elements contiguously in memory)
-{
-    _mm512_mask_storeu_ps(p, __mmask16(0xff), __conv(a));
-}
-
-static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
-{
-    _mm512_mask_storeu_epi32(p, __mmask16(0xff), __conv(a));
-}
-
-static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
-{
-    return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xff), -1)));
-}
-
-//=======================================================================
-// Legacy interface (available only in SIMD256 width)
-//=======================================================================
-
-#undef SIMD_WRAPPER_1_
-#undef SIMD_WRAPPER_1
-#undef SIMD_WRAPPER_1I_
-#undef SIMD_WRAPPER_1I
-#undef SIMD_WRAPPER_2_
-#undef SIMD_WRAPPER_2
-#undef SIMD_WRAPPER_2I
-#undef SIMD_WRAPPER_3_
-#undef SIMD_WRAPPER_3
-#undef SIMD_IWRAPPER_1_
-#undef SIMD_IWRAPPER_1_32
-#undef SIMD_IWRAPPER_1I_
-#undef SIMD_IWRAPPER_1I_32
-#undef SIMD_IWRAPPER_2_
-#undef SIMD_IWRAPPER_2_32
-#undef SIMD_IWRAPPER_2I
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl
deleted file mode 100644
index 1acdc7e07ff..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl
+++ /dev/null
@@ -1,129 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX512_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-//============================================================================
-// SIMD256 AVX (512) implementation for Core processors
-//
-// Since this implementation inherits from the AVX (2) implementation,
-// the only operations below are ones that replace AVX (2) operations.
-// These use native AVX512 instructions with masking to enable a larger
-// register set.
-//============================================================================
-
-#define SIMD_DWRAPPER_1_(op, intrin, mask)                       \
-    static SIMDINLINE Double SIMDCALL op(Double a)               \
-    {                                                            \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
-    }
-#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
-
-#define SIMD_DWRAPPER_1I_(op, intrin, mask)                            \
-    template <int ImmT>                                                \
-    static SIMDINLINE Double SIMDCALL op(Double a)                     \
-    {                                                                  \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
-    }
-#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
-
-#define SIMD_DWRAPPER_2_(op, intrin, mask)                                  \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)                \
-    {                                                                       \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
-    }
-#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
-
-#define SIMD_IWRAPPER_1_(op, intrin, mask)                       \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)             \
-    {                                                            \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
-    }
-#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
-#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
-#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
-
-#define SIMD_IWRAPPER_1I_(op, intrin, mask)                            \
-    template <int ImmT>                                                \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)                   \
-    {                                                                  \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
-    }
-#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
-#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
-#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
-
-#define SIMD_IWRAPPER_2_(op, intrin, mask)                                  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)             \
-    {                                                                       \
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
-    }
-#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
-#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
-#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
-
-SIMD_IWRAPPER_2_8(add_epi8);      // return a + b (int8)
-SIMD_IWRAPPER_2_8(adds_epu8);     // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
-SIMD_IWRAPPER_2_64(sub_epi64);    // return a - b (int64)
-SIMD_IWRAPPER_2_8(subs_epu8);     // return (b > a) ? 0 : (a - b) (uint8)
-SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16 and
-                                  // _mm512_packs_epi16
-SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation for _mm256_packs_epi32 and
-                                  // _mm512_packs_epi32
-SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  See documentation for _mm256_packus_epi16 and
-                                  // _mm512_packus_epi16
-SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and
-                                  // _mm512_packus_epi32
-SIMD_IWRAPPER_2_16(unpackhi_epi16);
-SIMD_IWRAPPER_2_64(unpackhi_epi64);
-SIMD_IWRAPPER_2_8(unpackhi_epi8);
-SIMD_IWRAPPER_2_16(unpacklo_epi16);
-SIMD_IWRAPPER_2_64(unpacklo_epi64);
-SIMD_IWRAPPER_2_8(unpacklo_epi8);
-
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
-{
-    __mmask64 m = 0xffffffffull;
-    return static_cast<uint32_t>(_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
-}
-
-#undef SIMD_DWRAPPER_1_
-#undef SIMD_DWRAPPER_1
-#undef SIMD_DWRAPPER_1I_
-#undef SIMD_DWRAPPER_1I
-#undef SIMD_DWRAPPER_2_
-#undef SIMD_DWRAPPER_2
-#undef SIMD_DWRAPPER_2I
-#undef SIMD_IWRAPPER_1_
-#undef SIMD_IWRAPPER_1_8
-#undef SIMD_IWRAPPER_1_16
-#undef SIMD_IWRAPPER_1_64
-#undef SIMD_IWRAPPER_1I_
-#undef SIMD_IWRAPPER_1I_8
-#undef SIMD_IWRAPPER_1I_16
-#undef SIMD_IWRAPPER_1I_64
-#undef SIMD_IWRAPPER_2_
-#undef SIMD_IWRAPPER_2_8
-#undef SIMD_IWRAPPER_2_16
-#undef SIMD_IWRAPPER_2_64
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl
deleted file mode 100644
index 52b6ca2b61e..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl
+++ /dev/null
@@ -1,34 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX512_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-//============================================================================
-// SIMD256 AVX (512) implementation for Knights Family
-//
-// Since this implementation inherits from the AVX (2) implementation,
-// the only operations below are ones that replace AVX (2) operations.
-// These use native AVX512 instructions with masking to enable a larger
-// register set.
-//============================================================================
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
deleted file mode 100644
index 5053275e8d6..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
+++ /dev/null
@@ -1,699 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX512_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
-// gcc as of 7.1 was missing these intrinsics, so each is defined here via the generic _mm512_cmp_*_mask form (the #ifndef guards only detect macro definitions)
-#ifndef _mm512_cmpneq_ps_mask
-#define _mm512_cmpneq_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_NEQ_UQ)
-#endif
-
-#ifndef _mm512_cmplt_ps_mask
-#define _mm512_cmplt_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_LT_OS)
-#endif
-
-#ifndef _mm512_cmplt_pd_mask
-#define _mm512_cmplt_pd_mask(a, b) _mm512_cmp_pd_mask((a), (b), _CMP_LT_OS)
-#endif
-
-#endif // __GNUC__ && !__clang__ && !__INTEL_COMPILER
-
-//============================================================================
-// SIMD16 AVX512 (F) implementation (compatible with Knights and Core
-// processors)
-//
-//============================================================================
-
-static const int TARGET_SIMD_WIDTH = 16; // SIMD16 (see banner above); presumably the count of 32-bit lanes per vector - confirm against users
-using SIMD256T                     = SIMD256Impl::AVX2Impl; // alias for the AVX2 implementation in SIMD256Impl
-
-#define SIMD_WRAPPER_1_(op, intrin) \
-    static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); } // unary Float op; intrin is the complete intrinsic name (no _mm512_ prefix is added)
-
-#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op) // unary Float op forwarding to _mm512_<op>
-
-#define SIMD_WRAPPER_2_(op, intrin) \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); } // binary Float op; unlike SIMD_WRAPPER_1_, intrin is given WITHOUT the _mm512_ prefix
-#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op) // binary Float op forwarding to _mm512_<op>
-
-#define SIMD_WRAPPERI_2_(op, intrin)                                          \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                     \
-    {                                                                         \
-        return _mm512_castsi512_ps(                                           \
-            _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
-    } // binary Float op done by casting to integer vectors, applying _mm512_<intrin>, and casting back
-
-#define SIMD_DWRAPPER_2(op) \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); } // binary Double op forwarding to _mm512_<op>
-
-#define SIMD_WRAPPER_2I_(op, intrin)                      \
-    template <int ImmT>                                   \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
-    {                                                     \
-        return _mm512_##intrin(a, b, ImmT);               \
-    } // binary Float op whose third intrinsic argument is the template constant ImmT
-#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op) // same, with the wrapper named after the intrinsic
-
-#define SIMD_DWRAPPER_2I_(op, intrin)                        \
-    template <int ImmT>                                      \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
-    {                                                        \
-        return _mm512_##intrin(a, b, ImmT);                  \
-    } // Double counterpart of SIMD_WRAPPER_2I_
-#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op) // Double counterpart of SIMD_WRAPPER_2I
-
-#define SIMD_WRAPPER_3(op) \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); } // ternary Float op forwarding to _mm512_<op>
-
-#define SIMD_IWRAPPER_1(op) \
-    static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); } // unary Integer op forwarding to _mm512_<op>
-#define SIMD_IWRAPPER_1_8(op) \
-    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); } // takes a SIMD256Impl::Integer, returns a full-width Integer
-
-#define SIMD_IWRAPPER_1_4(op) \
-    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); } // takes a SIMD128Impl::Integer, returns a full-width Integer
-
-#define SIMD_IWRAPPER_1I_(op, intrin)                \
-    template <int ImmT>                              \
-    static SIMDINLINE Integer SIMDCALL op(Integer a) \
-    {                                                \
-        return intrin(a, ImmT);                      \
-    } // unary Integer op with template constant ImmT; intrin is the complete intrinsic name
-#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op) // same, forwarding to _mm512_<op>
-
-#define SIMD_IWRAPPER_2_(op, intrin) \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); } // binary Integer op forwarding to _mm512_<intrin>
-#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op) // same, with the wrapper named after the intrinsic
-
-#define SIMD_IWRAPPER_2_CMP(op, cmp) \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); } // binary Integer op forwarding to cmp exactly as written (no prefix added)
-
-#define SIMD_IFWRAPPER_2(op, intrin)                                   \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)        \
-    {                                                                  \
-        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
-    } // binary Integer op implemented with the float intrinsic _mm512_<intrin> via castsi_ps / castps_si
-
-#define SIMD_IWRAPPER_2I_(op, intrin)                           \
-    template <int ImmT>                                         \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
-    {                                                           \
-        return _mm512_##intrin(a, b, ImmT);                     \
-    } // binary Integer op whose third intrinsic argument is the template constant ImmT
-#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) // same, with the wrapper named after the intrinsic
-
-private:
-static SIMDINLINE Integer vmask(__mmask16 m) // expand a 16-bit lane mask into a vector: 32-bit lanes whose bit is set become -1, all others 0
-{
-    return _mm512_maskz_set1_epi32(m, -1);
-}
-
-static SIMDINLINE Integer vmask(__mmask8 m) // 64-bit-lane overload: lanes whose bit is set become -1LL, all others 0
-{
-    return _mm512_maskz_set1_epi64(m, -1LL);
-}
-
-public:
-//-----------------------------------------------------------------------
-// Single precision floating point arithmetic operations
-//-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps);                       // return a + b
-SIMD_WRAPPER_2(div_ps);                       // return a / b
-SIMD_WRAPPER_3(fmadd_ps);                     // return (a * b) + c
-SIMD_WRAPPER_3(fmsub_ps);                     // return (a * b) - c
-SIMD_WRAPPER_2(max_ps);                       // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);                       // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);                       // return a * b
-SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps);     // return approximately 1.0f / a (rcp14 approximation, not an exact divide)
-SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps); // return approximately 1.0f / sqrt(a) (rsqrt14 approximation)
-SIMD_WRAPPER_2(sub_ps);                       // return a - b
-
-template <RoundMode RMT>
-static SIMDINLINE Float SIMDCALL round_ps(Float a) // round each lane; RMT is cast to int and passed as the _mm512_roundscale_ps immediate
-{
-    return _mm512_roundscale_ps(a, static_cast<int>(RMT));
-}
-
-static SIMDINLINE Float SIMDCALL ceil_ps(Float a) // round_ps with RoundMode::CEIL_NOEXC
-{
-    return round_ps<RoundMode::CEIL_NOEXC>(a);
-}
-static SIMDINLINE Float SIMDCALL floor_ps(Float a) // round_ps with RoundMode::FLOOR_NOEXC
-{
-    return round_ps<RoundMode::FLOOR_NOEXC>(a);
-}
-
-//-----------------------------------------------------------------------
-// Integer (various width) arithmetic operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
-SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
-// SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8) -- provided by simdlib_512_avx512_core.inl
-// SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) -- provided by simdlib_512_avx512_core.inl
-SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
-SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
-SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
-SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
-SIMD_IWRAPPER_2(mul_epi32); // NOTE(review): per Intel's guide _mm512_mul_epi32 multiplies the low int32 of each 64-bit lane into an int64 result, not a lane-wise int32 product; use mullo_epi32 for that
-
-// return (a * b) & 0xFFFFFFFF
-//
-// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
-// and store the low 32 bits of the intermediate integers in dst.
-SIMD_IWRAPPER_2(mullo_epi32);
-SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
-SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
-// SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) -- provided by simdlib_512_avx512_core.inl
-
-//-----------------------------------------------------------------------
-// Logical operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_2_(and_si, and_si512);       // return a & b       (int)
-SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b    (int)
-SIMD_IWRAPPER_2_(or_si, or_si512);         // return a | b       (int)
-SIMD_IWRAPPER_2_(xor_si, xor_si512);       // return a ^ b       (int)
-
-// The float-typed logical ops below are disabled here; simdlib_512_avx512_core.inl defines them.
-// SIMD_WRAPPER_2(and_ps);                     // return a & b       (float treated as int)
-// SIMD_WRAPPER_2(andnot_ps);                  // return (~a) & b    (float treated as int)
-// SIMD_WRAPPER_2(or_ps);                      // return a | b       (float treated as int)
-// SIMD_WRAPPER_2(xor_ps);                     // return a ^ b       (float treated as int)
-
-//-----------------------------------------------------------------------
-// Shift operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
-SIMD_IWRAPPER_2(sllv_epi32);  // return a << b, shift count taken per 32-bit lane from b
-SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT   (int32)
-SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT   (uint32)
-
-#if 0
-SIMD_IWRAPPER_1I_(srli_si, srli_si512);     // return a >> (ImmT*8) (uint)
-
-template<int ImmT>                              // same as srli_si, but with Float cast to int
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
-{
-    return castsi_ps(srli_si<ImmT>(castps_si(a)));
-}
-#endif
-
-SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32), shift count taken per 32-bit lane from b
-
-//-----------------------------------------------------------------------
-// Conversion operations
-//-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)  (bit reinterpretation only)
-{
-    return _mm512_castpd_ps(a);
-}
-
-static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)  (bit reinterpretation only)
-{
-    return _mm512_castps_si512(a);
-}
-
-static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)  (bit reinterpretation only)
-{
-    return _mm512_castsi512_pd(a);
-}
-
-static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)  (bit reinterpretation only)
-{
-    return _mm512_castps_pd(a);
-}
-
-static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)  (bit reinterpretation only)
-{
-    return _mm512_castpd_si512(a);
-}
-
-static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)  (bit reinterpretation only)
-{
-    return _mm512_castsi512_ps(a);
-}
-
-static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float, a value conversion unlike the casts above)
-{
-    return _mm512_cvtepi32_ps(a);
-}
-
-// SIMD_IWRAPPER_1_8(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16) -- provided by simdlib_512_avx512_core.inl
-SIMD_IWRAPPER_1_4(cvtepu8_epi32);  // return (int32)a    (uint8 --> int32)
-SIMD_IWRAPPER_1_8(cvtepu16_epi32); // return (int32)a    (uint16 --> int32)
-SIMD_IWRAPPER_1_4(cvtepu16_epi64); // return (int64)a    (uint16 --> int64)
-SIMD_IWRAPPER_1_8(cvtepu32_epi64); // return (int64)a    (uint32 --> int64)
-
-static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a    (float --> int32; per Intel's guide this rounds using the current rounding mode rather than truncating)
-{
-    return _mm512_cvtps_epi32(a);
-}
-
-static SIMDINLINE Integer SIMDCALL
-                          cvttps_epi32(Float a) // return (int32)a    (rnd_to_zero(float) --> int32)
-{
-    return _mm512_cvttps_epi32(a);
-}
-
-//-----------------------------------------------------------------------
-// Comparison operations
-//-----------------------------------------------------------------------
-template <CompareType CmpTypeT>
-static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b) // return a (CmpTypeT) b as a bit mask, one bit per lane
-{
-    return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT));
-}
-
-template <CompareType CmpTypeT>
-static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
-{
-    // Legacy vector mask generator: expand the bit mask into all-ones / all-zeros 32-bit lanes
-    __mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
-    return castsi_ps(vmask(result));
-}
-
-static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) // return a < b as a vector mask
-{
-    return cmp_ps<CompareType::LT_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) // return a > b as a vector mask
-{
-    return cmp_ps<CompareType::GT_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) // return a != b as a vector mask; NOTE(review): presumably NEQ_OQ is the ordered predicate, so NaN lanes yield false unlike C's != (and unlike the _CMP_NEQ_UQ fallback macro above) - confirm
-{
-    return cmp_ps<CompareType::NEQ_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) // return a == b as a vector mask
-{
-    return cmp_ps<CompareType::EQ_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) // return a >= b as a vector mask
-{
-    return cmp_ps<CompareType::GE_OQ>(a, b);
-}
-static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) // return a <= b as a vector mask
-{
-    return cmp_ps<CompareType::LE_OQ>(a, b);
-}
-
-template <CompareTypeInt CmpTypeT>
-static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
-{
-    // Compare per 32-bit lane with predicate CmpTypeT, then expand the 16-bit
-    // lane mask into a legacy all-ones / all-zeros vector mask.
-    return vmask(
-        static_cast<__mmask16>(_mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT))));
-}
-template <CompareTypeInt CmpTypeT>
-static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
-{
-    // Compare per 64-bit lane with predicate CmpTypeT, then expand the 8-bit
-    // lane mask into a legacy all-ones / all-zeros vector mask.
-    return vmask(
-        static_cast<__mmask8>(_mm512_cmp_epi64_mask(a, b, static_cast<const int>(CmpTypeT))));
-}
-
-// SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8<CompareTypeInt::EQ>);    // return a == b (int8) -- provided by simdlib_512_avx512_core.inl
-// SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>);   // return a == b (int16) -- provided by simdlib_512_avx512_core.inl
-SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>); // return a == b (int32)
-SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>); // return a == b (int64)
-// SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8<CompareTypeInt::GT>);    // return a > b (int8) -- provided by simdlib_512_avx512_core.inl
-// SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>);   // return a > b (int16) -- provided by simdlib_512_avx512_core.inl
-SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>); // return a > b (int32)
-SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>); // return a > b (int64)
-SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>); // return a < b (int32)
-
-static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)
-{
-    // return all_lanes_zero(a & b) ? 1 : 0 (float)
-    // The test intrinsic sets one bit per 32-bit lane where (a & b) is non-zero.
-    __mmask16 nonZeroLanes = _mm512_test_epi32_mask(castps_si(a), castps_si(b));
-    return static_cast<int>(nonZeroLanes) == 0;
-}
-
-static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)
-{
-    // return all_lanes_zero(a & b) ? 1 : 0 (int)
-    // The test intrinsic sets one bit per 32-bit lane where (a & b) is non-zero.
-    __mmask16 nonZeroLanes = _mm512_test_epi32_mask(a, b);
-    return static_cast<int>(nonZeroLanes) == 0;
-}
-
-//-----------------------------------------------------------------------
-// Blend / shuffle / permute operations
-//-----------------------------------------------------------------------
-template <int ImmT>
-static SIMDINLINE Float SIMDCALL blend_ps(Float a, Float b) // return ImmT ? b : a  (float)
-{
-    // ImmT is used as a 16-bit lane mask: a set bit takes that lane from b,
-    // a clear bit takes it from a. Declared SIMDCALL like the other wrappers.
-    return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
-}
-
-template <int ImmT>
-static SIMDINLINE Integer SIMDCALL blend_epi32(Integer a, Integer b) // return ImmT ? b : a  (int32)
-{
-    // ImmT is used as a 16-bit lane mask: a set bit takes that lane from b,
-    // a clear bit takes it from a. Declared SIMDCALL like the other wrappers.
-    return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
-}
-
-static SIMDINLINE Float SIMDCALL blendv_ps(Float a, Float b, Float mask) // return mask ? b : a  (float)
-{
-    // movemask_ps reduces the vector mask to one bit per lane (its sign bit);
-    // that bit mask then selects b (set) or a (clear) for each lane.
-    // Declared SIMDCALL like the blendv_epi32 overloads that call it.
-    return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
-}
-
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
-                                                Integer b,
-                                                Float   mask) // return mask ? b : a (int)
-{
-    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask)); // reuse the float blend on reinterpreted bits
-}
-
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
-                                                Integer b,
-                                                Integer mask) // return mask ? b : a (int)
-{
-    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask))); // same, with the mask reinterpreted as Float too
-}
-
-static SIMDINLINE Float SIMDCALL
-                        broadcast_ss(float const* p) // return *p (all elements in vector get same value)
-{
-    return _mm512_set1_ps(*p); // dereferences p, then splats the scalar
-}
-
-template <int imm>
-static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a) // return the 256-bit part of a selected by imm (float)
-{
-    return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm)); // done through the f64x4 extract with casts on both sides
-}
-
-template <int imm>
-static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a) // return the 256-bit part of a selected by imm (double)
-{
-    return _mm512_extractf64x4_pd(a, imm);
-}
-
-template <int imm>
-static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a) // return the 256-bit part of a selected by imm (int)
-{
-    return _mm512_extracti64x4_epi64(a, imm);
-}
-
-template <int imm>
-static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b) // return a with its 256-bit part imm replaced by b (float)
-{
-    return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm)); // done through the f64x4 insert with casts on both sides
-}
-
-template <int imm>
-static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b) // return a with its 256-bit part imm replaced by b (double)
-{
-    return _mm512_insertf64x4(a, b, imm);
-}
-
-template <int imm>
-static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b) // return a with its 256-bit part imm replaced by b (int)
-{
-    return _mm512_inserti64x4(a, b, imm);
-}
-
-// SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm512_packs_epi16
-// SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm512_packs_epi32
-// SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm512_packus_epi16
-// SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm512_packus_epi32
-// (these four pack wrappers are provided by simdlib_512_avx512_core.inl instead)
-
-template <int ImmT>
-static SIMDINLINE Float SIMDCALL permute_ps(Float const& a) // permute a with the compile-time control ImmT (see _mm512_permute_ps)
-{
-    return _mm512_permute_ps(a, ImmT);
-}
-
-static SIMDINLINE Integer SIMDCALL
-                          permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
-{
-    return _mm512_permutexvar_epi32(swiz, a); // note: the intrinsic takes (index, source), the wrapper (source, index)
-}
-
-static SIMDINLINE Float SIMDCALL
-                        permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
-{
-    return _mm512_permutexvar_ps(swiz, a); // note: the intrinsic takes (index, source), the wrapper (source, index)
-}
-
-SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);  // permute2f128_ps<ImmT>(a, b) forwards to _mm512_shuffle_f32x4(a, b, ImmT)
-SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2); // permute2f128_pd<ImmT>(a, b) forwards to _mm512_shuffle_f64x2(a, b, ImmT)
-SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4); // permute2f128_si<ImmT>(a, b) forwards to _mm512_shuffle_i32x4(a, b, ImmT)
-
-SIMD_IWRAPPER_1I(shuffle_epi32); // shuffle_epi32<ImmT>(a) forwards to _mm512_shuffle_epi32(a, ImmT)
-
-// SIMD_IWRAPPER_2(shuffle_epi8); -- provided by simdlib_512_avx512_core.inl
-SIMD_DWRAPPER_2I(shuffle_pd); // shuffle_pd<ImmT>(a, b) forwards to _mm512_shuffle_pd(a, b, ImmT)
-SIMD_WRAPPER_2I(shuffle_ps);  // shuffle_ps<ImmT>(a, b) forwards to _mm512_shuffle_ps(a, b, ImmT)
-
-template <int ImmT>
-static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) // 64-bit integer shuffle implemented with shuffle_pd on reinterpreted bits
-{
-    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
-}
-
-SIMD_IWRAPPER_2(unpackhi_epi16); // NOTE(review): Intel's guide lists _mm512_unpackhi_epi16 under AVX512BW, while this file targets AVX512F and unpacklo_epi16 lives in the _core file - confirm placement
-
-// Integer unpack built on the float intrinsic, exactly like unpacklo_epi32
-// below. SIMD_IFWRAPPER_2 adds the _mm512_ prefix itself, so only the bare
-// intrinsic suffix is passed.
-SIMD_IFWRAPPER_2(unpackhi_epi32, unpackhi_ps);
-
-SIMD_IWRAPPER_2(unpackhi_epi64); // See documentation for _mm512_unpackhi_epi64
-// SIMD_IWRAPPER_2(unpackhi_epi8); -- provided by simdlib_512_avx512_core.inl
-SIMD_DWRAPPER_2(unpackhi_pd); // See documentation for _mm512_unpackhi_pd
-SIMD_WRAPPER_2(unpackhi_ps);  // See documentation for _mm512_unpackhi_ps
-// SIMD_IWRAPPER_2(unpacklo_epi16); -- provided by simdlib_512_avx512_core.inl
-SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps); // integer unpack implemented with _mm512_unpacklo_ps on reinterpreted bits
-SIMD_IWRAPPER_2(unpacklo_epi64); // See documentation for _mm512_unpacklo_epi64
-// SIMD_IWRAPPER_2(unpacklo_epi8); -- provided by simdlib_512_avx512_core.inl
-SIMD_DWRAPPER_2(unpacklo_pd); // See documentation for _mm512_unpacklo_pd
-SIMD_WRAPPER_2(unpacklo_ps);  // See documentation for _mm512_unpacklo_ps
-
-//-----------------------------------------------------------------------
-// Load / store operations
-//-----------------------------------------------------------------------
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
-{
-    return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT)); // note: the intrinsic takes (index, base), the wrapper (base, index)
-}
-
-static SIMDINLINE Float SIMDCALL
-                        load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
-{
-    return broadcast_ss(p); // alias of broadcast_ss
-}
-
-static SIMDINLINE Float SIMDCALL
-                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
-{
-    return _mm512_load_ps(p);
-}
-
-static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
-{
-    return _mm512_load_si512(&p->v); // loads through the address of the wrapper's v member
-}
-
-static SIMDINLINE Float SIMDCALL
-                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
-{
-    return _mm512_loadu_ps(p);
-}
-
-static SIMDINLINE Integer SIMDCALL
-                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
-{
-    return _mm512_loadu_si512(p); // passes p itself, whereas load_si passes &p->v
-}
-
-// Masked gather: for each element, (mask & (1 << 31)) selects between the
-// value gathered as in i32gather_ps<ScaleT>(p, idx) and the matching
-// element of old.
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
-{
-    // Reduce the vector mask to one bit per lane by testing its sign bit.
-    Integer   signBit     = set1_epi32(0x80000000);
-    __mmask16 gatherLanes = _mm512_test_epi32_mask(castps_si(mask), signBit);
-
-    return _mm512_mask_i32gather_ps(old, gatherLanes, idx, p, static_cast<int>(ScaleT));
-}
-
-static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
-{
-    // A lane of src is stored only where the matching mask lane compares
-    // (signed) less than zero.
-    Integer zero     = setzero_si();
-    Mask    negLanes = _mm512_cmplt_epi32_mask(mask, zero);
-    _mm512_mask_store_ps(p, negLanes, src);
-}
-
-// static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
-//{
-//    __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
-//    return static_cast<uint64_t>(m);
-//}
-
-static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
-{
-    // Collect the sign bit of every 64-bit lane into the low bits of the result.
-    Integer  signBit  = set1_epi64(0x8000000000000000LL);
-    __mmask8 negLanes = _mm512_test_epi64_mask(castpd_si(a), signBit);
-    return static_cast<uint32_t>(negLanes);
-}
-static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
-{
-    // Collect the sign bit of every 32-bit lane into the low bits of the result.
-    Integer   signBit  = set1_epi32(0x80000000);
-    __mmask16 negLanes = _mm512_test_epi32_mask(castps_si(a), signBit);
-    return static_cast<uint32_t>(negLanes);
-}
-
-static SIMDINLINE Integer SIMDCALL set1_epi64(long long i) // return i (all 64-bit elements are same value)
-{
-    return _mm512_set1_epi64(i);
-}
-
-static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all 32-bit elements are same value)
-{
-    return _mm512_set1_epi32(i);
-}
-
-static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all 8-bit elements are same value)
-{
-    return _mm512_set1_epi8(i);
-}
-
-static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
-{
-    return _mm512_set1_ps(f);
-}
-
-static SIMDINLINE Double SIMDCALL setzero_pd() // return 0 (double)
-{
-    return _mm512_setzero_pd();
-}
-
-static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
-{
-    return _mm512_setzero_ps();
-}
-
-static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
-{
-    return _mm512_setzero_si512();
-}
-
-static SIMDINLINE void SIMDCALL
-                       store_ps(float* p, Float a) // *p = a   (stores all elements contiguously in memory)
-{
-    _mm512_store_ps(p, a);
-}
-
-static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
-{
-    _mm512_store_si512(&p->v, a); // stores through the address of the wrapper's v member
-}
-
-static SIMDINLINE void SIMDCALL
-                       storeu_si(Integer* p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
-{
-    _mm512_storeu_si512(&p->v, a);
-}
-
-static SIMDINLINE void SIMDCALL
-                       stream_ps(float* p, Float a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
-{
-    _mm512_stream_ps(p, a);
-}
-
-static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
-                                             int i14,
-                                             int i13,
-                                             int i12,
-                                             int i11,
-                                             int i10,
-                                             int i9,
-                                             int i8,
-                                             int i7,
-                                             int i6,
-                                             int i5,
-                                             int i4,
-                                             int i3,
-                                             int i2,
-                                             int i1,
-                                             int i0) // build a vector from 16 ints, passed through to _mm512_set_epi32 in the same order
-{
-    return _mm512_set_epi32(i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0);
-}
-
-static SIMDINLINE Integer SIMDCALL
-                          set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) // 8-value overload: i15..i8 are passed as 0
-{
-    return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
-}
-
-static SIMDINLINE Float SIMDCALL set_ps(float i15,
-                                        float i14,
-                                        float i13,
-                                        float i12,
-                                        float i11,
-                                        float i10,
-                                        float i9,
-                                        float i8,
-                                        float i7,
-                                        float i6,
-                                        float i5,
-                                        float i4,
-                                        float i3,
-                                        float i2,
-                                        float i1,
-                                        float i0) // build a vector from 16 floats, passed through to _mm512_set_ps in the same order
-{
-    return _mm512_set_ps(i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0);
-}
-
-static SIMDINLINE Float SIMDCALL
-                        set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) // 8-value overload: i15..i8 are passed as 0
-{
-    return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
-}
-
-static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) // expand the low 16 bits of mask into a vector mask: lanes whose bit is set become all-ones, the rest 0
-{
-    return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1)));
-}
-
-// Undefine every helper macro defined at the top of this file, exactly once
-// each, so that none of them leaks into whatever is included next.
-#undef SIMD_WRAPPER_1_
-#undef SIMD_WRAPPER_1
-#undef SIMD_WRAPPER_2_
-#undef SIMD_WRAPPER_2
-#undef SIMD_WRAPPERI_2_
-#undef SIMD_DWRAPPER_2
-#undef SIMD_WRAPPER_2I_
-#undef SIMD_WRAPPER_2I
-#undef SIMD_DWRAPPER_2I_
-#undef SIMD_DWRAPPER_2I
-#undef SIMD_WRAPPER_3
-#undef SIMD_IWRAPPER_1
-#undef SIMD_IWRAPPER_1_8
-#undef SIMD_IWRAPPER_1_4
-#undef SIMD_IWRAPPER_1I_
-#undef SIMD_IWRAPPER_1I
-#undef SIMD_IWRAPPER_2_
-#undef SIMD_IWRAPPER_2
-#undef SIMD_IWRAPPER_2_CMP
-#undef SIMD_IFWRAPPER_2
-#undef SIMD_IWRAPPER_2I_
-#undef SIMD_IWRAPPER_2I
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl
deleted file mode 100644
index 82aa2bb4173..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl
+++ /dev/null
@@ -1,186 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX512_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-//============================================================================
-// SIMD16 AVX512 (F) implementation for Core processors
-//
-//============================================================================
-
-#define SIMD_WRAPPER_1_(op, intrin) \
-    static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); } // unary Float op; intrin is the complete intrinsic name (no _mm512_ prefix is added)
-
-#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op) // unary Float op forwarding to _mm512_<op>
-
-#define SIMD_WRAPPER_2_(op, intrin) \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); } // binary Float op; unlike SIMD_WRAPPER_1_, intrin is given WITHOUT the _mm512_ prefix
-#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op) // binary Float op forwarding to _mm512_<op>
-
-#define SIMD_WRAPPERI_2_(op, intrin)                                          \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                     \
-    {                                                                         \
-        return _mm512_castsi512_ps(                                           \
-            _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
-    } // binary Float op done by casting to integer vectors, applying _mm512_<intrin>, and casting back
-
-#define SIMD_DWRAPPER_2(op) \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); } // binary Double op forwarding to _mm512_<op>
-
-#define SIMD_WRAPPER_2I_(op, intrin)                      \
-    template <int ImmT>                                   \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
-    {                                                     \
-        return _mm512_##intrin(a, b, ImmT);               \
-    } // binary Float op whose third intrinsic argument is the template constant ImmT
-#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op) // same, with the wrapper named after the intrinsic
-
-#define SIMD_DWRAPPER_2I_(op, intrin)                        \
-    template <int ImmT>                                      \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
-    {                                                        \
-        return _mm512_##intrin(a, b, ImmT);                  \
-    } // Double counterpart of SIMD_WRAPPER_2I_
-#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op) // Double counterpart of SIMD_WRAPPER_2I
-
-#define SIMD_WRAPPER_3(op) \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); } // ternary Float op forwarding to _mm512_<op>
-
-#define SIMD_IWRAPPER_1(op) \
-    static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); } // unary Integer op forwarding to _mm512_<op>
-#define SIMD_IWRAPPER_1_8(op) \
-    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); } // takes a SIMD256Impl::Integer, returns a full-width Integer
-
-#define SIMD_IWRAPPER_1_4(op) \
-    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); } // takes a SIMD128Impl::Integer, returns a full-width Integer
-
-#define SIMD_IWRAPPER_1I_(op, intrin)                \
-    template <int ImmT>                              \
-    static SIMDINLINE Integer SIMDCALL op(Integer a) \
-    {                                                \
-        return intrin(a, ImmT);                      \
-    } // unary Integer op with template constant ImmT; intrin is the complete intrinsic name
-#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op) // same, forwarding to _mm512_<op>
-
-#define SIMD_IWRAPPER_2_(op, intrin) \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); } // binary Integer op forwarding to _mm512_<intrin>
-#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op) // same, with the wrapper named after the intrinsic
-
-#define SIMD_IWRAPPER_2_CMP(op, cmp) \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); } // binary Integer op forwarding to cmp exactly as written (no prefix added)
-
-#define SIMD_IFWRAPPER_2(op, intrin)                                   \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)        \
-    {                                                                  \
-        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
-    } // binary Integer op implemented with the float intrinsic _mm512_<intrin> via castsi_ps / castps_si
-
-#define SIMD_IWRAPPER_2I_(op, intrin)                           \
-    template <int ImmT>                                         \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
-    {                                                           \
-        return _mm512_##intrin(a, b, ImmT);                     \
-    } // binary Integer op whose third intrinsic argument is the template constant ImmT
-#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) // same, with the wrapper named after the intrinsic
-
-private:
-static SIMDINLINE Integer vmask(__mmask32 m) // expand a 32-bit lane mask into a vector: 16-bit lanes whose bit is set become -1, all others 0
-{
-    return _mm512_maskz_set1_epi16(m, -1);
-}
-static SIMDINLINE Integer vmask(__mmask64 m) // expand a 64-bit lane mask into a vector: 8-bit lanes whose bit is set become -1, all others 0
-{
-    return _mm512_maskz_set1_epi8(m, -1);
-}
-
-public:
-SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
-SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
-SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
-
-SIMD_WRAPPER_2(and_ps);    // return a & b       (float treated as int)
-SIMD_WRAPPER_2(andnot_ps); // return (~a) & b    (float treated as int)
-SIMD_WRAPPER_2(or_ps);     // return a | b       (float treated as int)
-SIMD_WRAPPER_2(xor_ps);    // return a ^ b       (float treated as int)
-
-SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a    (uint8 --> int16); input is a SIMD256Impl::Integer
-
-template <CompareTypeInt CmpTypeT>
-static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
-{
-    // Compare per 8-bit lane with predicate CmpTypeT, then expand the 64-bit
-    // lane mask into a legacy all-ones / all-zeros vector mask.
-    return vmask(
-        static_cast<__mmask64>(_mm512_cmp_epi8_mask(a, b, static_cast<const int>(CmpTypeT))));
-}
-template <CompareTypeInt CmpTypeT>
-static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
-{
-    // Compare per 16-bit lane with predicate CmpTypeT, then expand the 32-bit
-    // lane mask into a legacy all-ones / all-zeros vector mask.
-    return vmask(
-        static_cast<__mmask32>(_mm512_cmp_epi16_mask(a, b, static_cast<const int>(CmpTypeT))));
-}
-
-SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>);   // return a == b (int8)
-SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16)
-SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>);   // return a > b (int8)
-SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16)
-
-SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm512_packs_epi16
-SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm512_packs_epi32
-SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16
-SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32
-
-SIMD_IWRAPPER_2(unpackhi_epi8);  // See documentation for _mm512_unpackhi_epi8
-SIMD_IWRAPPER_2(unpacklo_epi16); // See documentation for _mm512_unpacklo_epi16
-SIMD_IWRAPPER_2(unpacklo_epi8);  // See documentation for _mm512_unpacklo_epi8
-
-SIMD_IWRAPPER_2(shuffle_epi8); // See documentation for _mm512_shuffle_epi8
-
-static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
-{
-    // One result bit per 8-bit lane, set where the lane compares (signed)
-    // less than zero.
-    Integer   zero     = setzero_si();
-    __mmask64 negLanes = _mm512_cmplt_epi8_mask(a, zero);
-    return static_cast<uint64_t>(negLanes);
-}
-
-// Undefine every helper macro defined at the top of this file, exactly once
-// each, so that none of them leaks into whatever is included next.
-#undef SIMD_WRAPPER_1_
-#undef SIMD_WRAPPER_1
-#undef SIMD_WRAPPER_2_
-#undef SIMD_WRAPPER_2
-#undef SIMD_WRAPPERI_2_
-#undef SIMD_DWRAPPER_2
-#undef SIMD_WRAPPER_2I_
-#undef SIMD_WRAPPER_2I
-#undef SIMD_DWRAPPER_2I_
-#undef SIMD_DWRAPPER_2I
-#undef SIMD_WRAPPER_3
-#undef SIMD_IWRAPPER_1
-#undef SIMD_IWRAPPER_1_8
-#undef SIMD_IWRAPPER_1_4
-#undef SIMD_IWRAPPER_1I_
-#undef SIMD_IWRAPPER_1I
-#undef SIMD_IWRAPPER_2_
-#undef SIMD_IWRAPPER_2
-#undef SIMD_IWRAPPER_2_CMP
-#undef SIMD_IFWRAPPER_2
-#undef SIMD_IWRAPPER_2I_
-#undef SIMD_IWRAPPER_2I
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl
deleted file mode 100644
index 9ec3ff6c6b1..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl
+++ /dev/null
@@ -1,132 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX512_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-//============================================================================
-// SIMD16 AVX512 (F) implementation for Knights Family Processors
-//
-//============================================================================
-
-#define SIMD_WRAPPER_1_(op, intrin) \
-    static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); }
-// SIMD_WRAPPER_1_ forwards to the intrinsic named explicitly; SIMD_WRAPPER_1 derives it as _mm512_<op>.
-#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)
-// Float binary op forwarding to _mm512_<intrin>; SIMD_WRAPPER_2 reuses the op name as the intrinsic name.
-#define SIMD_WRAPPER_2_(op, intrin) \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); }
-#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
-// Float binary op implemented with an integer intrinsic: cast to si512, apply, cast back to ps.
-#define SIMD_WRAPPERI_2_(op, intrin)                                          \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                     \
-    {                                                                         \
-        return _mm512_castsi512_ps(                                           \
-            _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
-    }
-// Double binary op forwarding to _mm512_<op>.
-#define SIMD_DWRAPPER_2(op) \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); }
-// Float binary op taking a compile-time immediate (template parameter ImmT) as third intrinsic argument.
-#define SIMD_WRAPPER_2I_(op, intrin)                      \
-    template <int ImmT>                                   \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
-    {                                                     \
-        return _mm512_##intrin(a, b, ImmT);               \
-    }
-#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
-// Double binary op taking a compile-time immediate (template parameter ImmT).
-#define SIMD_DWRAPPER_2I_(op, intrin)                        \
-    template <int ImmT>                                      \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
-    {                                                        \
-        return _mm512_##intrin(a, b, ImmT);                  \
-    }
-#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
-// Float ternary op forwarding to _mm512_<op>.
-#define SIMD_WRAPPER_3(op) \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); }
-// Integer unary ops forwarding to _mm512_<op>; the _8 / _4 variants take a SIMD256Impl / SIMD128Impl Integer.
-#define SIMD_IWRAPPER_1(op) \
-    static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); }
-#define SIMD_IWRAPPER_1_8(op) \
-    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); }
-
-#define SIMD_IWRAPPER_1_4(op) \
-    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); }
-// Integer unary op with a compile-time immediate; SIMD_IWRAPPER_1I derives the intrinsic as _mm512_<op>.
-#define SIMD_IWRAPPER_1I_(op, intrin)                \
-    template <int ImmT>                              \
-    static SIMDINLINE Integer SIMDCALL op(Integer a) \
-    {                                                \
-        return intrin(a, ImmT);                      \
-    }
-#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
-// Integer binary op forwarding to _mm512_<intrin>; SIMD_IWRAPPER_2 reuses the op name as the intrinsic name.
-#define SIMD_IWRAPPER_2_(op, intrin) \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); }
-#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
-// Integer binary op forwarding to a caller-supplied comparison callable (no _mm512_ prefix is added).
-#define SIMD_IWRAPPER_2_CMP(op, cmp) \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); }
-// Integer binary op implemented with a float intrinsic: castsi_ps the inputs, castps_si the result.
-#define SIMD_IFWRAPPER_2(op, intrin)                                   \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)        \
-    {                                                                  \
-        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
-    }
-// Integer binary op taking a compile-time immediate (template parameter ImmT).
-#define SIMD_IWRAPPER_2I_(op, intrin)                           \
-    template <int ImmT>                                         \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
-    {                                                           \
-        return _mm512_##intrin(a, b, ImmT);                     \
-    }
-#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
-// Bitwise float ops; in this file they are built on the epi32 integer intrinsics via SIMD_WRAPPERI_2_.
-SIMD_WRAPPERI_2_(and_ps, and_epi32);       // return a & b       (float treated as int)
-SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b    (float treated as int)
-SIMD_WRAPPERI_2_(or_ps, or_epi32);         // return a | b       (float treated as int)
-SIMD_WRAPPERI_2_(xor_ps, xor_epi32);       // return a ^ b       (float treated as int)
-
-// Retire every helper macro defined in this file so none of them leaks into
-// the including translation unit.  Each name is listed once; the previously
-// missing SIMD_DWRAPPER_2I_, SIMD_IWRAPPER_1_8, SIMD_IWRAPPER_1_4,
-// SIMD_IWRAPPER_2_CMP and SIMD_IWRAPPER_2I_ are now covered.
-// SIMD_WRAPPER_3_ is not defined in this file; the #undef is kept because
-// it is harmless (#undef of an undefined name is a no-op).
-#undef SIMD_WRAPPER_1_
-#undef SIMD_WRAPPER_1
-#undef SIMD_WRAPPER_2
-#undef SIMD_WRAPPER_2_
-#undef SIMD_WRAPPERI_2_
-#undef SIMD_DWRAPPER_2
-#undef SIMD_DWRAPPER_2I
-#undef SIMD_DWRAPPER_2I_
-#undef SIMD_WRAPPER_2I_
-#undef SIMD_WRAPPER_2I
-#undef SIMD_WRAPPER_3_
-#undef SIMD_WRAPPER_3
-#undef SIMD_IWRAPPER_1
-#undef SIMD_IWRAPPER_1_8
-#undef SIMD_IWRAPPER_1_4
-#undef SIMD_IWRAPPER_1I
-#undef SIMD_IWRAPPER_1I_
-#undef SIMD_IWRAPPER_2
-#undef SIMD_IWRAPPER_2_
-#undef SIMD_IWRAPPER_2_CMP
-#undef SIMD_IFWRAPPER_2
-#undef SIMD_IWRAPPER_2I
-#undef SIMD_IWRAPPER_2I_
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl
deleted file mode 100644
index f9d4b8c3902..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl
+++ /dev/null
@@ -1,27 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX512_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-// Implement mask-enabled SIMD functions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl
deleted file mode 100644
index f9d4b8c3902..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl
+++ /dev/null
@@ -1,27 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX512_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-// Implement mask-enabled SIMD functions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl
deleted file mode 100644
index f9d4b8c3902..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl
+++ /dev/null
@@ -1,27 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX512_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-// Implement mask-enabled SIMD functions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
deleted file mode 100644
index ec905505dc4..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
+++ /dev/null
@@ -1,852 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-//============================================================================
-// SIMD16 AVX (1) implementation
-//============================================================================
-
-static const int TARGET_SIMD_WIDTH = 8; // used below as the bit shift that selects the second half's part of an immediate
-using SIMD128T                     = SIMD128Impl::AVXImpl;
-// Unary Float op: apply SIMD256T::op to each half (v8[0], v8[1]) independently.
-#define SIMD_WRAPPER_1(op)                              \
-    static SIMDINLINE Float SIMDCALL op(Float const& a) \
-    {                                                   \
-        return Float{                                   \
-            SIMD256T::op(a.v8[0]),                      \
-            SIMD256T::op(a.v8[1]),                      \
-        };                                              \
-    }
-// Binary Float op: apply SIMD256T::op to matching halves of a and b.
-#define SIMD_WRAPPER_2(op)                                              \
-    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
-    {                                                                   \
-        return Float{                                                   \
-            SIMD256T::op(a.v8[0], b.v8[0]),                             \
-            SIMD256T::op(a.v8[1], b.v8[1]),                             \
-        };                                                              \
-    }
-// Binary Float op with immediate: low 8 bits of ImmT drive half 0, the next 8 bits drive half 1.
-#define SIMD_WRAPPER_2I(op)                                                              \
-    template <int ImmT>                                                                  \
-    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b)                  \
-    {                                                                                    \
-        return Float{                                                                    \
-            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),                        \
-            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
-        };                                                                               \
-    }
-// Binary Float op with immediate: the same ImmT is passed unchanged to both halves.
-#define SIMD_WRAPPER_2I_1(op)                                           \
-    template <int ImmT>                                                 \
-    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
-    {                                                                   \
-        return Float{                                                   \
-            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),              \
-            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),              \
-        };                                                              \
-    }
-// Ternary Float op: apply SIMD256T::op to matching halves of a, b and c.
-#define SIMD_WRAPPER_3(op)                                                              \
-    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
-    {                                                                                   \
-        return Float{                                                                   \
-            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),                                    \
-            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),                                    \
-        };                                                                              \
-    }
-// Unary Integer op: apply SIMD256T::op to each half independently.
-#define SIMD_IWRAPPER_1(op)                                 \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
-    {                                                       \
-        return Integer{                                     \
-            SIMD256T::op(a.v8[0]),                          \
-            SIMD256T::op(a.v8[1]),                          \
-        };                                                  \
-    }
-// Binary Integer op: apply SIMD256T::op to matching halves of a and b.
-#define SIMD_IWRAPPER_2(op)                                                   \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
-    {                                                                         \
-        return Integer{                                                       \
-            SIMD256T::op(a.v8[0], b.v8[0]),                                   \
-            SIMD256T::op(a.v8[1], b.v8[1]),                                   \
-        };                                                                    \
-    }
-// Binary Integer op with immediate: low 8 bits of ImmT drive half 0, the next 8 bits drive half 1.
-#define SIMD_IWRAPPER_2I(op)                                                             \
-    template <int ImmT>                                                                  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b)            \
-    {                                                                                    \
-        return Integer{                                                                  \
-            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),                        \
-            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
-        };                                                                               \
-    }
-// Binary Integer op with immediate: the same ImmT is passed unchanged to both halves.
-#define SIMD_IWRAPPER_2I_1(op)                                                \
-    template <int ImmT>                                                       \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
-    {                                                                         \
-        return Integer{                                                       \
-            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),                    \
-            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),                    \
-        };                                                                    \
-    }
-// Binary Integer op with immediate: low 4 bits of ImmT drive half 0, the next 4 bits drive half 1.
-#define SIMD_IWRAPPER_2I_2(op)                                                \
-    template <int ImmT>                                                       \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
-    {                                                                         \
-        return Integer{                                                       \
-            SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),              \
-            SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),       \
-        };                                                                    \
-    }
-// Ternary Integer op: apply SIMD256T::op to matching halves of a, b and c.
-#define SIMD_IWRAPPER_3(op)                                                                     \
-    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
-    {                                                                                           \
-        return Integer{                                                                         \
-            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),                                            \
-            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),                                            \
-        };                                                                                      \
-    }
-
-//-----------------------------------------------------------------------
-// Single precision floating point arithmetic operations
-//-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps);   // return a + b
-SIMD_WRAPPER_2(div_ps);   // return a / b
-SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
-SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
-SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);   // return a * b
-SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a
-SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
-SIMD_WRAPPER_2(sub_ps);   // return a - b
-
-// Round every element of a using the compile-time rounding mode RMT.
-// The value is processed as two halves (v8[0], v8[1]) via SIMD256T.
-template <RoundMode RMT>
-static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
-{
-    const auto lo = SIMD256T::template round_ps<RMT>(a.v8[0]);
-    const auto hi = SIMD256T::template round_ps<RMT>(a.v8[1]);
-    return Float{lo, hi};
-}
-
-// round_ps specialised for RoundMode::CEIL_NOEXC.
-static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
-{
-    constexpr RoundMode kMode = RoundMode::CEIL_NOEXC;
-    return round_ps<kMode>(a);
-}
-
-// round_ps specialised for RoundMode::FLOOR_NOEXC.
-static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
-{
-    constexpr RoundMode kMode = RoundMode::FLOOR_NOEXC;
-    return round_ps<kMode>(a);
-}
-
-//-----------------------------------------------------------------------
-// Integer (various width) arithmetic operations
-//-----------------------------------------------------------------------
-SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
-SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
-SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
-SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
-SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
-SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
-SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
-SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
-SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32) -- NOTE(review): if SIMD256T::mul_epi32 follows _mm256_mul_epi32, it multiplies only the low 32 bits of each 64-bit lane into a 64-bit product; confirm
-
-// return (a * b) & 0xFFFFFFFF
-//
-// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
-// and store the low 32 bits of the intermediate integers in dst.
-SIMD_IWRAPPER_2(mullo_epi32);
-SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
-SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
-SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
-
-//-----------------------------------------------------------------------
-// Logical operations
-//-----------------------------------------------------------------------
-SIMD_WRAPPER_2(and_ps);     // return a & b       (float treated as int)
-SIMD_IWRAPPER_2(and_si);    // return a & b       (int)
-SIMD_WRAPPER_2(andnot_ps);  // return (~a) & b    (float treated as int)
-SIMD_IWRAPPER_2(andnot_si); // return (~a) & b    (int)
-SIMD_WRAPPER_2(or_ps);      // return a | b       (float treated as int)
-SIMD_IWRAPPER_2(or_si);     // return a | b       (int)
-SIMD_WRAPPER_2(xor_ps);     // return a ^ b       (float treated as int)
-SIMD_IWRAPPER_2(xor_si);    // return a ^ b       (int)
-
-//-----------------------------------------------------------------------
-// Shift operations
-//-----------------------------------------------------------------------
-// return a << ImmT, applied to each half separately
-template <int ImmT>
-static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a)
-{
-    const auto lo = SIMD256T::template slli_epi32<ImmT>(a.v8[0]);
-    const auto hi = SIMD256T::template slli_epi32<ImmT>(a.v8[1]);
-    return Integer{lo, hi};
-}
-
-SIMD_IWRAPPER_2(sllv_epi32); // return a << b      (uint32)
-
-// return a >> ImmT   (int32), applied to each half separately
-template <int ImmT>
-static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a)
-{
-    const auto lo = SIMD256T::template srai_epi32<ImmT>(a.v8[0]);
-    const auto hi = SIMD256T::template srai_epi32<ImmT>(a.v8[1]);
-    return Integer{lo, hi};
-}
-
-// return a >> ImmT   (uint32), applied to each half separately
-template <int ImmT>
-static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a)
-{
-    const auto lo = SIMD256T::template srli_epi32<ImmT>(a.v8[0]);
-    const auto hi = SIMD256T::template srli_epi32<ImmT>(a.v8[1]);
-    return Integer{lo, hi};
-}
-
-// for each 128-bit lane: return a >> (ImmT*8) (uint)
-template <int ImmT>
-static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a)
-{
-    const auto lo = SIMD256T::template srli_si<ImmT>(a.v8[0]);
-    const auto hi = SIMD256T::template srli_si<ImmT>(a.v8[1]);
-    return Integer{lo, hi};
-}
-
-// same as srli_si, but with Float cast to int
-template <int ImmT>
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
-{
-    const auto lo = SIMD256T::template srlisi_ps<ImmT>(a.v8[0]);
-    const auto hi = SIMD256T::template srlisi_ps<ImmT>(a.v8[1]);
-    return Float{lo, hi};
-}
-
-SIMD_IWRAPPER_2(srlv_epi32); // return a >> b      (uint32)
-
-//-----------------------------------------------------------------------
-// Conversion operations
-//-----------------------------------------------------------------------
-// return *(Float*)(&a) -- each half is cast by the matching SIMD256T routine
-static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a)
-{
-    const auto lo = SIMD256T::castpd_ps(a.v8[0]);
-    const auto hi = SIMD256T::castpd_ps(a.v8[1]);
-    return Float{lo, hi};
-}
-
-// return *(Integer*)(&a)
-static SIMDINLINE Integer SIMDCALL castps_si(Float const& a)
-{
-    const auto lo = SIMD256T::castps_si(a.v8[0]);
-    const auto hi = SIMD256T::castps_si(a.v8[1]);
-    return Integer{lo, hi};
-}
-
-// return *(Double*)(&a)
-static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a)
-{
-    const auto lo = SIMD256T::castsi_pd(a.v8[0]);
-    const auto hi = SIMD256T::castsi_pd(a.v8[1]);
-    return Double{lo, hi};
-}
-
-// return *(Double*)(&a)
-static SIMDINLINE Double SIMDCALL castps_pd(Float const& a)
-{
-    const auto lo = SIMD256T::castps_pd(a.v8[0]);
-    const auto hi = SIMD256T::castps_pd(a.v8[1]);
-    return Double{lo, hi};
-}
-
-// return *(Float*)(&a)
-static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a)
-{
-    const auto lo = SIMD256T::castsi_ps(a.v8[0]);
-    const auto hi = SIMD256T::castsi_ps(a.v8[1]);
-    return Float{lo, hi};
-}
-
-// return (float)a    (int32 --> float)
-static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const& a)
-{
-    const auto lo = SIMD256T::cvtepi32_ps(a.v8[0]);
-    const auto hi = SIMD256T::cvtepi32_ps(a.v8[1]);
-    return Float{lo, hi};
-}
-
-// return (int16)a    (uint8 --> int16)
-// Each 128-bit quarter of the input (v4[0], v4[1]) feeds one output half.
-static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer const& a)
-{
-    const auto lo = SIMD256T::cvtepu8_epi16(a.v4[0]);
-    const auto hi = SIMD256T::cvtepu8_epi16(a.v4[1]);
-    return Integer{lo, hi};
-}
-
-// return (int32)a    (uint8 --> int32)
-// Only a.v4[0] is read: the second output half is converted from v4[0]
-// after srli_si<8> has shifted it right by 8 bytes.
-static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer const& a)
-{
-    const auto lo = SIMD256T::cvtepu8_epi32(a.v4[0]);
-    const auto hi = SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0]));
-    return Integer{lo, hi};
-}
-
-// return (int32)a    (uint16 --> int32)
-static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer const& a)
-{
-    const auto lo = SIMD256T::cvtepu16_epi32(a.v4[0]);
-    const auto hi = SIMD256T::cvtepu16_epi32(a.v4[1]);
-    return Integer{lo, hi};
-}
-
-// return (int64)a    (uint16 --> int64)
-// Only a.v4[0] is read: the second output half is converted from v4[0]
-// after srli_si<8> has shifted it right by 8 bytes.
-static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer const& a)
-{
-    const auto lo = SIMD256T::cvtepu16_epi64(a.v4[0]);
-    const auto hi = SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0]));
-    return Integer{lo, hi};
-}
-
-// return (int64)a    (uint32 --> int64)
-static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer const& a)
-{
-    const auto lo = SIMD256T::cvtepu32_epi64(a.v4[0]);
-    const auto hi = SIMD256T::cvtepu32_epi64(a.v4[1]);
-    return Integer{lo, hi};
-}
-
-// return (int32)a    (float --> int32)
-static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const& a)
-{
-    const auto lo = SIMD256T::cvtps_epi32(a.v8[0]);
-    const auto hi = SIMD256T::cvtps_epi32(a.v8[1]);
-    return Integer{lo, hi};
-}
-
-// return (int32)a    (rnd_to_zero(float) --> int32)
-//
-// Converts with truncation.  This must forward to the truncating
-// SIMD256T::cvttps_epi32; forwarding to SIMD256T::cvtps_epi32 (as this
-// function previously did) makes it indistinguishable from cvtps_epi32
-// above, so it would not truncate as documented.
-static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const& a)
-{
-    return Integer{
-        SIMD256T::cvttps_epi32(a.v8[0]),
-        SIMD256T::cvttps_epi32(a.v8[1]),
-    };
-}
-
-//-----------------------------------------------------------------------
-// Comparison operations
-//-----------------------------------------------------------------------
-// return a (CmpTypeT) b, evaluated on each half separately
-template <CompareType CmpTypeT>
-static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b)
-{
-    const auto lo = SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]);
-    const auto hi = SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]);
-    return Float{lo, hi};
-}
-
-// cmp_ps with CompareType::LT_OQ
-static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
-{
-    constexpr CompareType kPredicate = CompareType::LT_OQ;
-    return cmp_ps<kPredicate>(a, b);
-}
-
-// cmp_ps with CompareType::GT_OQ
-static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
-{
-    constexpr CompareType kPredicate = CompareType::GT_OQ;
-    return cmp_ps<kPredicate>(a, b);
-}
-
-// cmp_ps with CompareType::NEQ_OQ
-static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
-{
-    constexpr CompareType kPredicate = CompareType::NEQ_OQ;
-    return cmp_ps<kPredicate>(a, b);
-}
-
-// cmp_ps with CompareType::EQ_OQ
-static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
-{
-    constexpr CompareType kPredicate = CompareType::EQ_OQ;
-    return cmp_ps<kPredicate>(a, b);
-}
-
-// cmp_ps with CompareType::GE_OQ
-static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
-{
-    constexpr CompareType kPredicate = CompareType::GE_OQ;
-    return cmp_ps<kPredicate>(a, b);
-}
-
-// cmp_ps with CompareType::LE_OQ
-static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
-{
-    constexpr CompareType kPredicate = CompareType::LE_OQ;
-    return cmp_ps<kPredicate>(a, b);
-}
-
-// Run cmp_ps<CmpTypeT> and collapse the result to a Mask via movemask_ps.
-template <CompareType CmpTypeT>
-static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b)
-{
-    const Float lanes = cmp_ps<CmpTypeT>(a, b);
-    return static_cast<Mask>(movemask_ps(lanes));
-}
-
-SIMD_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
-SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
-SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
-SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
-SIMD_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
-SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
-SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
-SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
-SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
-
-// return all_lanes_zero(a & b) ? 1 : 0 (float)
-// True only when the testz result of both halves is non-zero.
-static SIMDINLINE bool SIMDCALL testz_ps(Float const& a, Float const& b)
-{
-    const auto lo = SIMD256T::testz_ps(a.v8[0], b.v8[0]);
-    const auto hi = SIMD256T::testz_ps(a.v8[1], b.v8[1]);
-    return (lo & hi) != 0;
-}
-
-// return all_lanes_zero(a & b) ? 1 : 0 (int)
-// True only when the testz result of both halves is non-zero.
-static SIMDINLINE bool SIMDCALL testz_si(Integer const& a, Integer const& b)
-{
-    const auto lo = SIMD256T::testz_si(a.v8[0], b.v8[0]);
-    const auto hi = SIMD256T::testz_si(a.v8[1], b.v8[1]);
-    return (lo & hi) != 0;
-}
-
-//-----------------------------------------------------------------------
-// Blend / shuffle / permute operations
-//-----------------------------------------------------------------------
-SIMD_WRAPPER_2I(blend_ps);     // return ImmT ? b : a  (float)
-SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a  (int32)
-SIMD_WRAPPER_3(blendv_ps);     // return mask ? b : a  (float)
-// return mask ? b : a (int), with the selector supplied as a Float
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
-                                                Integer const& b,
-                                                Float const&   mask)
-{
-    const auto lo = SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]);
-    const auto hi = SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]);
-    return Integer{lo, hi};
-}
-
-// return mask ? b : a (int), with the selector supplied as an Integer
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
-                                                Integer const& b,
-                                                Integer const& mask)
-{
-    const auto lo = SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]);
-    const auto hi = SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]);
-    return Integer{lo, hi};
-}
-
-static SIMDINLINE Float SIMDCALL
-                        broadcast_ss(float const* p) // return *p (all elements in vector get same value)
-{
-    float f = *p;
-    return Float{
-        SIMD256T::set1_ps(f),
-        SIMD256T::set1_ps(f),
-    };
-}
-
-template <int imm>
-static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a)
-{
-    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
-    return a.v8[imm];
-}
-
-template <int imm>
-static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a)
-{
-    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
-    return a.v8[imm];
-}
-
-template <int imm>
-static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a)
-{
-    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
-    return a.v8[imm];
-}
-
-template <int imm>
-static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b)
-{
-    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
-    Float r   = a;
-    r.v8[imm] = b;
-    return r;
-}
-
-template <int imm>
-static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b)
-{
-    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
-    Double r  = a;
-    r.v8[imm] = b;
-    return r;
-}
-
-template <int imm>
-static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b)
-{
-    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
-    Integer r = a;
-    r.v8[imm] = b;
-    return r;
-}
-
-SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
-
-template <int ImmT>
-static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
-{
-    return Float{
-        SIMD256T::template permute_ps<ImmT>(a.v8[0]),
-        SIMD256T::template permute_ps<ImmT>(a.v8[1]),
-    };
-}
-
-static SIMDINLINE Integer SIMDCALL permute_epi32(
-    Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
-{
-    return castps_si(permute_ps(castsi_ps(a), swiz));
-}
-
-static SIMDINLINE Float SIMDCALL
-                        permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
-{
-    const auto mask = SIMD256T::set1_epi32(7);
-
-    auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
-    auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
-
-    auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
-    auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
-
-    return Float{
-        SIMD256T::blendv_ps(
-            lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
-        SIMD256T::blendv_ps(
-            hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
-    };
-}
-
-// All of the 512-bit permute2f128_XX intrinsics do the following:
-//
-//      SELECT4(src, control) {
-//          CASE(control[1:0])
-//              0 : tmp[127:0] : = src[127:0]
-//              1 : tmp[127:0] : = src[255:128]
-//              2 : tmp[127:0] : = src[383:256]
-//              3 : tmp[127:0] : = src[511:384]
-//              ESAC
-//              RETURN tmp[127:0]
-//      }
-//
-//      dst[127:0]   : = SELECT4(a[511:0], imm8[1:0])
-//      dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
-//      dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
-//      dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
-//      dst[MAX:512] : = 0
-//
-// Since the 256-bit AVX instructions use a 4-bit control field (instead
-// of 2-bit for AVX512), we need to expand the control bits sent to the
-// AVX instructions for emulation.
-//
-template <int shuf>
-static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b)
-{
-    return Float{
-        SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
-                                                                                        a.v8[1]),
-        SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
-                                                                                        b.v8[1]),
-    };
-}
-
-template <int shuf>
-static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b)
-{
-    return Double{
-        SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
-                                                                                        a.v8[1]),
-        SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
-                                                                                        b.v8[1]),
-    };
-}
-
-template <int shuf>
-static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b)
-{
-    return Integer{
-        SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
-                                                                                        a.v8[1]),
-        SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
-                                                                                        b.v8[1]),
-    };
-}
-
-SIMD_IWRAPPER_2I_1(shuffle_epi32);
-SIMD_IWRAPPER_2I_2(shuffle_epi64);
-SIMD_IWRAPPER_2(shuffle_epi8);
-SIMD_WRAPPER_2I_1(shuffle_pd);
-SIMD_WRAPPER_2I_1(shuffle_ps);
-SIMD_IWRAPPER_2(unpackhi_epi16);
-SIMD_IWRAPPER_2(unpackhi_epi32);
-SIMD_IWRAPPER_2(unpackhi_epi64);
-SIMD_IWRAPPER_2(unpackhi_epi8);
-SIMD_WRAPPER_2(unpackhi_pd);
-SIMD_WRAPPER_2(unpackhi_ps);
-SIMD_IWRAPPER_2(unpacklo_epi16);
-SIMD_IWRAPPER_2(unpacklo_epi32);
-SIMD_IWRAPPER_2(unpacklo_epi64);
-SIMD_IWRAPPER_2(unpacklo_epi8);
-SIMD_WRAPPER_2(unpacklo_pd);
-SIMD_WRAPPER_2(unpacklo_ps);
-
-//-----------------------------------------------------------------------
-// Load / store operations
-//-----------------------------------------------------------------------
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
-{
-    return Float{
-        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
-        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
-    };
-}
-
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
-{
-    return Float{
-        SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[0]),
-        SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[1]),
-    };
-}
-
-static SIMDINLINE Float SIMDCALL
-                        load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
-{
-    return broadcast_ss(p);
-}
-
-static SIMDINLINE Float SIMDCALL
-                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
-{
-    return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)};
-}
-
-static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
-{
-    return Integer{
-        SIMD256T::load_si(&p->v8[0]),
-        SIMD256T::load_si(&p->v8[1]),
-    };
-}
-
-static SIMDINLINE Float SIMDCALL
-                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
-{
-    return Float{SIMD256T::loadu_ps(p), SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)};
-}
-
-static SIMDINLINE Integer SIMDCALL
-                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
-{
-    return Integer{
-        SIMD256T::loadu_si(&p->v8[0]),
-        SIMD256T::loadu_si(&p->v8[1]),
-    };
-}
-
-// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
-{
-    return Float{
-        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
-        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
-    };
-}
-
-template <ScaleFactor ScaleT = ScaleFactor::SF_1>
-static SIMDINLINE Float SIMDCALL
-                        sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
-{
-    return Float{
-        SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
-        SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
-    };
-}
-
-static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
-{
-    SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
-    SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
-}
-
-static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a)
-{
-    uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
-    mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
-
-    return mask;
-}
-
-static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
-{
-    uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
-    mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
-
-    return mask;
-}
-static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
-{
-    uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
-    mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
-
-    return mask;
-}
-
-static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
-{
-    return Integer{SIMD256T::set1_epi32(i), SIMD256T::set1_epi32(i)};
-}
-
-static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
-{
-    return Integer{SIMD256T::set1_epi8(i), SIMD256T::set1_epi8(i)};
-}
-
-static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
-{
-    return Float{SIMD256T::set1_ps(f), SIMD256T::set1_ps(f)};
-}
-
-static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
-{
-    return Float{SIMD256T::setzero_ps(), SIMD256T::setzero_ps()};
-}
-
-static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
-{
-    return Integer{SIMD256T::setzero_si(), SIMD256T::setzero_si()};
-}
-
-static SIMDINLINE void SIMDCALL
-                       store_ps(float* p, Float const& a) // *p = a   (stores all elements contiguously in memory)
-{
-    SIMD256T::store_ps(p, a.v8[0]);
-    SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
-}
-
-static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
-{
-    SIMD256T::store_si(&p->v8[0], a.v8[0]);
-    SIMD256T::store_si(&p->v8[1], a.v8[1]);
-}
-
-static SIMDINLINE void SIMDCALL
-                       stream_ps(float* p, Float const& a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
-{
-    SIMD256T::stream_ps(p, a.v8[0]);
-    SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
-}
-
-static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
-                                             int i14,
-                                             int i13,
-                                             int i12,
-                                             int i11,
-                                             int i10,
-                                             int i9,
-                                             int i8,
-                                             int i7,
-                                             int i6,
-                                             int i5,
-                                             int i4,
-                                             int i3,
-                                             int i2,
-                                             int i1,
-                                             int i0)
-{
-    return Integer{SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0),
-                   SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8)};
-}
-
-static SIMDINLINE Integer SIMDCALL
-                          set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
-{
-    return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
-}
-
-static SIMDINLINE Float SIMDCALL set_ps(float i15,
-                                        float i14,
-                                        float i13,
-                                        float i12,
-                                        float i11,
-                                        float i10,
-                                        float i9,
-                                        float i8,
-                                        float i7,
-                                        float i6,
-                                        float i5,
-                                        float i4,
-                                        float i3,
-                                        float i2,
-                                        float i1,
-                                        float i0)
-{
-    return Float{SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0),
-                 SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8)};
-}
-
-static SIMDINLINE Float SIMDCALL
-                        set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
-{
-    return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
-}
-
-static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
-{
-    return Float{SIMD256T::vmask_ps(mask), SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)};
-}
-
-#undef SIMD_WRAPPER_1
-#undef SIMD_WRAPPER_2
-#undef SIMD_WRAPPER_2I
-#undef SIMD_WRAPPER_2I_1
-#undef SIMD_WRAPPER_3
-#undef SIMD_IWRAPPER_1
-#undef SIMD_IWRAPPER_2
-#undef SIMD_IWRAPPER_2I
-#undef SIMD_IWRAPPER_2I_1
-#undef SIMD_IWRAPPER_3
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl
deleted file mode 100644
index 473934824ee..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl
+++ /dev/null
@@ -1,27 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#if !defined(__SIMD_LIB_AVX_HPP__)
-#error Do not include this file directly, use "simdlib.hpp" instead.
-#endif
-
-// no backwards compatibility for simd mask-enabled functions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
deleted file mode 100644
index 3d31b39ee55..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
+++ /dev/null
@@ -1,332 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#pragma once
-#if 0
-//===========================================================================
-// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
-//===========================================================================
-struct SIMD256 // or SIMD4 or SIMD16
-{
-    //=======================================================================
-    // SIMD Types
-    //
-    // These typedefs are examples. The SIMD256 and SIMD16 implementations will
-    // use different base types with this same naming.
-    using Float     = __m256;  // Packed single-precision float vector
-    using Double    = __m256d; // Packed double-precision float vector
-    using Integer   = __m256i; // Packed integer vector (mutable element widths)
-    using Mask      = uint8_t; // Integer representing mask bits
-
-    //=======================================================================
-    // Standard interface
-    // (available in both SIMD256 and SIMD16 widths)
-    //=======================================================================
-
-    //-----------------------------------------------------------------------
-    // Single precision floating point arithmetic operations
-    //-----------------------------------------------------------------------
-    static Float    add_ps(Float a, Float b);               // return a + b
-    static Float    div_ps(Float a, Float b);               // return a / b
-    static Float    fmadd_ps(Float a, Float b, Float c);    // return (a * b) + c
-    static Float    fmsub_ps(Float a, Float b, Float c);    // return (a * b) - c
-    static Float    max_ps(Float a, Float b);               // return (a > b) ? a : b
-    static Float    min_ps(Float a, Float b);               // return (a < b) ? a : b
-    static Float    mul_ps(Float a, Float b);               // return a * b
-    static Float    rcp_ps(Float a);                        // return 1.0f / a
-    static Float    rsqrt_ps(Float a);                      // return 1.0f / sqrt(a)
-    static Float    sub_ps(Float a, Float b);               // return a - b
-
-    enum class RoundMode
-    {
-        TO_NEAREST_INT  = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5)
-        TO_NEG_INF      = 0x01, // Round to negative infinity
-        TO_POS_INF      = 0x02, // Round to positive infinity
-        TO_ZERO         = 0x03, // Round to 0 a.k.a. truncate
-        CUR_DIRECTION   = 0x04, // Round in direction set in MXCSR register
-
-        RAISE_EXC       = 0x00, // Raise exception on overflow
-        NO_EXC          = 0x08, // Suppress exceptions
-
-        NINT            = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(RAISE_EXC),
-        NINT_NOEXC      = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(NO_EXC),
-        FLOOR           = static_cast<int>(TO_NEG_INF)      | static_cast<int>(RAISE_EXC),
-        FLOOR_NOEXC     = static_cast<int>(TO_NEG_INF)      | static_cast<int>(NO_EXC),
-        CEIL            = static_cast<int>(TO_POS_INF)      | static_cast<int>(RAISE_EXC),
-        CEIL_NOEXC      = static_cast<int>(TO_POS_INF)      | static_cast<int>(NO_EXC),
-        TRUNC           = static_cast<int>(TO_ZERO)         | static_cast<int>(RAISE_EXC),
-        TRUNC_NOEXC     = static_cast<int>(TO_ZERO)         | static_cast<int>(NO_EXC),
-        RINT            = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(RAISE_EXC),
-        NEARBYINT       = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(NO_EXC),
-    };
-
-    // return round_func(a)
-    //
-    // round_func is chosen on the RMT template parameter.  See the documentation
-    // for the RoundMode enumeration above.
-    template <RoundMode RMT>
-    static Float    round_ps(Float a);                  // return round(a) 
-
-
-    //-----------------------------------------------------------------------
-    // Integer (various width) arithmetic operations
-    //-----------------------------------------------------------------------
-    static Integer  abs_epi32(Integer a);               // return absolute_value(a) (int32)
-    static Integer  add_epi32(Integer a, Integer b);    // return a + b (int32)
-    static Integer  add_epi8(Integer a, Integer b);     // return a + b (int8)
-    static Integer  adds_epu8(Integer a, Integer b);    // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
-    static Integer  max_epi32(Integer a, Integer b);    // return (a > b) ? a : b (int32)
-    static Integer  max_epu32(Integer a, Integer b);    // return (a > b) ? a : b (uint32)
-    static Integer  min_epi32(Integer a, Integer b);    // return (a < b) ? a : b (int32)
-    static Integer  min_epu32(Integer a, Integer b);    // return (a < b) ? a : b (uint32)
-    static Integer  mul_epi32(Integer a, Integer b);    // return a * b (int32)
-
-    // return (a * b) & 0xFFFFFFFF
-    //
-    // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
-    // and store the low 32 bits of the intermediate integers in dst.
-    static Float    mullo_epi32(Integer a, Integer b);
-
-    static Integer  sub_epi32(Integer a, Integer b);    // return a - b (int32)
-    static Integer  sub_epi64(Integer a, Integer b);    // return a - b (int64)
-    static Integer  subs_epu8(Integer a, Integer b);    // return (b > a) ? 0 : (a - b) (uint8)
-
-    //-----------------------------------------------------------------------
-    // Logical operations
-    //-----------------------------------------------------------------------
-    static Float    and_ps(Float a, Float b);           // return a & b       (float treated as int)
-    static Integer  and_si(Integer a, Integer b);       // return a & b       (int)
-    static Float    andnot_ps(Float a, Float b);        // return (~a) & b    (float treated as int)
-    static Integer  andnot_si(Integer a, Integer b);    // return (~a) & b    (int)
-    static Float    or_ps(Float a, Float b);            // return a | b       (float treated as int)
-    static Float    or_si(Integer a, Integer b);        // return a | b       (int)
-    static Float    xor_ps(Float a, Float b);           // return a ^ b       (float treated as int)
-    static Integer  xor_si(Integer a, Integer b);       // return a ^ b       (int)
-
-    //-----------------------------------------------------------------------
-    // Shift operations
-    //-----------------------------------------------------------------------
-    template<int ImmT>
-    static Integer  slli_epi32(Integer a);              // return a << ImmT
-    static Integer  sllv_epi32(Integer a, Integer b);   // return a << b
-    template<int ImmT>
-    static Integer  srai_epi32(Integer a);              // return a >> ImmT   (int32)
-    template<int ImmT>
-    static Integer  srli_epi32(Integer a);              // return a >> ImmT   (uint32)
-    template<int ImmT>                                  // for each 128-bit lane:
-    static Integer  srli_si(Integer a);                 //  return a >> (ImmT*8) (uint)
-    template<int ImmT>
-    static Float    srlisi_ps(Float a);                 // same as srli_si, but with Float cast to int
-    static Integer  srlv_epi32(Integer a, Integer b);   // return a >> b      (uint32)
-
-    //-----------------------------------------------------------------------
-    // Conversion operations
-    //-----------------------------------------------------------------------
-    static Float    castpd_ps(Double a);                // return *(Float*)(&a)
-    static Integer  castps_si(Float a);                 // return *(Integer*)(&a)
-    static Double   castsi_pd(Integer a);               // return *(Double*)(&a)
-    static Double   castps_pd(Float a);                 // return *(Double*)(&a)
-    static Float    castsi_ps(Integer a);               // return *(Float*)(&a)
-    static Float    cvtepi32_ps(Integer a);             // return (float)a    (int32 --> float)
-    static Integer  cvtepu8_epi16(Integer a);           // return (int16)a    (uint8 --> int16)
-    static Integer  cvtepu8_epi32(Integer a);           // return (int32)a    (uint8 --> int32)
-    static Integer  cvtepu16_epi32(Integer a);          // return (int32)a    (uint16 --> int32)
-    static Integer  cvtepu16_epi64(Integer a);          // return (int64)a    (uint16 --> int64)
-    static Integer  cvtepu32_epi64(Integer a);          // return (int64)a    (uint32 --> int64)
-    static Integer  cvtps_epi32(Float a);               // return (int32)a    (float --> int32)
-    static Integer  cvttps_epi32(Float a);              // return (int32)a    (rnd_to_zero(float) --> int32)
-
-    //-----------------------------------------------------------------------
-    // Comparison operations
-    //-----------------------------------------------------------------------
-
-    // Comparison types used with cmp_ps:
-    //   - ordered comparisons are always false if either operand is NaN
-    //   - unordered comparisons are always true if either operand is NaN
-    //   - signaling comparisons raise an exception if either operand is NaN
-    //   - non-signaling comparisons will never raise an exception
-    // 
-    // Ordered:     return (a != NaN) && (b != NaN) && (a cmp b)
-    // Unordered:   return (a == NaN) || (b == NaN) || (a cmp b)
-    enum class CompareType
-    {
-        EQ_OQ      = 0x00, // Equal (ordered, nonsignaling)
-        LT_OS      = 0x01, // Less-than (ordered, signaling)
-        LE_OS      = 0x02, // Less-than-or-equal (ordered, signaling)
-        UNORD_Q    = 0x03, // Unordered (nonsignaling)
-        NEQ_UQ     = 0x04, // Not-equal (unordered, nonsignaling)
-        NLT_US     = 0x05, // Not-less-than (unordered, signaling)
-        NLE_US     = 0x06, // Not-less-than-or-equal (unordered, signaling)
-        ORD_Q      = 0x07, // Ordered (nonsignaling)
-        EQ_UQ      = 0x08, // Equal (unordered, non-signaling)
-        NGE_US     = 0x09, // Not-greater-than-or-equal (unordered, signaling)
-        NGT_US     = 0x0A, // Not-greater-than (unordered, signaling)
-        FALSE_OQ   = 0x0B, // False (ordered, nonsignaling)
-        NEQ_OQ     = 0x0C, // Not-equal (ordered, non-signaling)
-        GE_OS      = 0x0D, // Greater-than-or-equal (ordered, signaling)
-        GT_OS      = 0x0E, // Greater-than (ordered, signaling)
-        TRUE_UQ    = 0x0F, // True (unordered, non-signaling)
-        EQ_OS      = 0x10, // Equal (ordered, signaling)
-        LT_OQ      = 0x11, // Less-than (ordered, nonsignaling)
-        LE_OQ      = 0x12, // Less-than-or-equal (ordered, nonsignaling)
-        UNORD_S    = 0x13, // Unordered (signaling)
-        NEQ_US     = 0x14, // Not-equal (unordered, signaling)
-        NLT_UQ     = 0x15, // Not-less-than (unordered, nonsignaling)
-        NLE_UQ     = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
-        ORD_S      = 0x17, // Ordered (signaling)
-        EQ_US      = 0x18, // Equal (unordered, signaling)
-        NGE_UQ     = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
-        NGT_UQ     = 0x1A, // Not-greater-than (unordered, nonsignaling)
-        FALSE_OS   = 0x1B, // False (ordered, signaling)
-        NEQ_OS     = 0x1C, // Not-equal (ordered, signaling)
-        GE_OQ      = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
-        GT_OQ      = 0x1E, // Greater-than (ordered, nonsignaling)
-        TRUE_US    = 0x1F, // True (unordered, signaling)
-    };
-
-    // return a (CmpTypeT) b (float)
-    //
-    // See documentation for CompareType above for valid values for CmpTypeT.
-    template<CompareType CmpTypeT>
-    static Float    cmp_ps(Float a, Float b);           // return a (CmtTypeT) b (see above)
-    static Float    cmpgt_ps(Float a, Float b);         // return cmp_ps<CompareType::GT_OQ>(a, b)
-    static Float    cmple_ps(Float a, Float b);         // return cmp_ps<CompareType::LE_OQ>(a, b)
-    static Float    cmplt_ps(Float a, Float b);         // return cmp_ps<CompareType::LT_OQ>(a, b)
-    static Float    cmpneq_ps(Float a, Float b);        // return cmp_ps<CompareType::NEQ_OQ>(a, b)
-    static Float    cmpeq_ps(Float a, Float b);         // return cmp_ps<CompareType::EQ_OQ>(a, b)
-    static Float    cmpge_ps(Float a, Float b);         // return cmp_ps<CompareType::GE_OQ>(a, b)
-    static Integer  cmpeq_epi8(Integer a, Integer b);   // return a == b (int8)
-    static Integer  cmpeq_epi16(Integer a, Integer b);  // return a == b (int16)
-    static Integer  cmpeq_epi32(Integer a, Integer b);  // return a == b (int32)
-    static Integer  cmpeq_epi64(Integer a, Integer b);  // return a == b (int64)
-    static Integer  cmpgt_epi8(Integer a, Integer b);   // return a > b (int8)
-    static Integer  cmpgt_epi16(Integer a, Integer b);  // return a > b (int16)
-    static Integer  cmpgt_epi32(Integer a, Integer b);  // return a > b (int32)
-    static Integer  cmpgt_epi64(Integer a, Integer b);  // return a > b (int64)
-    static Integer  cmplt_epi32(Integer a, Integer b);  // return a < b (int32)
-    static bool     testz_ps(Float a, Float b);         // return all_lanes_zero(a & b) ? 1 : 0 (float)
-    static bool     testz_si(Integer a, Integer b);     // return all_lanes_zero(a & b) ? 1 : 0 (int)
-
-    //-----------------------------------------------------------------------
-    // Blend / shuffle / permute operations
-    //-----------------------------------------------------------------------
-    template<int ImmT>
-    static Float    blend_ps(Float a, Float b);                     // return ImmT ? b : a  (float)
-    static Integer  blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
-    static Float    blendv_ps(Float a, Float b, Float mask);        // return mask ? b : a (float)
-    static Float    broadcast_ss(float const *p);                   // return *p (all elements in vector get same value)
-    static Integer  packs_epi16(Integer a, Integer b);              // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-    static Integer  packs_epi32(Integer a, Integer b);              // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-    static Integer  packus_epi16(Integer a, Integer b);             // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-    static Integer  packus_epi32(Integer a, Integer b);             // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
-    static Float    permute_epi32(Integer a, Integer swiz);         // return a[swiz[i]] for each 32-bit lane i (int32)
-    static Float    permute_ps(Float a, Integer swiz);              // return a[swiz[i]] for each 32-bit lane i (float)
-    template<int SwizT>
-    static Integer  shuffle_epi32(Integer a, Integer b);    
-    template<int SwizT>
-    static Integer  shuffle_epi64(Integer a, Integer b);
-    static Integer  shuffle_epi8(Integer a, Integer b);
-    template<int SwizT>
-    static Float    shuffle_pd(Double a, Double b);
-    template<int SwizT>
-    static Float    shuffle_ps(Float a, Float b);
-    static Integer  unpackhi_epi16(Integer a, Integer b);
-    static Integer  unpackhi_epi32(Integer a, Integer b);
-    static Integer  unpackhi_epi64(Integer a, Integer b);
-    static Integer  unpackhi_epi8(Integer a, Integer b);
-    static Float    unpackhi_pd(Double a, Double b);
-    static Float    unpackhi_ps(Float a, Float b);
-    static Integer  unpacklo_epi16(Integer a, Integer b);
-    static Integer  unpacklo_epi32(Integer a, Integer b);
-    static Integer  unpacklo_epi64(Integer a, Integer b);
-    static Integer  unpacklo_epi8(Integer a, Integer b);
-    static Float    unpacklo_pd(Double a, Double b);
-    static Float    unpacklo_ps(Float a, Float b);
-
-    //-----------------------------------------------------------------------
-    // Load / store operations
-    //-----------------------------------------------------------------------
-    enum class ScaleFactor
-    {
-        SF_1,   // No scaling
-        SF_2,   // Scale offset by 2
-        SF_4,   // Scale offset by 4
-        SF_8,   // Scale offset by 8
-    };
-
-    template<ScaleFactor ScaleT = ScaleFactor::SF_1>
-    static Float    i32gather_ps(float const* p, Integer idx);  // return *(float*)(((int8*)p) + (idx * ScaleT))
-    static Float    load1_ps(float const *p);                   // return *p    (broadcast 1 value to all elements)
-    static Float    load_ps(float const *p);                    // return *p    (loads SIMD width elements from memory)
-    static Integer  load_si(Integer const *p);                  // return *p
-    static Float    loadu_ps(float const *p);                   // return *p    (same as load_ps but allows for unaligned mem)
-    static Integer  loadu_si(Integer const *p);                 // return *p    (same as load_si but allows for unaligned mem)
-
-    // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-    template<int ScaleT>
-    static Float    mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);
-
-    static void     maskstore_ps(float *p, Integer mask, Float src);
-    static int      movemask_epi8(Integer a);
-    static int      movemask_pd(Double a);
-    static int      movemask_ps(Float a);
-    static Integer  set1_epi32(int i);                          // return i (all elements are same value)
-    static Integer  set1_epi8(char i);                          // return i (all elements are same value)
-    static Float    set1_ps(float f);                           // return f (all elements are same value)
-    static Float    setzero_ps();                               // return 0 (float)
-    static Integer  setzero_si();                               // return 0 (integer)
-    static void     store_ps(float *p, Float a);                // *p = a   (stores all elements contiguously in memory)
-    static void     store_si(Integer *p, Integer a);            // *p = a
-    static void     stream_ps(float *p, Float a);               // *p = a   (same as store_ps, but doesn't keep memory in cache)
-
-    //=======================================================================
-    // Legacy interface (available only in SIMD256 width)
-    //=======================================================================
-
-    static Float    broadcast_ps(__m128 const *p);
-    template<int ImmT>
-    static __m128d  extractf128_pd(Double a);
-    template<int ImmT>
-    static __m128   extractf128_ps(Float a);
-    template<int ImmT>
-    static __m128i  extractf128_si(Integer a);
-    template<int ImmT>
-    static Double   insertf128_pd(Double a, __m128d b);
-    template<int ImmT>
-    static Float    insertf128_ps(Float a, __m128 b);
-    template<int ImmT>
-    static Integer  insertf128_si(Integer a, __m128i b);
-    static Integer  loadu2_si(__m128 const* phi, __m128 const* plo);
-    template<int ImmT>
-    static Double   permute2f128_pd(Double a, Double b);
-    template<int ImmT>
-    static Float    permute2f128_ps(Float a, Float b);
-    template<int ImmT>
-    static Integer  permute2f128_si(Integer a, Integer b);
-    static Integer  set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
-    static void     storeu2_si(__m128i *phi, __m128i *plo, Integer src);
-
-    //=======================================================================
-    // Advanced masking interface (currently available only in SIMD16 width)
-    //=======================================================================
-};
-#endif // #if 0
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
deleted file mode 100644
index 3ef847d4ca4..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
+++ /dev/null
@@ -1,457 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-#pragma once
-
-#if !defined(__cplusplus)
-#error C++ compilation required
-#endif
-
-#include <immintrin.h>
-#include <inttypes.h>
-#include <stdint.h>
-
-#define SIMD_ARCH_AVX 0 // architecture ids are ordered so they can be compared with >=
-#define SIMD_ARCH_AVX2 1
-#define SIMD_ARCH_AVX512 2
-
-#if !defined(SIMD_ARCH) // the build may pre-define SIMD_ARCH; otherwise default to AVX
-#define SIMD_ARCH SIMD_ARCH_AVX
-#endif
-
-#if defined(_MSC_VER) // MSVC spellings of the calling-convention / inline / alignment helpers
-#define SIMDCALL __vectorcall
-#define SIMDINLINE __forceinline
-#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
-#else // other compilers: no special calling convention, GCC-style alignment attribute
-#define SIMDCALL
-#define SIMDINLINE inline
-#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
-#endif
-
-// For documentation, please see the following include...
-// #include "simdlib_interface.hpp"
-
-namespace SIMDImpl
-{
-    enum class CompareType // float-compare predicates; encodings 0x00-0x1F follow the _CMP_* immediates of the AVX compare intrinsics
-    {
-        EQ_OQ    = 0x00, // Equal (ordered, nonsignaling)
-        LT_OS    = 0x01, // Less-than (ordered, signaling)
-        LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
-        UNORD_Q  = 0x03, // Unordered (nonsignaling)
-        NEQ_UQ   = 0x04, // Not-equal (unordered, nonsignaling)
-        NLT_US   = 0x05, // Not-less-than (unordered, signaling)
-        NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
-        ORD_Q    = 0x07, // Ordered (nonsignaling)
-        EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
-        NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
-        NGT_US   = 0x0A, // Not-greater-than (unordered, signaling)
-        FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
-        NEQ_OQ   = 0x0C, // Not-equal (ordered, non-signaling)
-        GE_OS    = 0x0D, // Greater-than-or-equal (ordered, signaling)
-        GT_OS    = 0x0E, // Greater-than (ordered, signaling)
-        TRUE_UQ  = 0x0F, // True (unordered, non-signaling)
-        EQ_OS    = 0x10, // Equal (ordered, signaling)
-        LT_OQ    = 0x11, // Less-than (ordered, nonsignaling)
-        LE_OQ    = 0x12, // Less-than-or-equal (ordered, nonsignaling)
-        UNORD_S  = 0x13, // Unordered (signaling)
-        NEQ_US   = 0x14, // Not-equal (unordered, signaling)
-        NLT_UQ   = 0x15, // Not-less-than (unordered, nonsignaling)
-        NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
-        ORD_S    = 0x17, // Ordered (signaling)
-        EQ_US    = 0x18, // Equal (unordered, signaling)
-        NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
-        NGT_UQ   = 0x1A, // Not-greater-than (unordered, nonsignaling)
-        FALSE_OS = 0x1B, // False (ordered, signaling)
-        NEQ_OS   = 0x1C, // Not-equal (ordered, signaling)
-        GE_OQ    = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
-        GT_OQ    = 0x1E, // Greater-than (ordered, nonsignaling)
-        TRUE_US  = 0x1F, // True (unordered, signaling)
-    };
-
-#if SIMD_ARCH >= SIMD_ARCH_AVX512
-    enum class CompareTypeInt // integer-compare predicates, taken directly from the _MM_CMPINT_* constants (only defined for AVX512 builds)
-    {
-        EQ = _MM_CMPINT_EQ, // Equal
-        LT = _MM_CMPINT_LT, // Less than
-        LE = _MM_CMPINT_LE, // Less than or Equal
-        NE = _MM_CMPINT_NE, // Not Equal
-        GE = _MM_CMPINT_GE, // Greater than or Equal
-        GT = _MM_CMPINT_GT, // Greater than
-    };
-#endif // SIMD_ARCH >= SIMD_ARCH_AVX512
-
-    enum class ScaleFactor // offset multiplier; each enumerator's numeric value is the factor itself
-    {
-        SF_1 = 1, // No scaling
-        SF_2 = 2, // Scale offset by 2
-        SF_4 = 4, // Scale offset by 4
-        SF_8 = 8, // Scale offset by 8
-    };
-
-    enum class RoundMode // rounding-control values: a direction (0x00-0x04) OR'd with an exception flag (0x00 / 0x08)
-    {
-        TO_NEAREST_INT = 0x00, // Round to nearest integer (NOTE(review): hardware mode 0 rounds ties to even, so this is not exactly TRUNCATE(value + 0.5))
-        TO_NEG_INF     = 0x01, // Round to negative infinity
-        TO_POS_INF     = 0x02, // Round to positive infinity
-        TO_ZERO        = 0x03, // Round to 0 a.k.a. truncate
-        CUR_DIRECTION  = 0x04, // Round in direction set in MXCSR register
-
-        RAISE_EXC = 0x00, // Raise exception on overflow
-        NO_EXC    = 0x08, // Suppress exceptions
-
-        NINT        = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC), // named direction|exception combinations follow
-        NINT_NOEXC  = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
-        FLOOR       = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
-        FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
-        CEIL        = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
-        CEIL_NOEXC  = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
-        TRUNC       = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
-        TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
-        RINT        = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
-        NEARBYINT   = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
-    };
-
-    // Re-exports the namespace-level enums as nested type names.
-    struct Traits
-    {
-        typedef SIMDImpl::CompareType CompareType;
-        typedef SIMDImpl::ScaleFactor ScaleFactor;
-        typedef SIMDImpl::RoundMode   RoundMode;
-    };
-
-    // Attribute, 4-dimensional attribute in SIMD SOA layout
-    // Four-component attribute in SIMD SOA layout: one SIMD value per component.
-    // The same storage can be viewed as float (v / x,y,z,w), integer (vi) or double (vd).
-    template <typename Float, typename Integer, typename Double>
-    union Vec4
-    {
-        Float   v[4];
-        Integer vi[4];
-        Double  vd[4];
-        struct
-        {
-            Float x, y, z, w;
-        };
-
-        // Component access through the float view.
-        SIMDINLINE Float& SIMDCALL operator[](const int i) { return v[i]; }
-        SIMDINLINE Float const& SIMDCALL operator[](const int i) const { return v[i]; }
-
-        // Copies the four components through the float view.
-        SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const& in)
-        {
-            for (int comp = 0; comp < 4; ++comp)
-            {
-                v[comp] = in.v[comp];
-            }
-            return *this;
-        }
-    };
-
-    namespace SIMD128Impl
-    {
-        // Wrapper around a native __m128; converts implicitly to and from the raw type.
-        union Float
-        {
-            SIMDALIGN(__m128, 16) v;
-
-            SIMDINLINE Float() = default;
-            SIMDINLINE Float(__m128 raw) : v(raw) {}
-
-            SIMDINLINE Float& SIMDCALL operator=(__m128 raw) { v = raw; return *this; }
-            SIMDINLINE Float& SIMDCALL operator=(Float const& rhs) { v = rhs.v; return *this; }
-
-            SIMDINLINE SIMDCALL operator __m128() const { return v; }
-        };
-
-        // Wrapper around a native __m128i; converts implicitly to and from the raw type.
-        union Integer
-        {
-            SIMDALIGN(__m128i, 16) v;
-
-            SIMDINLINE Integer() = default;
-            SIMDINLINE Integer(__m128i raw) : v(raw) {}
-
-            SIMDINLINE Integer& SIMDCALL operator=(__m128i raw) { v = raw; return *this; }
-            SIMDINLINE Integer& SIMDCALL operator=(Integer const& rhs) { v = rhs.v; return *this; }
-
-            SIMDINLINE SIMDCALL operator __m128i() const { return v; }
-        };
-
-        // Wrapper around a native __m128d; converts implicitly to and from the raw type.
-        union Double
-        {
-            SIMDALIGN(__m128d, 16) v;
-
-            SIMDINLINE Double() = default;
-            SIMDINLINE Double(__m128d raw) : v(raw) {}
-
-            SIMDINLINE Double& SIMDCALL operator=(__m128d raw) { v = raw; return *this; }
-            SIMDINLINE Double& SIMDCALL operator=(Double const& rhs) { v = rhs.v; return *this; }
-
-            SIMDINLINE SIMDCALL operator __m128d() const { return v; }
-        };
-
-        typedef SIMDImpl::Vec4<Float, Integer, Double> Vec4;
-        typedef uint8_t                                Mask;
-
-        static const uint32_t SIMD_WIDTH = 4;
-    } // namespace SIMD128Impl
-
-    namespace SIMD256Impl
-    {
-        // Wrapper around a native __m256, also addressable as two SIMD128 halves (v4).
-        union Float
-        {
-            SIMDALIGN(__m256, 32) v;
-            SIMD128Impl::Float v4[2];
-
-            SIMDINLINE Float() = default;
-            SIMDINLINE Float(__m256 raw) : v(raw) {}
-            // Assemble from a low and a high 128-bit half; the high half defaults to zero.
-            SIMDINLINE Float(SIMD128Impl::Float const& in_lo,
-                             SIMD128Impl::Float const& in_hi = _mm_setzero_ps()) :
-                v(_mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1))
-            {
-            }
-
-            SIMDINLINE Float& SIMDCALL operator=(__m256 raw) { v = raw; return *this; }
-            SIMDINLINE Float& SIMDCALL operator=(Float const& rhs) { v = rhs.v; return *this; }
-
-            SIMDINLINE SIMDCALL operator __m256() const { return v; }
-        };
-
-        // Wrapper around a native __m256i, also addressable as two SIMD128 halves (v4).
-        union Integer
-        {
-            SIMDALIGN(__m256i, 32) v;
-            SIMD128Impl::Integer v4[2];
-
-            SIMDINLINE Integer() = default;
-            SIMDINLINE Integer(__m256i raw) : v(raw) {}
-            // Assemble from a low and a high 128-bit half; the high half defaults to zero.
-            SIMDINLINE Integer(SIMD128Impl::Integer const& in_lo,
-                               SIMD128Impl::Integer const& in_hi = _mm_setzero_si128()) :
-                v(_mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1))
-            {
-            }
-
-            SIMDINLINE Integer& SIMDCALL operator=(__m256i raw) { v = raw; return *this; }
-            SIMDINLINE Integer& SIMDCALL operator=(Integer const& rhs) { v = rhs.v; return *this; }
-
-            SIMDINLINE SIMDCALL operator __m256i() const { return v; }
-        };
-
-        // Wrapper around a native __m256d, also addressable as two SIMD128 halves (v4).
-        union Double
-        {
-            SIMDALIGN(__m256d, 32) v;
-            SIMD128Impl::Double v4[2];
-
-            SIMDINLINE Double() = default;
-            SIMDINLINE Double(__m256d const& raw) : v(raw) {}
-            // Assemble from a low and a high 128-bit half; the high half defaults to zero.
-            SIMDINLINE Double(SIMD128Impl::Double const& in_lo,
-                              SIMD128Impl::Double const& in_hi = _mm_setzero_pd()) :
-                v(_mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1))
-            {
-            }
-
-            SIMDINLINE Double& SIMDCALL operator=(__m256d raw) { v = raw; return *this; }
-            SIMDINLINE Double& SIMDCALL operator=(Double const& rhs) { v = rhs.v; return *this; }
-
-            SIMDINLINE SIMDCALL operator __m256d() const { return v; }
-        };
-
-        typedef SIMDImpl::Vec4<Float, Integer, Double> Vec4;
-        typedef uint8_t                                Mask;
-
-        static const uint32_t SIMD_WIDTH = 8;
-    } // namespace SIMD256Impl
-
-    namespace SIMD512Impl
-    {
-#if !(defined(__AVX512F__) || defined(_ZMMINTRIN_H_INCLUDED))
-        // Define stand-in AVX512 types when immintrin.h did not provide them.
-        // All data members of these types are ONLY to be viewed
-        // in a debugger.  Do NOT access them via code!
-        union __m512
-        {
-        private:
-            float m512_f32[16];
-        };
-        struct __m512d
-        {
-        private:
-            double m512d_f64[8];
-        };
-
-        union __m512i
-        {
-        private:
-            int8_t   m512i_i8[64];
-            int16_t  m512i_i16[32];
-            int32_t  m512i_i32[16];
-            int64_t  m512i_i64[8];
-            uint8_t  m512i_u8[64];
-            uint16_t m512i_u16[32];
-            uint32_t m512i_u32[16];
-            uint64_t m512i_u64[8];
-        };
-
-        using __mmask16 = uint16_t;
-#endif // !(__AVX512F__ || _ZMMINTRIN_H_INCLUDED)
-
-#if defined(__INTEL_COMPILER) || (SIMD_ARCH >= SIMD_ARCH_AVX512)
-#define SIMD_ALIGNMENT_BYTES 64
-#else
-#define SIMD_ALIGNMENT_BYTES 32
-#endif
-
-        union Float // 512-bit float value; v8 exposes the same storage as two SIMD256 halves
-        {
-            SIMDINLINE Float() = default;
-            SIMDINLINE Float(__m512 in) : v(in) {}
-            SIMDINLINE Float(SIMD256Impl::Float const& in_lo,
-                             SIMD256Impl::Float const& in_hi = _mm256_setzero_ps()) // high half defaults to zero
-            {
-                v8[0] = in_lo;
-                v8[1] = in_hi;
-            }
-            SIMDINLINE Float& SIMDCALL operator=(__m512 in)
-            {
-                v = in;
-                return *this;
-            }
-            SIMDINLINE Float& SIMDCALL operator=(Float const& in)
-            {
-#if SIMD_ARCH >= SIMD_ARCH_AVX512
-                v = in.v;
-#else // pre-AVX512 build: copy through the two 256-bit halves instead
-                v8[0] = in.v8[0];
-                v8[1] = in.v8[1];
-#endif
-                return *this;
-            }
-            SIMDINLINE SIMDCALL operator __m512() const { return v; }
-
-            SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
-            SIMD256Impl::Float v8[2];
-        };
-
-        union Integer // 512-bit integer value; v8 exposes the same storage as two SIMD256 halves
-        {
-            SIMDINLINE Integer() = default;
-            SIMDINLINE Integer(__m512i in) : v(in) {}
-            SIMDINLINE Integer(SIMD256Impl::Integer const& in_lo,
-                               SIMD256Impl::Integer const& in_hi = _mm256_setzero_si256()) // high half defaults to zero
-            {
-                v8[0] = in_lo;
-                v8[1] = in_hi;
-            }
-            SIMDINLINE Integer& SIMDCALL operator=(__m512i in)
-            {
-                v = in;
-                return *this;
-            }
-            SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
-            {
-#if SIMD_ARCH >= SIMD_ARCH_AVX512
-                v = in.v;
-#else // pre-AVX512 build: copy through the two 256-bit halves instead
-                v8[0] = in.v8[0];
-                v8[1] = in.v8[1];
-#endif
-                return *this;
-            }
-
-            SIMDINLINE SIMDCALL operator __m512i() const { return v; }
-
-            SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
-            SIMD256Impl::Integer v8[2];
-        };
-
-        union Double // 512-bit double value; v8 exposes the same storage as two SIMD256 halves
-        {
-            SIMDINLINE Double() = default;
-            SIMDINLINE Double(__m512d in) : v(in) {}
-            SIMDINLINE Double(SIMD256Impl::Double const& in_lo,
-                              SIMD256Impl::Double const& in_hi = _mm256_setzero_pd()) // high half defaults to zero
-            {
-                v8[0] = in_lo;
-                v8[1] = in_hi;
-            }
-            SIMDINLINE Double& SIMDCALL operator=(__m512d in)
-            {
-                v = in;
-                return *this;
-            }
-            SIMDINLINE Double& SIMDCALL operator=(Double const& in)
-            {
-#if SIMD_ARCH >= SIMD_ARCH_AVX512
-                v = in.v;
-#else // pre-AVX512 build: copy through the two 256-bit halves instead
-                v8[0] = in.v8[0];
-                v8[1] = in.v8[1];
-#endif
-                return *this;
-            }
-
-            SIMDINLINE SIMDCALL operator __m512d() const { return v; }
-
-            SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
-            SIMD256Impl::Double v8[2];
-        };
-
-        typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64); // Vec4 alias is declared with 64-byte alignment
-        using Mask = __mmask16;
-
-        static const uint32_t SIMD_WIDTH = 16;
-
-#undef SIMD_ALIGNMENT_BYTES // helper macro is only needed for the unions above
-    } // namespace SIMD512Impl
-} // namespace SIMDImpl
diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp
deleted file mode 100644
index 0f5382044c2..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp
+++ /dev/null
@@ -1,299 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-
-#include "common/os.h"
-#include <stdarg.h>
-#include <stdio.h>
-#include <assert.h>
-#include <algorithm>
-#include <mutex>
-
-#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
-
-#if defined(_MSC_VER)
-#pragma comment(lib, "user32.lib")
-#endif // _MSC_VER
-
-namespace ConsoleUtils
-{
-    // Foreground colours.  The primaries use different bit values per platform:
-    // on Windows the value is cast to a console attribute WORD, elsewhere it is
-    // added to an ANSI colour base (see SetTextColor).
-    enum class TextColor
-    {
-        BLACK = 0,
-#if defined(_WIN32)
-        RED   = 4,
-        GREEN = 2,
-        BLUE  = 1,
-#else
-        RED   = 1,
-        GREEN = 2,
-        BLUE  = 4,
-#endif // _WIN32
-        // Composite colours are the bitwise OR of their primaries.
-        PURPLE = RED | BLUE,
-        CYAN   = GREEN | BLUE,
-        YELLOW = RED | GREEN,
-        WHITE  = RED | GREEN | BLUE,
-    };
-
-    // NORMAL or high-intensity rendering of a TextColor.
-    enum class TextStyle
-    {
-        NORMAL    = 0,
-        INTENSITY = 1,
-    };
-
-    // Switch the text colour/style used for subsequent output on `stream`.
-    // Windows: only stderr and stdout are handled; any other stream is left untouched.
-    // Elsewhere: an ANSI escape sequence is written to the stream.
-    void SetTextColor(FILE*     stream,
-                      TextColor color = TextColor::WHITE,
-                      TextStyle style = TextStyle::NORMAL)
-    {
-        const bool highIntensity = (style == TextStyle::INTENSITY);
-
-#if defined(_WIN32)
-
-        DWORD stdHandleId;
-        if (stream == stderr)
-        {
-            stdHandleId = STD_ERROR_HANDLE;
-        }
-        else if (stream == stdout)
-        {
-            stdHandleId = STD_OUTPUT_HANDLE;
-        }
-        else
-        {
-            // Not a console stream, do nothing
-            return;
-        }
-
-        HANDLE hConsole = GetStdHandle(stdHandleId);
-        WORD   attrs    = static_cast<WORD>(color);
-        if (highIntensity)
-        {
-            attrs |= FOREGROUND_INTENSITY;
-        }
-        SetConsoleTextAttribute(hConsole, attrs);
-
-#else // !_WIN32
-
-        // ANSI foreground codes start at 30 (normal) or 90 (high intensity).
-        const uint32_t ansiBase = highIntensity ? 90 : 30;
-        const uint32_t ansiCode = ansiBase + static_cast<uint32_t>(color);
-        fprintf(stream, "\033[0m\033[%d;%dm", static_cast<uint32_t>(style), ansiCode);
-
-#endif
-    }
-
-    // Restore the default text attributes on `stream`.
-    void ResetTextColor(FILE* stream)
-    {
-#if defined(_WIN32)
-        // The default arguments select normal-intensity white.
-        SetTextColor(stream);
-#else // !_WIN32
-        // ANSI "reset all attributes"
-        fprintf(stream, "\033[0m");
-#endif
-    }
-
-    // Serializes the multi-part messages written to stderr by SwrAssert / SwrTrace.
-    static std::mutex g_stderrMutex;
-} // namespace ConsoleUtils
-
-// Report a failed assertion.
-//
-// Writes the source location, the failed expression, the enclosing function and an
-// optional printf-style message to stderr while holding g_stderrMutex.  On Windows the
-// same text is also sent to the debugger output and, when assert dialogs are enabled,
-// a message box lets the user disable the assert, break into the debugger or continue.
-//
-//   chkDebugger  Windows only (unused elsewhere): when true and no dialog is shown,
-//                the result is true only if a debugger is attached.
-//   enabled      Per-assert flag; set to false when the user picks "Cancel".
-//   pExpression  Text of the failed expression.
-//   pFileName / lineNum / pFunction  Source location of the assert.
-//   pFmtString   Optional printf-style message (may be null) followed by its arguments.
-//
-// Returns true for "Try Again" (break into the debugger) and when a Windows message could
-// not be formatted, false for "Cancel"/"Continue"; otherwise see the non-dialog branches.
-bool SwrAssert(bool        chkDebugger,
-               bool&       enabled,
-               const char* pExpression,
-               const char* pFileName,
-               uint32_t    lineNum,
-               const char* pFunction,
-               const char* pFmtString,
-               ...)
-{
-    using namespace ConsoleUtils;
-    std::lock_guard<std::mutex> l(g_stderrMutex);
-
-    SetTextColor(stderr, TextColor::CYAN, TextStyle::NORMAL);
-
-    // lineNum is unsigned, so it is printed with %u rather than %d.
-    fprintf(stderr, "%s(%u): ", pFileName, lineNum);
-
-    SetTextColor(stderr, TextColor::RED, TextStyle::INTENSITY);
-
-    fprintf(stderr, "ASSERT: %s\n", pExpression);
-
-    SetTextColor(stderr, TextColor::CYAN, TextStyle::INTENSITY);
-    fprintf(stderr, "\t%s\n", pFunction);
-
-    if (pFmtString)
-    {
-        SetTextColor(stderr, TextColor::YELLOW, TextStyle::INTENSITY);
-        fprintf(stderr, "\t");
-        va_list args;
-        va_start(args, pFmtString);
-        vfprintf(stderr, pFmtString, args);
-        va_end(args);
-        fprintf(stderr, "\n");
-    }
-    ResetTextColor(stderr);
-    fflush(stderr);
-
-#if defined(_WIN32)
-    static const int MAX_MESSAGE_LEN = 2048;
-    char             msgBuf[MAX_MESSAGE_LEN];
-
-    // _TRUNCATE clips over-long text.  The plain sprintf_s / count==size forms instead invoke
-    // the CRT invalid-parameter handler on overflow, which made the truncation handling
-    // below (forced '\n' + terminator, negative-result checks) unreachable.
-    _snprintf_s(msgBuf,
-                sizeof(msgBuf),
-                _TRUNCATE,
-                "%s(%u): ASSERT: %s\n",
-                pFileName,
-                lineNum,
-                pExpression);
-    msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
-    msgBuf[MAX_MESSAGE_LEN - 1] = 0;
-    OutputDebugStringA(msgBuf);
-
-    _snprintf_s(msgBuf, sizeof(msgBuf), _TRUNCATE, "\t%s\n", pFunction);
-    msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
-    msgBuf[MAX_MESSAGE_LEN - 1] = 0;
-    OutputDebugStringA(msgBuf);
-
-    // Length of the user message left at the start of msgBuf; the dialog text is appended
-    // after it.
-    int offset = 0;
-
-    if (pFmtString)
-    {
-        va_list args;
-        va_start(args, pFmtString);
-        offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), _TRUNCATE, pFmtString, args);
-        va_end(args);
-
-        // Negative means the message was truncated or could not be formatted.
-        if (offset < 0)
-        {
-            return true;
-        }
-
-        OutputDebugStringA("\t");
-        OutputDebugStringA(msgBuf);
-        OutputDebugStringA("\n");
-    }
-
-    if (enabled && KNOB_ENABLE_ASSERT_DIALOGS)
-    {
-        int retval = _snprintf_s(&msgBuf[offset],
-                                 MAX_MESSAGE_LEN - offset,
-                                 _TRUNCATE,
-                                 "\n\n"
-                                 "File: %s\n"
-                                 "Line: %u\n"
-                                 "\n"
-                                 "Expression: %s\n\n"
-                                 "Cancel: Disable this assert for the remainder of the process\n"
-                                 "Try Again: Break into the debugger\n"
-                                 "Continue: Continue execution (but leave assert enabled)",
-                                 pFileName,
-                                 lineNum,
-                                 pExpression);
-
-        if (retval < 0)
-        {
-            return true;
-        }
-
-        offset += retval;
-
-        if (!IsDebuggerPresent())
-        {
-            // Best effort: the warning is simply clipped if it does not fit.
-            _snprintf_s(&msgBuf[offset],
-                        MAX_MESSAGE_LEN - offset,
-                        _TRUNCATE,
-                        "\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a "
-                        "program crash!");
-        }
-
-        retval = MessageBoxA(nullptr,
-                             msgBuf,
-                             "Assert Failed",
-                             MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION | MB_SETFOREGROUND);
-
-        switch (retval)
-        {
-        case IDCANCEL:
-            enabled = false;
-            return false;
-
-        case IDTRYAGAIN:
-            return true;
-
-        case IDCONTINUE:
-            return false;
-
-        default:
-            // Any other result (e.g. the message box failed): fall through to `return enabled`.
-            break;
-        }
-    }
-    else
-    {
-        return (IsDebuggerPresent() || !chkDebugger) && enabled;
-    }
-#endif // _WIN32
-
-    return enabled;
-}
-
-// Emit a trace message.
-//
-// Writes the source location and an optional printf-style message to stderr while
-// holding g_stderrMutex; on Windows the same text is also sent to the debugger output.
-//
-//   pFileName / lineNum / pFunction  Source location of the trace call.
-//   pFmtString                       Optional printf-style message (may be null)
-//                                    followed by its arguments.
-void SwrTrace(
-    const char* pFileName, uint32_t lineNum, const char* pFunction, const char* pFmtString, ...)
-{
-    using namespace ConsoleUtils;
-    std::lock_guard<std::mutex> l(g_stderrMutex);
-
-    SetTextColor(stderr, TextColor::CYAN, TextStyle::NORMAL);
-
-    // lineNum is unsigned, so it is printed with %u rather than %d.
-    fprintf(stderr, "%s(%u): TRACE in %s:\n", pFileName, lineNum, pFunction);
-
-    if (pFmtString)
-    {
-        SetTextColor(stderr, TextColor::PURPLE, TextStyle::INTENSITY);
-        fprintf(stderr, "\t");
-        va_list args;
-        va_start(args, pFmtString);
-        vfprintf(stderr, pFmtString, args);
-        va_end(args);
-        fprintf(stderr, "\n");
-    }
-    ResetTextColor(stderr);
-    fflush(stderr);
-
-#if defined(_WIN32)
-    static const int MAX_MESSAGE_LEN = 2048;
-    char             msgBuf[MAX_MESSAGE_LEN];
-
-    // _TRUNCATE clips over-long text; sprintf_s would invoke the CRT invalid-parameter
-    // handler instead, so the forced '\n' + terminator below could never take effect.
-    _snprintf_s(msgBuf,
-                sizeof(msgBuf),
-                _TRUNCATE,
-                "%s(%u): TRACE in %s\n",
-                pFileName,
-                lineNum,
-                pFunction);
-    msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
-    msgBuf[MAX_MESSAGE_LEN - 1] = 0;
-    OutputDebugStringA(msgBuf);
-
-    int offset = 0;
-
-    if (pFmtString)
-    {
-        va_list args;
-        va_start(args, pFmtString);
-        offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), _TRUNCATE, pFmtString, args);
-        va_end(args);
-
-        // Negative means the message was truncated or could not be formatted.
-        if (offset < 0)
-        {
-            return;
-        }
-
-        OutputDebugStringA("\t");
-        OutputDebugStringA(msgBuf);
-        OutputDebugStringA("\n");
-    }
-#endif // _WIN32
-}
-
-#endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h
deleted file mode 100644
index cd9854f2549..00000000000
--- a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h
+++ /dev/null
@@ -1,242 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-
-#ifndef __SWR_ASSERT_H__
-#define __SWR_ASSERT_H__
-
-#if !defined(__SWR_OS_H__)
-#error swr_assert.h should not be included directly, please include "common/os.h" instead.
-#endif
-
-//=============================================================================
-//
-// MACROS defined in this file:
-//
-// - SWR_ASSUME(expression, ...):   Tell compiler that the expression is true.
-//                                  Helps with static code analysis as well.
-//                                  DO NOT USE if code after this dynamically
-//                                  checks for errors and handles them.  The
-//                                  compiler may optimize out the error check.
-//
-// - SWR_ASSERT(expression, ...):   Inform the user if expression is false.
-//                                  This check is only conditionally made,
-//                                  usually only in debug mode.
-//
-// - SWR_REL_ASSERT(expression, ...): Unconditionally enabled version of SWR_ASSERT
-//
-// - SWR_ASSUME_ASSERT(expression, ...): Conditionally enabled SWR_ASSERT.  Uses
-//                                       SWR_ASSUME if SWR_ASSERT is disabled.
-//                                       DO NOT USE in combination with actual
-//                                       error checking (see SWR_ASSUME)
-//
-// - SWR_REL_ASSUME_ASSERT(expression, ...): Same as SWR_REL_ASSERT when release asserts are enabled; uses SWR_ASSUME otherwise.
-//
-//=============================================================================
-
-// Stupid preprocessor tricks to avoid -Wall / -W4 warnings
-#if defined(_MSC_VER)
-#define _SWR_WARN_DISABLE __pragma(warning(push)) __pragma(warning(disable : 4127))
-#define _SWR_WARN_RESTORE __pragma(warning(pop))
-#else // ! MSVC compiler
-#define _SWR_WARN_DISABLE
-#define _SWR_WARN_RESTORE
-#endif
-
-#define _SWR_MACRO_START \
-    do                   \
-    {
-#define _SWR_MACRO_END \
-    _SWR_WARN_DISABLE  \
-    }                  \
-    while (0)          \
-    _SWR_WARN_RESTORE
-
-#if defined(_MSC_VER)
-#define SWR_ASSUME(e, ...)        \
-    _SWR_MACRO_START __assume(e); \
-    _SWR_MACRO_END
-#elif defined(__clang__)
-#define SWR_ASSUME(e, ...)                \
-    _SWR_MACRO_START __builtin_assume(e); \
-    _SWR_MACRO_END
-#elif defined(__GNUC__)
-#define SWR_ASSUME(e, ...)                                       \
-    _SWR_MACRO_START((e) ? ((void)0) : __builtin_unreachable()); \
-    _SWR_MACRO_END
-#else
-#define SWR_ASSUME(e, ...)      \
-    _SWR_MACRO_START ASSUME(e); \
-    _SWR_MACRO_END
-#endif
-
-#if !defined(SWR_ENABLE_ASSERTS)
-
-#if !defined(NDEBUG)
-#define SWR_ENABLE_ASSERTS 1
-#else
-#define SWR_ENABLE_ASSERTS 0
-#endif // !NDEBUG
-
-#endif // SWR_ENABLE_ASSERTS
-
-#if !defined(SWR_ENABLE_REL_ASSERTS)
-#define SWR_ENABLE_REL_ASSERTS 1
-#endif
-
-#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
-#include "assert.h"
-
-#if !defined(__cplusplus)
-
-#pragma message("C++ is required for SWR Asserts, falling back to assert.h")
-
-#if SWR_ENABLE_ASSERTS
-#define SWR_ASSERT(e, ...) assert(e)
-#endif
-
-#if SWR_ENABLE_REL_ASSERTS
-#define SWR_REL_ASSERT(e, ...) assert(e)
-#endif
-
-#else
-
-bool SwrAssert(bool        chkDebugger,
-               bool&       enabled,
-               const char* pExpression,
-               const char* pFileName,
-               uint32_t    lineNum,
-               const char* function,
-               const char* pFmtString = nullptr,
-               ...);
-
-void SwrTrace(
-    const char* pFileName, uint32_t lineNum, const char* function, const char* pFmtString, ...);
-
-#define _SWR_ASSERT(chkDebugger, e, ...)                                                                            \
-    _SWR_MACRO_START                                                                                                \
-    bool expFailed = !(e);                                                                                          \
-    if (expFailed)                                                                                                  \
-    {                                                                                                               \
-        static bool swrAssertEnabled = true;                                                                        \
-        expFailed                    = SwrAssert(                                                                   \
-            chkDebugger, swrAssertEnabled, #e, __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \
-        if (expFailed)                                                                                              \
-        {                                                                                                           \
-            DEBUGBREAK;                                                                                             \
-        }                                                                                                           \
-    }                                                                                                               \
-    _SWR_MACRO_END
-
-#define _SWR_INVALID(chkDebugger, ...)                                                                     \
-    _SWR_MACRO_START                                                                                       \
-    static bool swrAssertEnabled = true;                                                                   \
-    bool        expFailed        = SwrAssert(                                                              \
-        chkDebugger, swrAssertEnabled, "", __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \
-    if (expFailed)                                                                                         \
-    {                                                                                                      \
-        DEBUGBREAK;                                                                                        \
-    }                                                                                                      \
-    _SWR_MACRO_END
-
-#define _SWR_TRACE(_fmtstr, ...) SwrTrace(__FILE__, __LINE__, __FUNCTION__, _fmtstr, ##__VA_ARGS__);
-
-#if SWR_ENABLE_ASSERTS
-#define SWR_ASSERT(e, ...) _SWR_ASSERT(true, e, ##__VA_ARGS__)
-#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSERT(e, ##__VA_ARGS__)
-#define SWR_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__)
-#endif // SWR_ENABLE_ASSERTS
-
-#if SWR_ENABLE_REL_ASSERTS
-#define SWR_REL_ASSERT(e, ...) _SWR_ASSERT(false, e, ##__VA_ARGS__)
-#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_REL_ASSERT(e, ##__VA_ARGS__)
-#define SWR_REL_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__)
-
-// SWR_INVALID is enabled whenever SWR_ENABLE_REL_ASSERTS is set (the default)
-// Funky handling to allow 0 arguments with g++/gcc
-// This is needed because you can't "swallow commas" with ##__VA_ARGS__ unless
-// there is a first argument to the macro.  So having a macro that can optionally
-// accept 0 arguments is tricky.
-#define _SWR_INVALID_0() _SWR_INVALID(false)
-#define _SWR_INVALID_1(...) _SWR_INVALID(false, ##__VA_ARGS__)
-#define _SWR_INVALID_VARGS_(_10, _9, _8, _7, _6, _5, _4, _3, _2, _1, N, ...) N
-#define _SWR_INVALID_VARGS(...) _SWR_INVALID_VARGS_(__VA_ARGS__, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
-#define _SWR_INVALID_VARGS_0() 1, 2, 3, 4, 5, 6, 7, 9, 9, 10
-#define _SWR_INVALID_CONCAT_(a, b) a##b
-#define _SWR_INVALID_CONCAT(a, b) _SWR_INVALID_CONCAT_(a, b)
-#define SWR_INVALID(...)                                                                       \
-    _SWR_INVALID_CONCAT(_SWR_INVALID_, _SWR_INVALID_VARGS(_SWR_INVALID_VARGS_0 __VA_ARGS__())) \
-    (__VA_ARGS__)
-
-#define SWR_STATIC_ASSERT(expression, ...) \
-    static_assert((expression), "Failed:\n    " #expression "\n    " __VA_ARGS__);
-
-#endif // SWR_ENABLE_REL_ASSERTS
-
-#endif // C++
-
-#endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
-
-// Needed to allow passing bitfield members to sizeof() in disabled asserts
-template <typename T>
-static bool SwrSizeofWorkaround(T)
-{
-    return false;
-}
-
-#if !SWR_ENABLE_ASSERTS
-#define SWR_ASSERT(e, ...)                                 \
-    _SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \
-    _SWR_MACRO_END
-#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__)
-#define SWR_TRACE(_fmtstr, ...) \
-    _SWR_MACRO_START(void)(0);  \
-    _SWR_MACRO_END
-#endif
-
-#if !SWR_ENABLE_REL_ASSERTS
-#define SWR_REL_ASSERT(e, ...)                             \
-    _SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \
-    _SWR_MACRO_END
-#define SWR_INVALID(...)       \
-    _SWR_MACRO_START(void)(0); \
-    _SWR_MACRO_END
-#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__)
-#define SWR_REL_TRACE(_fmtstr, ...) \
-    _SWR_MACRO_START(void)(0);      \
-    _SWR_MACRO_END
-#define SWR_STATIC_ASSERT(e, ...)                           \
-    _SWR_MACRO_START(void)  sizeof(SwrSizeofWorkaround(e)); \
-    _SWR_MACRO_END
-#endif
-
-#if defined(_MSC_VER)
-#define SWR_FUNCTION_DECL __FUNCSIG__
-#elif (defined(__GNUC__) || defined(__clang__))
-#define SWR_FUNCTION_DECL __PRETTY_FUNCTION__
-#else
-#define SWR_FUNCTION_DECL __FUNCTION__
-#endif
-
-#define SWR_NOT_IMPL SWR_INVALID("%s not implemented", SWR_FUNCTION_DECL)
-
-#endif //__SWR_ASSERT_H__
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
deleted file mode 100644
index bee257d7723..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ /dev/null
@@ -1,1802 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file api.cpp
- *
- * @brief API implementation
- *
- ******************************************************************************/
-
-#include <cfloat>
-#include <cmath>
-#include <cstdio>
-#include <new>
-
-#include "core/api.h"
-#include "core/backend.h"
-#include "core/context.h"
-#include "core/depthstencil.h"
-#include "core/frontend.h"
-#include "core/rasterizer.h"
-#include "core/rdtsc_core.h"
-#include "core/threads.h"
-#include "core/tilemgr.h"
-#include "core/clip.h"
-#include "core/utils.h"
-#include "core/tileset.h"
-
-#include "common/os.h"
-
-static const SWR_RECT g_MaxScissorRect = {0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y};
-
-void SetupDefaultState(SWR_CONTEXT* pContext);
-
-static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
-{
-    return (SWR_CONTEXT*)hContext;
-}
-
-void WakeAllThreads(SWR_CONTEXT* pContext)
-{
-    pContext->FifosNotEmpty.notify_all();
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Create SWR Context.
-/// @param pCreateInfo - pointer to creation info.
-HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo)
-{
-    void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
-    memset(pContextMem, 0, sizeof(SWR_CONTEXT));
-    SWR_CONTEXT* pContext = new (pContextMem) SWR_CONTEXT();
-
-    pContext->privateStateSize = pCreateInfo->privateStateSize;
-
-    // initialize callback functions
-    pContext->pfnLoadTile                = pCreateInfo->pfnLoadTile;
-    pContext->pfnStoreTile               = pCreateInfo->pfnStoreTile;
-    pContext->pfnTranslateGfxptrForRead  = pCreateInfo->pfnTranslateGfxptrForRead;
-    pContext->pfnTranslateGfxptrForWrite = pCreateInfo->pfnTranslateGfxptrForWrite;
-    pContext->pfnMakeGfxPtr              = pCreateInfo->pfnMakeGfxPtr;
-    pContext->pfnCreateMemoryContext     = pCreateInfo->pfnCreateMemoryContext;
-    pContext->pfnDestroyMemoryContext    = pCreateInfo->pfnDestroyMemoryContext;
-    pContext->pfnUpdateSoWriteOffset     = pCreateInfo->pfnUpdateSoWriteOffset;
-    pContext->pfnUpdateStats             = pCreateInfo->pfnUpdateStats;
-    pContext->pfnUpdateStatsFE           = pCreateInfo->pfnUpdateStatsFE;
-    pContext->pfnUpdateStreamOut         = pCreateInfo->pfnUpdateStreamOut;
-
-
-    pContext->hExternalMemory = pCreateInfo->hExternalMemory;
-
-    pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT;
-    if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0)
-    {
-        pContext->MAX_DRAWS_IN_FLIGHT = pCreateInfo->MAX_DRAWS_IN_FLIGHT;
-    }
-
-    pContext->dcRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
-    pContext->dsRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
-
-    pContext->pMacroTileManagerArray =
-        (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
-    pContext->pDispatchQueueArray =
-        (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
-
-    for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
-    {
-        pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
-        new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
-        new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
-
-        pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
-    }
-
-    if (pCreateInfo->pThreadInfo)
-    {
-        pContext->threadInfo = *pCreateInfo->pThreadInfo;
-    }
-    else
-    {
-        pContext->threadInfo.MAX_WORKER_THREADS      = KNOB_MAX_WORKER_THREADS;
-        pContext->threadInfo.BASE_NUMA_NODE          = KNOB_BASE_NUMA_NODE;
-        pContext->threadInfo.BASE_CORE               = KNOB_BASE_CORE;
-        pContext->threadInfo.BASE_THREAD             = KNOB_BASE_THREAD;
-        pContext->threadInfo.MAX_NUMA_NODES          = KNOB_MAX_NUMA_NODES;
-        pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
-        pContext->threadInfo.MAX_THREADS_PER_CORE    = KNOB_MAX_THREADS_PER_CORE;
-        pContext->threadInfo.SINGLE_THREADED         = KNOB_SINGLE_THREADED;
-    }
-
-    if (pCreateInfo->pApiThreadInfo)
-    {
-        pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo;
-    }
-    else
-    {
-        pContext->apiThreadInfo.bindAPIThread0        = true;
-        pContext->apiThreadInfo.numAPIReservedThreads = 1;
-        pContext->apiThreadInfo.numAPIThreadsPerCore  = 1;
-    }
-
-    if (pCreateInfo->pWorkerPrivateState)
-    {
-        pContext->workerPrivateState = *pCreateInfo->pWorkerPrivateState;
-    }
-
-    memset((void*)&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
-    memset((void*)&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
-    new (&pContext->WaitLock) std::mutex();
-    new (&pContext->FifosNotEmpty) std::condition_variable();
-
-    CreateThreadPool(pContext, &pContext->threadPool);
-
-    if (pContext->apiThreadInfo.bindAPIThread0)
-    {
-        BindApiThread(pContext, 0);
-    }
-
-    if (pContext->threadInfo.SINGLE_THREADED)
-    {
-        pContext->pSingleThreadLockedTiles = new TileSet();
-    }
-
-    pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
-    pContext->pStats =
-        (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
-
-#if defined(KNOB_ENABLE_AR)
-    // Setup ArchRast thread contexts which includes +1 for API thread.
-    pContext->pArContext = new HANDLE[pContext->NumWorkerThreads + 1];
-    pContext->pArContext[pContext->NumWorkerThreads] =
-        ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API);
-#endif
-
-#if defined(KNOB_ENABLE_RDTSC)
-    pContext->pBucketMgr = new BucketManager(pCreateInfo->contextName);
-    RDTSC_RESET(pContext->pBucketMgr);
-    RDTSC_INIT(pContext->pBucketMgr, 0);
-#endif
-
-    // Allocate scratch space for workers.
-    ///@note We could lazily allocate this but it's a rather small amount of memory.
-    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
-    {
-#if defined(_WIN32)
-        uint32_t numaNode =
-            pContext->threadPool.pThreadData ? pContext->threadPool.pThreadData[i].numaId : 0;
-        pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(GetCurrentProcess(),
-                                                              nullptr,
-                                                              KNOB_WORKER_SCRATCH_SPACE_SIZE,
-                                                              MEM_RESERVE | MEM_COMMIT,
-                                                              PAGE_READWRITE,
-                                                              numaNode);
-#else
-        pContext->ppScratch[i] =
-            (uint8_t*)AlignedMalloc(KNOB_WORKER_SCRATCH_SPACE_SIZE, KNOB_SIMD_WIDTH * 4);
-#endif
-
-#if defined(KNOB_ENABLE_AR)
-        // Initialize worker thread context for ArchRast.
-        pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER);
-
-        SWR_WORKER_DATA* pWorkerData = (SWR_WORKER_DATA*)pContext->threadPool.pThreadData[i].pWorkerPrivateData;
-        pWorkerData->hArContext = pContext->pArContext[i];
-#endif
-
-
-    }
-
-#if defined(KNOB_ENABLE_AR)
-    // cache the API thread event manager, for use with sim layer
-    pCreateInfo->hArEventManager = pContext->pArContext[pContext->NumWorkerThreads];
-#endif
-
-    // State setup AFTER context is fully initialized
-    SetupDefaultState(pContext);
-
-    // initialize hot tile manager
-    pContext->pHotTileMgr = new HotTileMgr();
-
-    // pass pointer to bucket manager back to caller
-#ifdef KNOB_ENABLE_RDTSC
-    pCreateInfo->pBucketMgr = pContext->pBucketMgr;
-#endif
-
-    pCreateInfo->contextSaveSize = sizeof(API_STATE);
-
-    StartThreadPool(pContext, &pContext->threadPool);
-
-    return (HANDLE)pContext;
-}
-
-void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
-{
-    memcpy((void*)&dst.state, (void*)&src.state, sizeof(API_STATE));
-}
-
-template <bool IsDraw>
-void QueueWork(SWR_CONTEXT* pContext)
-{
-    DRAW_CONTEXT* pDC     = pContext->pCurDrawContext;
-    uint32_t      dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
-
-    if (IsDraw)
-    {
-        pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
-        pDC->pTileMgr->initialize();
-    }
-
-    // Each worker thread looks at a DC for both FE and BE work at different times, so
-    // threadsDone starts at NumFEThreads + NumBEThreads.  When the threadsDone counter has
-    // reached 0 then all workers have moved past this DC. (i.e. Each worker has checked this
-    // DC for both FE and BE work and then moved on if all work is done.)
-    pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
-
-    if (IsDraw)
-    {
-        InterlockedIncrement(&pContext->drawsOutstandingFE);
-    }
-
-    _ReadWriteBarrier();
-    {
-        std::unique_lock<std::mutex> lock(pContext->WaitLock);
-        pContext->dcRing.Enqueue();
-    }
-
-    if (pContext->threadInfo.SINGLE_THREADED)
-    {
-        uint32_t mxcsr = SetOptimalVectorCSR();
-
-        if (IsDraw)
-        {
-            uint32_t curDraw[2] = {pContext->pCurDrawContext->drawId,
-                                   pContext->pCurDrawContext->drawId};
-            WorkOnFifoFE(pContext, 0, curDraw[0]);
-            WorkOnFifoBE(pContext, 0, curDraw[1], *pContext->pSingleThreadLockedTiles, 0, 0);
-        }
-        else
-        {
-            uint32_t curDispatch = pContext->pCurDrawContext->drawId;
-            WorkOnCompute(pContext, 0, curDispatch);
-        }
-
-        // Dequeue the work here, if not already done, since we're single threaded (i.e. no
-        // workers).
-        while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0)
-        {
-        }
-
-        // restore csr
-        RestoreVectorCSR(mxcsr);
-    }
-    else
-    {
-        RDTSC_BEGIN(pContext->pBucketMgr, APIDrawWakeAllThreads, pDC->drawId);
-        WakeAllThreads(pContext);
-        RDTSC_END(pContext->pBucketMgr, APIDrawWakeAllThreads, 1);
-    }
-
-    // Set current draw context to NULL so that next state call forces a new draw context to be
-    // created and populated.
-    pContext->pPrevDrawContext = pContext->pCurDrawContext;
-    pContext->pCurDrawContext  = nullptr;
-}
-
-INLINE void QueueDraw(SWR_CONTEXT* pContext)
-{
-    QueueWork<true>(pContext);
-}
-
-INLINE void QueueDispatch(SWR_CONTEXT* pContext)
-{
-    QueueWork<false>(pContext);
-}
-
-DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT* pContext, bool isSplitDraw = false)
-{
-    RDTSC_BEGIN(pContext->pBucketMgr, APIGetDrawContext, 0);
-    // If current draw context is null then need to obtain a new draw context to use from ring.
-    if (pContext->pCurDrawContext == nullptr)
-    {
-        // Need to wait for a free entry.
-        while (pContext->dcRing.IsFull())
-        {
-            _mm_pause();
-        }
-
-        uint64_t curDraw = pContext->dcRing.GetHead();
-        uint32_t dcIndex = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
-
-        if ((pContext->frameCount - pContext->lastFrameChecked) > 2 ||
-            (curDraw - pContext->lastDrawChecked) > 0x10000)
-        {
-            // Take this opportunity to clean-up old arena allocations
-            pContext->cachingArenaAllocator.FreeOldBlocks();
-
-            pContext->lastFrameChecked = pContext->frameCount;
-            pContext->lastDrawChecked  = curDraw;
-        }
-
-        DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
-        pContext->pCurDrawContext     = pCurDrawContext;
-
-        // Assign next available entry in DS ring to this DC.
-        uint32_t dsIndex        = pContext->curStateId % pContext->MAX_DRAWS_IN_FLIGHT;
-        pCurDrawContext->pState = &pContext->dsRing[dsIndex];
-
-        // Copy previous state to current state.
-        if (pContext->pPrevDrawContext)
-        {
-            DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
-
-            // If we're splitting our draw then we can just use the same state from the previous
-            // draw. In this case, we won't increment the DS ring index so the next non-split
-            // draw can receive the state.
-            if (isSplitDraw == false)
-            {
-                CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
-
-                // Should have been cleaned up previously
-                SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
-
-                pCurDrawContext->pState->pPrivateState = nullptr;
-
-                pContext->curStateId++; // Progress state ring index forward.
-            }
-            else
-            {
-                // If it's a split draw then just copy the state pointer over
-                // since it's the same draw.
-                pCurDrawContext->pState = pPrevDrawContext->pState;
-                SWR_ASSERT(pPrevDrawContext->cleanupState == false);
-            }
-        }
-        else
-        {
-            SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
-            pContext->curStateId++; // Progress state ring index forward.
-        }
-
-        SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
-
-        // Reset dependency
-        pCurDrawContext->dependent   = false;
-        pCurDrawContext->dependentFE = false;
-
-        pCurDrawContext->pContext  = pContext;
-        pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
-
-        pCurDrawContext->doneFE                         = false;
-        pCurDrawContext->FeLock                         = 0;
-        pCurDrawContext->threadsDone                    = 0;
-        pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr;
-
-        pCurDrawContext->dynState.Reset(pContext->NumWorkerThreads);
-
-        // Assign unique drawId for this DC
-        pCurDrawContext->drawId = pContext->dcRing.GetHead();
-
-        pCurDrawContext->cleanupState = true;
-    }
-    else
-    {
-        SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
-    }
-
-    RDTSC_END(pContext->pBucketMgr, APIGetDrawContext, 0);
-    return pContext->pCurDrawContext;
-}
-
-API_STATE* GetDrawState(SWR_CONTEXT* pContext)
-{
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
-    SWR_ASSERT(pDC->pState != nullptr);
-
-    return &pDC->pState->state;
-}
-
-void SwrDestroyContext(HANDLE hContext)
-{
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-
-    pDC->FeWork.type    = SHUTDOWN;
-    pDC->FeWork.pfnWork = ProcessShutdown;
-
-    // enqueue
-    QueueDraw(pContext);
-
-    DestroyThreadPool(pContext, &pContext->threadPool);
-
-    // free the fifos
-    for (uint32_t i = 0; i < pContext->MAX_DRAWS_IN_FLIGHT; ++i)
-    {
-        AlignedFree(pContext->dcRing[i].dynState.pStats);
-        delete pContext->dcRing[i].pArena;
-        delete pContext->dsRing[i].pArena;
-        pContext->pMacroTileManagerArray[i].~MacroTileMgr();
-        pContext->pDispatchQueueArray[i].~DispatchQueue();
-    }
-
-    AlignedFree(pContext->pDispatchQueueArray);
-    AlignedFree(pContext->pMacroTileManagerArray);
-
-    // Free scratch space.
-    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
-    {
-#if defined(_WIN32)
-        VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE);
-#else
-        AlignedFree(pContext->ppScratch[i]);
-#endif
-
-#if defined(KNOB_ENABLE_AR)
-        ArchRast::DestroyThreadContext(pContext->pArContext[i]);
-#endif
-    }
-
-#if defined(KNOB_ENABLE_RDTSC)
-    delete pContext->pBucketMgr;
-#endif
-
-    delete[] pContext->ppScratch;
-    AlignedFree(pContext->pStats);
-
-    delete pContext->pHotTileMgr;
-    delete pContext->pSingleThreadLockedTiles;
-
-    pContext->~SWR_CONTEXT();
-    AlignedFree(GetContext(hContext));
-}
-
-void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId)
-{
-    SWR_CONTEXT* pContext = GetContext(hContext);
-    BindApiThread(pContext, apiThreadId);
-}
-
-void SWR_API SwrSaveState(HANDLE hContext, void* pOutputStateBlock, size_t memSize)
-{
-    SWR_CONTEXT* pContext = GetContext(hContext);
-    auto         pSrc     = GetDrawState(pContext);
-    assert(pOutputStateBlock && memSize >= sizeof(*pSrc));
-
-    memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
-}
-
-void SWR_API SwrRestoreState(HANDLE hContext, const void* pStateBlock, size_t memSize)
-{
-    SWR_CONTEXT* pContext = GetContext(hContext);
-    auto         pDst     = GetDrawState(pContext);
-    assert(pStateBlock && memSize >= sizeof(*pDst));
-
-    memcpy((void*)pDst, (void*)pStateBlock, sizeof(*pDst));
-}
-
-void SetupDefaultState(SWR_CONTEXT* pContext)
-{
-    API_STATE* pState = GetDrawState(pContext);
-
-    pState->rastState.cullMode     = SWR_CULLMODE_NONE;
-    pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
-
-    pState->depthBoundsState.depthBoundsTestEnable   = false;
-    pState->depthBoundsState.depthBoundsTestMinValue = 0.0f;
-    pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f;
-}
-
-void SWR_API SwrSync(HANDLE            hContext,
-                     PFN_CALLBACK_FUNC pfnFunc,
-                     uint64_t          userData,
-                     uint64_t          userData2,
-                     uint64_t          userData3)
-{
-    SWR_ASSERT(pfnFunc != nullptr);
-
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-
-    RDTSC_BEGIN(pContext->pBucketMgr, APISync, 0);
-
-    pDC->FeWork.type    = SYNC;
-    pDC->FeWork.pfnWork = ProcessSync;
-
-    // Setup callback function
-    pDC->retireCallback.pfnCallbackFunc = pfnFunc;
-    pDC->retireCallback.userData        = userData;
-    pDC->retireCallback.userData2       = userData2;
-    pDC->retireCallback.userData3       = userData3;
-
-    AR_API_EVENT(SwrSyncEvent(pDC->drawId));
-
-    // enqueue
-    QueueDraw(pContext);
-
-    RDTSC_END(pContext->pBucketMgr, APISync, 1);
-}
-
-void SwrStallBE(HANDLE hContext)
-{
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-
-    pDC->dependent = true;
-}
-
-void SwrWaitForIdle(HANDLE hContext)
-{
-    SWR_CONTEXT* pContext = GetContext(hContext);
-
-    RDTSC_BEGIN(pContext->pBucketMgr, APIWaitForIdle, 0);
-
-    while (!pContext->dcRing.IsEmpty())
-    {
-        _mm_pause();
-    }
-
-    RDTSC_END(pContext->pBucketMgr, APIWaitForIdle, 1);
-}
-
-void SwrWaitForIdleFE(HANDLE hContext)
-{
-    SWR_CONTEXT* pContext = GetContext(hContext);
-
-    RDTSC_BEGIN(pContext->pBucketMgr, APIWaitForIdle, 0);
-
-    while (pContext->drawsOutstandingFE > 0)
-    {
-        _mm_pause();
-    }
-
-    RDTSC_END(pContext->pBucketMgr, APIWaitForIdle, 1);
-}
-
-void SwrSetVertexBuffers(HANDLE                         hContext,
-                         uint32_t                       numBuffers,
-                         const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-
-    for (uint32_t i = 0; i < numBuffers; ++i)
-    {
-        const SWR_VERTEX_BUFFER_STATE* pVB = &pVertexBuffers[i];
-        pState->vertexBuffers[pVB->index]  = *pVB;
-    }
-}
-
-void SwrSetIndexBuffer(HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-
-    pState->indexBuffer = *pIndexBuffer;
-}
-
-void SwrSetFetchFunc(HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-
-    pState->pfnFetchFunc = pfnFetchFunc;
-}
-
-void SwrSetSoFunc(HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-
-    SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
-
-    pState->pfnSoFunc[streamIndex] = pfnSoFunc;
-}
-
-void SwrSetSoState(HANDLE hContext, SWR_STREAMOUT_STATE* pSoState)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-
-    pState->soState = *pSoState;
-}
-
-void SwrSetSoBuffers(HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-
-    SWR_ASSERT((slot < MAX_SO_STREAMS), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
-
-    // remember buffer status in case of future resume StreamOut
-    if ((pState->soBuffer[slot].pBuffer != 0) && (pSoBuffer->pBuffer == 0))
-	pState->soPausedBuffer[slot] = pState->soBuffer[slot];
-
-    // resume
-    if (pState->soPausedBuffer[slot].pBuffer == pSoBuffer->pBuffer)
-	pState->soBuffer[slot] = pState->soPausedBuffer[slot];
-    else
-        pState->soBuffer[slot] = *pSoBuffer;
-}
-
-void SwrSetVertexFunc(HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-
-    pState->pfnVertexFunc = pfnVertexFunc;
-}
-
-void SwrSetFrontendState(HANDLE hContext, SWR_FRONTEND_STATE* pFEState)
-{
-    API_STATE* pState     = GetDrawState(GetContext(hContext));
-    pState->frontendState = *pFEState;
-}
-
-void SwrSetGsState(HANDLE hContext, SWR_GS_STATE* pGSState)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-    pState->gsState   = *pGSState;
-}
-
-void SwrSetGsFunc(HANDLE hContext, PFN_GS_FUNC pfnGsFunc)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-    pState->pfnGsFunc = pfnGsFunc;
-}
-
-void SwrSetCsFunc(HANDLE      hContext,
-                  PFN_CS_FUNC pfnCsFunc,
-                  uint32_t    totalThreadsInGroup,
-                  uint32_t    totalSpillFillSize,
-                  uint32_t    scratchSpaceSizePerWarp,
-                  uint32_t    numWarps)
-{
-    API_STATE* pState               = GetDrawState(GetContext(hContext));
-    pState->pfnCsFunc               = pfnCsFunc;
-    pState->totalThreadsInGroup     = totalThreadsInGroup;
-    pState->totalSpillFillSize      = totalSpillFillSize;
-    pState->scratchSpaceSizePerWarp = scratchSpaceSizePerWarp;
-    pState->scratchSpaceNumWarps    = numWarps;
-}
-
-void SwrSetTsState(HANDLE hContext, SWR_TS_STATE* pState)
-{
-    API_STATE* pApiState = GetDrawState(GetContext(hContext));
-    pApiState->tsState   = *pState;
-}
-
-void SwrSetHsFunc(HANDLE hContext, PFN_HS_FUNC pfnFunc)
-{
-    API_STATE* pApiState = GetDrawState(GetContext(hContext));
-    pApiState->pfnHsFunc = pfnFunc;
-}
-
-void SwrSetDsFunc(HANDLE hContext, PFN_DS_FUNC pfnFunc)
-{
-    API_STATE* pApiState = GetDrawState(GetContext(hContext));
-    pApiState->pfnDsFunc = pfnFunc;
-}
-
-void SwrSetDepthStencilState(HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pDSState)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-
-    pState->depthStencilState = *pDSState;
-}
-
-void SwrSetBackendState(HANDLE hContext, SWR_BACKEND_STATE* pBEState)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-
-    pState->backendState = *pBEState;
-}
-
-void SwrSetDepthBoundsState(HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pDBState)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-
-    pState->depthBoundsState = *pDBState;
-}
-
-void SwrSetPixelShaderState(HANDLE hContext, SWR_PS_STATE* pPSState)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-    pState->psState   = *pPSState;
-}
-
-void SwrSetBlendState(HANDLE hContext, SWR_BLEND_STATE* pBlendState)
-{
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-    memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
-}
-
-void SwrSetBlendFunc(HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc)
-{
-    SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
-    API_STATE* pState                  = GetDrawState(GetContext(hContext));
-    pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
-}
-
-// update guardband multipliers for the viewport
-void updateGuardbands(API_STATE* pState)
-{
-    uint32_t numGbs = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
-
-    for (uint32_t i = 0; i < numGbs; ++i)
-    {
-        // guardband center is viewport center
-        pState->gbState.left[i]   = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
-        pState->gbState.right[i]  = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
-        pState->gbState.top[i]    = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
-        pState->gbState.bottom[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
-    }
-}
-
-void SwrSetRastState(HANDLE hContext, const SWR_RASTSTATE* pRastState)
-{
-    SWR_CONTEXT* pContext = GetContext(hContext);
-    API_STATE*   pState   = GetDrawState(pContext);
-
-    memcpy((void*)&pState->rastState, (void*)pRastState, sizeof(SWR_RASTSTATE));
-}
-
-void SwrSetViewports(HANDLE                       hContext,
-                     uint32_t                     numViewports,
-                     const SWR_VIEWPORT*          pViewports,
-                     const SWR_VIEWPORT_MATRICES* pMatrices)
-{
-    SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of viewports.");
-
-    SWR_CONTEXT* pContext = GetContext(hContext);
-    API_STATE*   pState   = GetDrawState(pContext);
-
-    memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
-    // @todo Faster to copy portions of the SOA or just copy all of it?
-    memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES));
-}
-
-void SwrSetScissorRects(HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors)
-{
-    SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of scissor rects.");
-
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-    memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(pScissors[0]));
-};
-
-void SetupMacroTileScissors(DRAW_CONTEXT* pDC)
-{
-    API_STATE* pState = &pDC->pState->state;
-    uint32_t numScissors =
-        pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
-    pState->scissorsTileAligned = true;
-
-    for (uint32_t index = 0; index < numScissors; ++index)
-    {
-        SWR_RECT& scissorInFixedPoint = pState->scissorsInFixedPoint[index];
-
-        // Set up scissor dimensions based on scissor or viewport
-        if (pState->rastState.scissorEnable)
-        {
-            scissorInFixedPoint = pState->scissorRects[index];
-        }
-        else
-        {
-            // the vp width and height must be added to origin un-rounded then the result round to
-            // -inf. The cast to int works for rounding assuming all [left, right, top, bottom] are
-            // positive.
-            scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x;
-            scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width);
-            scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y;
-            scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height);
-        }
-
-        // Clamp to max rect
-        scissorInFixedPoint &= g_MaxScissorRect;
-
-        // Test for tile alignment
-        bool tileAligned;
-        tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
-        tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0;
-        tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0;
-        tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0;
-
-        pState->scissorsTileAligned &= tileAligned;
-
-        // Scale to fixed point
-        scissorInFixedPoint.xmin *= FIXED_POINT_SCALE;
-        scissorInFixedPoint.xmax *= FIXED_POINT_SCALE;
-        scissorInFixedPoint.ymin *= FIXED_POINT_SCALE;
-        scissorInFixedPoint.ymax *= FIXED_POINT_SCALE;
-
-        // Make scissor inclusive
-        scissorInFixedPoint.xmax -= 1;
-        scissorInFixedPoint.ymax -= 1;
-    }
-}
-
-
-// templated backend function tables
-
-void SetupPipeline(DRAW_CONTEXT* pDC)
-{
-    DRAW_STATE*          pState       = pDC->pState;
-    const SWR_RASTSTATE& rastState    = pState->state.rastState;
-    const SWR_PS_STATE&  psState      = pState->state.psState;
-    BACKEND_FUNCS&       backendFuncs = pState->backendFuncs;
-
-    // setup backend
-    if (psState.pfnPixelShader == nullptr)
-    {
-        backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
-    }
-    else
-    {
-        const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
-        const bool     bMultisampleEnable =
-            ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0;
-        const uint32_t centroid =
-            ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
-        const uint32_t canEarlyZ =
-            (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 1 : 0;
-        SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
-
-        // select backend function
-        switch (psState.shadingRate)
-        {
-        case SWR_SHADING_RATE_PIXEL:
-            if (bMultisampleEnable)
-            {
-                // always need to generate I & J per sample for Z interpolation
-                barycentricsMask =
-                    (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-                backendFuncs.pfnBackend =
-                    gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern]
-                                          [psState.inputCoverage][centroid][forcedSampleCount]
-                                          [canEarlyZ]
-                    ;
-            }
-            else
-            {
-                // always need to generate I & J per pixel for Z interpolation
-                barycentricsMask =
-                    (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
-                backendFuncs.pfnBackend =
-                    gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ];
-            }
-            break;
-        case SWR_SHADING_RATE_SAMPLE:
-            SWR_ASSERT(rastState.bIsCenterPattern != true);
-            // always need to generate I & J per sample for Z interpolation
-            barycentricsMask =
-                (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-            backendFuncs.pfnBackend =
-                gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid]
-                                       [canEarlyZ];
-            break;
-        default:
-            SWR_ASSERT(0 && "Invalid shading rate");
-            break;
-        }
-    }
-
-    SWR_ASSERT(backendFuncs.pfnBackend);
-
-    PFN_PROCESS_PRIMS pfnBinner;
-#if USE_SIMD16_FRONTEND
-    PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;
-#endif
-    switch (pState->state.topology)
-    {
-    case TOP_POINT_LIST:
-        pState->pfnProcessPrims = ClipPoints;
-        pfnBinner               = BinPoints;
-#if USE_SIMD16_FRONTEND
-        pState->pfnProcessPrims_simd16 = ClipPoints_simd16;
-        pfnBinner_simd16               = BinPoints_simd16;
-#endif
-        break;
-    case TOP_LINE_LIST:
-    case TOP_LINE_STRIP:
-    case TOP_LINE_LOOP:
-    case TOP_LINE_LIST_ADJ:
-    case TOP_LISTSTRIP_ADJ:
-        pState->pfnProcessPrims = ClipLines;
-        pfnBinner               = BinLines;
-#if USE_SIMD16_FRONTEND
-        pState->pfnProcessPrims_simd16 = ClipLines_simd16;
-        pfnBinner_simd16               = BinLines_simd16;
-#endif
-        break;
-    default:
-        pState->pfnProcessPrims = ClipTriangles;
-        pfnBinner               = GetBinTrianglesFunc((rastState.conservativeRast > 0));
-#if USE_SIMD16_FRONTEND
-        pState->pfnProcessPrims_simd16 = ClipTriangles_simd16;
-        pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0));
-#endif
-        break;
-    };
-
-
-    // Disable clipper if viewport transform is disabled or if clipper is disabled
-    if (pState->state.frontendState.vpTransformDisable || !pState->state.rastState.clipEnable)
-    {
-        pState->pfnProcessPrims = pfnBinner;
-#if USE_SIMD16_FRONTEND
-        pState->pfnProcessPrims_simd16 = pfnBinner_simd16;
-#endif
-    }
-
-    // Disable rasterizer and backend if no pixel, no depth/stencil, and no attributes
-    if ((pState->state.psState.pfnPixelShader == nullptr) &&
-        (pState->state.depthStencilState.depthTestEnable == FALSE) &&
-        (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
-        (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
-        (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
-        (pState->state.backendState.numAttributes == 0))
-    {
-        pState->pfnProcessPrims = nullptr;
-#if USE_SIMD16_FRONTEND
-        pState->pfnProcessPrims_simd16 = nullptr;
-#endif
-    }
-
-    if (pState->state.soState.rasterizerDisable == true)
-    {
-        pState->pfnProcessPrims = nullptr;
-#if USE_SIMD16_FRONTEND
-        pState->pfnProcessPrims_simd16 = nullptr;
-#endif
-    }
-
-
-    // set up the frontend attribute count
-    pState->state.feNumAttributes         = 0;
-    const SWR_BACKEND_STATE& backendState = pState->state.backendState;
-    if (backendState.swizzleEnable)
-    {
-        // attribute swizzling is enabled, iterate over the map and record the max attribute used
-        for (uint32_t i = 0; i < backendState.numAttributes; ++i)
-        {
-            pState->state.feNumAttributes =
-                std::max(pState->state.feNumAttributes,
-                         (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1);
-        }
-    }
-    else
-    {
-        pState->state.feNumAttributes = pState->state.backendState.numAttributes;
-    }
-
-    if (pState->state.soState.soEnable)
-    {
-        uint64_t streamMasks = 0;
-        for (uint32_t i = 0; i < 4; ++i)
-        {
-            streamMasks |= pState->state.soState.streamMasks[i];
-        }
-
-        unsigned long maxAttrib;
-        if (_BitScanReverse64(&maxAttrib, streamMasks))
-        {
-            pState->state.feNumAttributes =
-                std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1));
-        }
-    }
-
-    // complicated logic to test for cases where we don't need backing hottile memory for a draw
-    // have to check for the special case where depth/stencil test is enabled but depthwrite is
-    // disabled.
-    pState->state.depthHottileEnable =
-        ((!(pState->state.depthStencilState.depthTestEnable &&
-            !pState->state.depthStencilState.depthWriteEnable &&
-            !pState->state.depthBoundsState.depthBoundsTestEnable &&
-            pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
-         (pState->state.depthStencilState.depthTestEnable ||
-          pState->state.depthStencilState.depthWriteEnable ||
-          pState->state.depthBoundsState.depthBoundsTestEnable))
-            ? true
-            : false;
-
-    pState->state.stencilHottileEnable =
-        (((!(pState->state.depthStencilState.stencilTestEnable &&
-             !pState->state.depthStencilState.stencilWriteEnable &&
-             pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
-          // for stencil we have to check the double sided state as well
-          (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
-             !pState->state.depthStencilState.stencilWriteEnable &&
-             pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
-         (pState->state.depthStencilState.stencilTestEnable ||
-          pState->state.depthStencilState.stencilWriteEnable))
-            ? true
-            : false;
-
-    uint32_t hotTileEnable = pState->state.psState.renderTargetMask;
-
-    // Disable hottile for surfaces with no writes
-    if (psState.pfnPixelShader != nullptr)
-    {
-        unsigned long rt;
-        uint32_t rtMask = pState->state.psState.renderTargetMask;
-        while (_BitScanForward(&rt, rtMask))
-        {
-            rtMask &= ~(1 << rt);
-
-            if (pState->state.blendState.renderTarget[rt].writeDisableAlpha &&
-                pState->state.blendState.renderTarget[rt].writeDisableRed &&
-                pState->state.blendState.renderTarget[rt].writeDisableGreen &&
-                pState->state.blendState.renderTarget[rt].writeDisableBlue)
-            {
-                hotTileEnable &= ~(1 << rt);
-            }
-        }
-    }
-
-    pState->state.colorHottileEnable = hotTileEnable;
-
-    // Setup depth quantization function
-    if (pState->state.depthHottileEnable)
-    {
-        switch (pState->state.rastState.depthFormat)
-        {
-        case R32_FLOAT_X8X24_TYPELESS:
-            pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT_X8X24_TYPELESS>;
-            break;
-        case R32_FLOAT:
-            pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
-            break;
-        case R24_UNORM_X8_TYPELESS:
-            pState->state.pfnQuantizeDepth = QuantizeDepth<R24_UNORM_X8_TYPELESS>;
-            break;
-        case R16_UNORM:
-            pState->state.pfnQuantizeDepth = QuantizeDepth<R16_UNORM>;
-            break;
-        default:
-            SWR_INVALID("Unsupported depth format for depth quantization.");
-            pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
-        }
-    }
-    else
-    {
-        // set up pass-through quantize if depth isn't enabled
-        pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
-    }
-
-    // Generate guardbands
-    updateGuardbands(&pState->state);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief InitDraw
-/// @param pDC - Draw context to initialize for this draw.
-void InitDraw(DRAW_CONTEXT* pDC, bool isSplitDraw)
-{
-    // We don't need to re-setup the scissors/pipeline state again for split draw.
-    if (isSplitDraw == false)
-    {
-        SetupMacroTileScissors(pDC);
-        SetupPipeline(pDC);
-    }
-
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief We can split the draw for certain topologies for better performance.
-/// @param totalVerts - Total vertices for draw
-/// @param topology - Topology used for draw
-uint32_t MaxVertsPerDraw(DRAW_CONTEXT* pDC, uint32_t totalVerts, PRIMITIVE_TOPOLOGY topology)
-{
-    API_STATE& state = pDC->pState->state;
-
-    // We can not split draws that have streamout enabled because there is no practical way
-    // to support multiple threads generating SO data for a single set of buffers.
-    if (state.soState.soEnable)
-    {
-        return totalVerts;
-    }
-
-    // The Primitive Assembly code can only handle 1 RECT at a time. Specified with only 3 verts.
-    if (topology == TOP_RECT_LIST)
-    {
-        return 3;
-    }
-
-    // Is split drawing disabled?
-    if (KNOB_DISABLE_SPLIT_DRAW)
-    {
-        return totalVerts;
-    }
-
-    uint32_t vertsPerDraw = totalVerts;
-
-    switch (topology)
-    {
-    case TOP_POINT_LIST:
-    case TOP_TRIANGLE_LIST:
-        vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
-        break;
-
-    case TOP_PATCHLIST_1:
-    case TOP_PATCHLIST_2:
-    case TOP_PATCHLIST_3:
-    case TOP_PATCHLIST_4:
-    case TOP_PATCHLIST_5:
-    case TOP_PATCHLIST_6:
-    case TOP_PATCHLIST_7:
-    case TOP_PATCHLIST_8:
-    case TOP_PATCHLIST_9:
-    case TOP_PATCHLIST_10:
-    case TOP_PATCHLIST_11:
-    case TOP_PATCHLIST_12:
-    case TOP_PATCHLIST_13:
-    case TOP_PATCHLIST_14:
-    case TOP_PATCHLIST_15:
-    case TOP_PATCHLIST_16:
-    case TOP_PATCHLIST_17:
-    case TOP_PATCHLIST_18:
-    case TOP_PATCHLIST_19:
-    case TOP_PATCHLIST_20:
-    case TOP_PATCHLIST_21:
-    case TOP_PATCHLIST_22:
-    case TOP_PATCHLIST_23:
-    case TOP_PATCHLIST_24:
-    case TOP_PATCHLIST_25:
-    case TOP_PATCHLIST_26:
-    case TOP_PATCHLIST_27:
-    case TOP_PATCHLIST_28:
-    case TOP_PATCHLIST_29:
-    case TOP_PATCHLIST_30:
-    case TOP_PATCHLIST_31:
-    case TOP_PATCHLIST_32:
-        if (pDC->pState->state.tsState.tsEnable)
-        {
-            uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
-            vertsPerDraw          = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
-        }
-        break;
-    default:
-        // We are not splitting up draws for other topologies.
-        break;
-    }
-
-    return vertsPerDraw;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief DrawInstanced
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param topology - Specifies topology for draw.
-/// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
-/// @param startVertex - Specifies start vertex for draw. (vertex data)
-/// @param numInstances - How many instances to render.
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer
-/// (instanced data)
-void DrawInstanced(HANDLE             hContext,
-                   PRIMITIVE_TOPOLOGY topology,
-                   uint32_t           numVertices,
-                   uint32_t           startVertex,
-                   uint32_t           numInstances  = 1,
-                   uint32_t           startInstance = 0)
-{
-    if (KNOB_TOSS_DRAW)
-    {
-        return;
-    }
-
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-
-    RDTSC_BEGIN(pContext->pBucketMgr, APIDraw, pDC->drawId);
-
-    uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
-    uint32_t primsPerDraw    = GetNumPrims(topology, maxVertsPerDraw);
-    uint32_t remainingVerts  = numVertices;
-
-    API_STATE* pState  = &pDC->pState->state;
-    pState->topology   = topology;
-    pState->forceFront = false;
-
-    // disable culling for points/lines
-    uint32_t oldCullMode = pState->rastState.cullMode;
-    if (topology == TOP_POINT_LIST)
-    {
-        pState->rastState.cullMode = SWR_CULLMODE_NONE;
-        pState->forceFront         = true;
-    }
-    else if (topology == TOP_RECT_LIST)
-    {
-        pState->rastState.cullMode = SWR_CULLMODE_NONE;
-    }
-
-    int draw = 0;
-    while (remainingVerts)
-    {
-        uint32_t numVertsForDraw =
-            (remainingVerts < maxVertsPerDraw) ? remainingVerts : maxVertsPerDraw;
-
-        bool          isSplitDraw = (draw > 0) ? !KNOB_DISABLE_SPLIT_DRAW : false;
-        DRAW_CONTEXT* pDC         = GetDrawContext(pContext, isSplitDraw);
-        InitDraw(pDC, isSplitDraw);
-
-        pDC->FeWork.type                    = DRAW;
-        pDC->FeWork.pfnWork                 = GetProcessDrawFunc(false, // IsIndexed
-                                                 false, // bEnableCutIndex
-                                                 pState->tsState.tsEnable,
-                                                 pState->gsState.gsEnable,
-                                                 pState->soState.soEnable,
-                                                 pDC->pState->pfnProcessPrims != nullptr);
-        pDC->FeWork.desc.draw.numVerts      = numVertsForDraw;
-        pDC->FeWork.desc.draw.startVertex   = startVertex;
-        pDC->FeWork.desc.draw.numInstances  = numInstances;
-        pDC->FeWork.desc.draw.startInstance = startInstance;
-        pDC->FeWork.desc.draw.startPrimID   = draw * primsPerDraw;
-        pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
-
-        pDC->cleanupState = (remainingVerts == numVertsForDraw);
-
-        // enqueue DC
-        QueueDraw(pContext);
-
-        AR_API_EVENT(DrawInstancedEvent(pDC->drawId,
-                                        topology,
-                                        numVertsForDraw,
-                                        startVertex,
-                                        numInstances,
-                                        startInstance,
-                                        pState->tsState.tsEnable,
-                                        pState->gsState.gsEnable,
-                                        pState->soState.soEnable,
-                                        pState->gsState.outputTopology,
-                                        draw));
-
-        remainingVerts -= numVertsForDraw;
-        draw++;
-    }
-
-    // restore culling state
-    pDC                                   = GetDrawContext(pContext);
-    pDC->pState->state.rastState.cullMode = oldCullMode;
-
-    RDTSC_END(pContext->pBucketMgr, APIDraw, numVertices * numInstances);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrDraw
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param topology - Specifies topology for draw.
-/// @param startVertex - Specifies start vertex in vertex buffer for draw.
-/// @param primCount - Number of vertices.
-void SwrDraw(HANDLE             hContext,
-             PRIMITIVE_TOPOLOGY topology,
-             uint32_t           startVertex,
-             uint32_t           numVertices)
-{
-    DrawInstanced(hContext, topology, numVertices, startVertex);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrDrawInstanced
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param topology - Specifies topology for draw.
-/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
-/// @param numInstances - How many instances to render.
-/// @param startVertex - Specifies start vertex for draw. (vertex data)
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer
-/// (instanced data)
-void SwrDrawInstanced(HANDLE             hContext,
-                      PRIMITIVE_TOPOLOGY topology,
-                      uint32_t           numVertsPerInstance,
-                      uint32_t           numInstances,
-                      uint32_t           startVertex,
-                      uint32_t           startInstance)
-{
-    DrawInstanced(
-        hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief DrawIndexedInstanced
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param topology - Specifies topology for draw.
-/// @param numIndices - Number of indices to read sequentially from index buffer.
-/// @param indexOffset - Starting index into index buffer.
-/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
-/// @param numInstances - Number of instances to render.
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer
-/// (instanced data)
-void DrawIndexedInstance(HANDLE             hContext,
-                         PRIMITIVE_TOPOLOGY topology,
-                         uint32_t           numIndices,
-                         uint32_t           indexOffset,
-                         int32_t            baseVertex,
-                         uint32_t           numInstances  = 1,
-                         uint32_t           startInstance = 0)
-{
-    if (KNOB_TOSS_DRAW)
-    {
-        return;
-    }
-
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-    API_STATE*    pState   = &pDC->pState->state;
-
-    RDTSC_BEGIN(pContext->pBucketMgr, APIDrawIndexed, pDC->drawId);
-
-    uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
-    uint32_t primsPerDraw      = GetNumPrims(topology, maxIndicesPerDraw);
-    uint32_t remainingIndices  = numIndices;
-
-    uint32_t indexSize = 0;
-    switch (pState->indexBuffer.format)
-    {
-    case R32_UINT:
-        indexSize = sizeof(uint32_t);
-        break;
-    case R16_UINT:
-        indexSize = sizeof(uint16_t);
-        break;
-    case R8_UINT:
-        indexSize = sizeof(uint8_t);
-        break;
-    default:
-        SWR_INVALID("Invalid index buffer format: %d", pState->indexBuffer.format);
-    }
-
-    int      draw = 0;
-    gfxptr_t xpIB = pState->indexBuffer.xpIndices;
-    xpIB += (uint64_t)indexOffset * (uint64_t)indexSize;
-
-    pState->topology   = topology;
-    pState->forceFront = false;
-
-    // disable culling for points/lines
-    uint32_t oldCullMode = pState->rastState.cullMode;
-    if (topology == TOP_POINT_LIST)
-    {
-        pState->rastState.cullMode = SWR_CULLMODE_NONE;
-        pState->forceFront         = true;
-    }
-    else if (topology == TOP_RECT_LIST)
-    {
-        pState->rastState.cullMode = SWR_CULLMODE_NONE;
-    }
-
-    while (remainingIndices)
-    {
-        uint32_t numIndicesForDraw =
-            (remainingIndices < maxIndicesPerDraw) ? remainingIndices : maxIndicesPerDraw;
-
-        // When breaking up draw, we need to obtain new draw context for each iteration.
-        bool isSplitDraw = (draw > 0) ? !KNOB_DISABLE_SPLIT_DRAW : false;
-
-        pDC = GetDrawContext(pContext, isSplitDraw);
-        InitDraw(pDC, isSplitDraw);
-
-        pDC->FeWork.type                 = DRAW;
-        pDC->FeWork.pfnWork              = GetProcessDrawFunc(true, // IsIndexed
-                                                 pState->frontendState.bEnableCutIndex,
-                                                 pState->tsState.tsEnable,
-                                                 pState->gsState.gsEnable,
-                                                 pState->soState.soEnable,
-                                                 pDC->pState->pfnProcessPrims != nullptr);
-        pDC->FeWork.desc.draw.pDC        = pDC;
-        pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
-        pDC->FeWork.desc.draw.xpIB       = xpIB;
-        pDC->FeWork.desc.draw.type       = pDC->pState->state.indexBuffer.format;
-
-        pDC->FeWork.desc.draw.numInstances  = numInstances;
-        pDC->FeWork.desc.draw.startInstance = startInstance;
-        pDC->FeWork.desc.draw.baseVertex    = baseVertex;
-        pDC->FeWork.desc.draw.startPrimID   = draw * primsPerDraw;
-
-        pDC->cleanupState = (remainingIndices == numIndicesForDraw);
-
-        // enqueue DC
-        QueueDraw(pContext);
-
-        AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId,
-                                               topology,
-                                               numIndicesForDraw,
-                                               indexOffset,
-                                               baseVertex,
-                                               numInstances,
-                                               startInstance,
-                                               pState->tsState.tsEnable,
-                                               pState->gsState.gsEnable,
-                                               pState->soState.soEnable,
-                                               pState->gsState.outputTopology,
-                                               draw));
-
-        xpIB += maxIndicesPerDraw * indexSize;
-        remainingIndices -= numIndicesForDraw;
-        draw++;
-    }
-
-    // Restore culling state
-    pDC                                   = GetDrawContext(pContext);
-    pDC->pState->state.rastState.cullMode = oldCullMode;
-
-    RDTSC_END(pContext->pBucketMgr, APIDrawIndexed, numIndices * numInstances);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief DrawIndexed
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param topology - Specifies topology for draw.
-/// @param numIndices - Number of indices to read sequentially from index buffer.
-/// @param indexOffset - Starting index into index buffer.
-/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
-void SwrDrawIndexed(HANDLE             hContext,
-                    PRIMITIVE_TOPOLOGY topology,
-                    uint32_t           numIndices,
-                    uint32_t           indexOffset,
-                    int32_t            baseVertex)
-{
-    DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrDrawIndexedInstanced
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param topology - Specifies topology for draw.
-/// @param numIndices - Number of indices to read sequentially from index buffer.
-/// @param numInstances - Number of instances to render.
-/// @param indexOffset - Starting index into index buffer.
-/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer
-/// (instanced data)
-void SwrDrawIndexedInstanced(HANDLE             hContext,
-                             PRIMITIVE_TOPOLOGY topology,
-                             uint32_t           numIndices,
-                             uint32_t           numInstances,
-                             uint32_t           indexOffset,
-                             int32_t            baseVertex,
-                             uint32_t           startInstance)
-{
-    DrawIndexedInstance(
-        hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrInvalidateTiles
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to
-/// invalidate.
-/// @param invalidateRect - The pixel-coordinate rectangle to invalidate.  This will be expanded to
-///                         be hottile size-aligned.
-void SWR_API SwrInvalidateTiles(HANDLE          hContext,
-                                uint32_t        attachmentMask,
-                                const SWR_RECT& invalidateRect)
-{
-    if (KNOB_TOSS_DRAW)
-    {
-        return;
-    }
-
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-
-    pDC->FeWork.type                                       = DISCARDINVALIDATETILES;
-    pDC->FeWork.pfnWork                                    = ProcessDiscardInvalidateTiles;
-    pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
-    pDC->FeWork.desc.discardInvalidateTiles.rect           = invalidateRect;
-    pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
-    pDC->FeWork.desc.discardInvalidateTiles.newTileState   = SWR_TILE_INVALID;
-    pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
-    pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly  = false;
-
-    // enqueue
-    QueueDraw(pContext);
-
-    AR_API_EVENT(SwrInvalidateTilesEvent(pDC->drawId));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrDiscardRect
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
-/// @param rect - The pixel-coordinate rectangle to discard.  Only fully-covered hottiles will be
-///               discarded.
-void SWR_API SwrDiscardRect(HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect)
-{
-    if (KNOB_TOSS_DRAW)
-    {
-        return;
-    }
-
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-
-    // Queue a load to the hottile
-    pDC->FeWork.type                                       = DISCARDINVALIDATETILES;
-    pDC->FeWork.pfnWork                                    = ProcessDiscardInvalidateTiles;
-    pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
-    pDC->FeWork.desc.discardInvalidateTiles.rect           = rect;
-    pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
-    pDC->FeWork.desc.discardInvalidateTiles.newTileState   = SWR_TILE_RESOLVED;
-    pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
-    pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly  = true;
-
-    // enqueue
-    QueueDraw(pContext);
-
-    AR_API_EVENT(SwrDiscardRectEvent(pDC->drawId));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrDispatch
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param threadGroupCountX - Number of thread groups dispatched in X direction
-/// @param threadGroupCountY - Number of thread groups dispatched in Y direction
-/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
-void SwrDispatch(HANDLE   hContext,
-                 uint32_t threadGroupCountX,
-                 uint32_t threadGroupCountY,
-                 uint32_t threadGroupCountZ
-
-)
-{
-    if (KNOB_TOSS_DRAW)
-    {
-        return;
-    }
-
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-
-    RDTSC_BEGIN(pContext->pBucketMgr, APIDispatch, pDC->drawId);
-    AR_API_EVENT(
-        DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ));
-    pDC->isCompute = true; // This is a compute context.
-
-    COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
-
-    pTaskData->threadGroupCountX = threadGroupCountX;
-    pTaskData->threadGroupCountY = threadGroupCountY;
-    pTaskData->threadGroupCountZ = threadGroupCountZ;
-
-    pTaskData->enableThreadDispatch = false;
-
-    uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
-    uint32_t dcIndex           = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
-    pDC->pDispatch             = &pContext->pDispatchQueueArray[dcIndex];
-    pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE);
-
-    QueueDispatch(pContext);
-    RDTSC_END(pContext->pBucketMgr,
-              APIDispatch,
-              threadGroupCountX * threadGroupCountY * threadGroupCountZ);
-}
-
-// Deswizzles, converts and stores current contents of the hot tiles to surface
-// described by pState
-void SWR_API SwrStoreTiles(HANDLE          hContext,
-                           uint32_t        attachmentMask,
-                           SWR_TILE_STATE  postStoreTileState,
-                           const SWR_RECT& storeRect)
-{
-    if (KNOB_TOSS_DRAW)
-    {
-        return;
-    }
-
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-
-    RDTSC_BEGIN(pContext->pBucketMgr, APIStoreTiles, pDC->drawId);
-
-    pDC->FeWork.type                               = STORETILES;
-    pDC->FeWork.pfnWork                            = ProcessStoreTiles;
-    pDC->FeWork.desc.storeTiles.attachmentMask     = attachmentMask;
-    pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
-    pDC->FeWork.desc.storeTiles.rect               = storeRect;
-    pDC->FeWork.desc.storeTiles.rect &= g_MaxScissorRect;
-
-    // enqueue
-    QueueDraw(pContext);
-
-    AR_API_EVENT(SwrStoreTilesEvent(pDC->drawId));
-
-    RDTSC_END(pContext->pBucketMgr, APIStoreTiles, 1);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear
-/// @param renderTargetArrayIndex - the RT array index to clear
-/// @param clearColor - color use for clearing render targets
-/// @param z - depth value use for clearing depth buffer
-/// @param stencil - stencil value used for clearing stencil buffer
-/// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
-void SWR_API SwrClearRenderTarget(HANDLE          hContext,
-                                  uint32_t        attachmentMask,
-                                  uint32_t        renderTargetArrayIndex,
-                                  const float     clearColor[4],
-                                  float           z,
-                                  uint8_t         stencil,
-                                  const SWR_RECT& clearRect)
-{
-    if (KNOB_TOSS_DRAW)
-    {
-        return;
-    }
-
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-
-    RDTSC_BEGIN(pContext->pBucketMgr, APIClearRenderTarget, pDC->drawId);
-
-    pDC->FeWork.type            = CLEAR;
-    pDC->FeWork.pfnWork         = ProcessClear;
-    pDC->FeWork.desc.clear.rect = clearRect;
-    pDC->FeWork.desc.clear.rect &= g_MaxScissorRect;
-    pDC->FeWork.desc.clear.attachmentMask         = attachmentMask;
-    pDC->FeWork.desc.clear.renderTargetArrayIndex = renderTargetArrayIndex;
-    pDC->FeWork.desc.clear.clearDepth             = z;
-    pDC->FeWork.desc.clear.clearRTColor[0]        = clearColor[0];
-    pDC->FeWork.desc.clear.clearRTColor[1]        = clearColor[1];
-    pDC->FeWork.desc.clear.clearRTColor[2]        = clearColor[2];
-    pDC->FeWork.desc.clear.clearRTColor[3]        = clearColor[3];
-    pDC->FeWork.desc.clear.clearStencil           = stencil;
-
-    // enqueue draw
-    QueueDraw(pContext);
-
-    RDTSC_END(pContext->pBucketMgr, APIClearRenderTarget, 1);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Returns a pointer to the private context state for the current
-///        draw operation. This is used for external componets such as the
-///        sampler.
-///        SWR is responsible for the allocation of the private context state.
-/// @param hContext - Handle passed back from SwrCreateContext
-VOID* SwrGetPrivateContextState(HANDLE hContext)
-{
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-    DRAW_STATE*   pState   = pDC->pState;
-
-    if (pState->pPrivateState == nullptr)
-    {
-        pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize,
-                                                             KNOB_SIMD_WIDTH * sizeof(float));
-    }
-
-    return pState->pPrivateState;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Clients can use this to allocate memory for draw/dispatch
-///        operations. The memory will automatically be freed once operation
-///        has completed. Client can use this to allocate binding tables,
-///        etc. needed for shader execution.
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param size - Size of allocation
-/// @param align - Alignment needed for allocation.
-VOID* SwrAllocDrawContextMemory(HANDLE hContext, uint32_t size, uint32_t align)
-{
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-
-    return pDC->pState->pArena->AllocAligned(size, align);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Enables stats counting
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param enable - If true then counts are incremented.
-void SwrEnableStatsFE(HANDLE hContext, bool enable)
-{
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-
-    pDC->pState->state.enableStatsFE = enable;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Enables stats counting
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param enable - If true then counts are incremented.
-void SwrEnableStatsBE(HANDLE hContext, bool enable)
-{
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-
-    pDC->pState->state.enableStatsBE = enable;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Mark end of frame - used for performance profiling
-/// @param hContext - Handle passed back from SwrCreateContext
-void SWR_API SwrEndFrame(HANDLE hContext)
-{
-    SWR_CONTEXT*  pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
-    (void)pDC; // var used
-
-    RDTSC_ENDFRAME(pContext->pBucketMgr);
-    AR_API_EVENT(FrameEndEvent(pContext->frameCount, pDC->drawId));
-
-    pContext->frameCount++;
-}
-
-void InitSimLoadTilesTable();
-void InitSimStoreTilesTable();
-void InitSimClearTilesTable();
-
-void InitClearTilesTable();
-void InitBackendFuncTables();
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Initialize swr backend and memory internal tables
-void SwrInit()
-{
-    InitClearTilesTable();
-    InitBackendFuncTables();
-    InitRasterizerFunctions();
-}
-
-void SwrGetInterface(SWR_INTERFACE& out_funcs)
-{
-    out_funcs.pfnSwrCreateContext          = SwrCreateContext;
-    out_funcs.pfnSwrDestroyContext         = SwrDestroyContext;
-    out_funcs.pfnSwrBindApiThread          = SwrBindApiThread;
-    out_funcs.pfnSwrSaveState              = SwrSaveState;
-    out_funcs.pfnSwrRestoreState           = SwrRestoreState;
-    out_funcs.pfnSwrSync                   = SwrSync;
-    out_funcs.pfnSwrStallBE                = SwrStallBE;
-    out_funcs.pfnSwrWaitForIdle            = SwrWaitForIdle;
-    out_funcs.pfnSwrWaitForIdleFE          = SwrWaitForIdleFE;
-    out_funcs.pfnSwrSetVertexBuffers       = SwrSetVertexBuffers;
-    out_funcs.pfnSwrSetIndexBuffer         = SwrSetIndexBuffer;
-    out_funcs.pfnSwrSetFetchFunc           = SwrSetFetchFunc;
-    out_funcs.pfnSwrSetSoFunc              = SwrSetSoFunc;
-    out_funcs.pfnSwrSetSoState             = SwrSetSoState;
-    out_funcs.pfnSwrSetSoBuffers           = SwrSetSoBuffers;
-    out_funcs.pfnSwrSetVertexFunc          = SwrSetVertexFunc;
-    out_funcs.pfnSwrSetFrontendState       = SwrSetFrontendState;
-    out_funcs.pfnSwrSetGsState             = SwrSetGsState;
-    out_funcs.pfnSwrSetGsFunc              = SwrSetGsFunc;
-    out_funcs.pfnSwrSetCsFunc              = SwrSetCsFunc;
-    out_funcs.pfnSwrSetTsState             = SwrSetTsState;
-    out_funcs.pfnSwrSetHsFunc              = SwrSetHsFunc;
-    out_funcs.pfnSwrSetDsFunc              = SwrSetDsFunc;
-    out_funcs.pfnSwrSetDepthStencilState   = SwrSetDepthStencilState;
-    out_funcs.pfnSwrSetBackendState        = SwrSetBackendState;
-    out_funcs.pfnSwrSetDepthBoundsState    = SwrSetDepthBoundsState;
-    out_funcs.pfnSwrSetPixelShaderState    = SwrSetPixelShaderState;
-    out_funcs.pfnSwrSetBlendState          = SwrSetBlendState;
-    out_funcs.pfnSwrSetBlendFunc           = SwrSetBlendFunc;
-    out_funcs.pfnSwrDraw                   = SwrDraw;
-    out_funcs.pfnSwrDrawInstanced          = SwrDrawInstanced;
-    out_funcs.pfnSwrDrawIndexed            = SwrDrawIndexed;
-    out_funcs.pfnSwrDrawIndexedInstanced   = SwrDrawIndexedInstanced;
-    out_funcs.pfnSwrInvalidateTiles        = SwrInvalidateTiles;
-    out_funcs.pfnSwrDiscardRect            = SwrDiscardRect;
-    out_funcs.pfnSwrDispatch               = SwrDispatch;
-    out_funcs.pfnSwrStoreTiles             = SwrStoreTiles;
-    out_funcs.pfnSwrClearRenderTarget      = SwrClearRenderTarget;
-    out_funcs.pfnSwrSetRastState           = SwrSetRastState;
-    out_funcs.pfnSwrSetViewports           = SwrSetViewports;
-    out_funcs.pfnSwrSetScissorRects        = SwrSetScissorRects;
-    out_funcs.pfnSwrGetPrivateContextState = SwrGetPrivateContextState;
-    out_funcs.pfnSwrAllocDrawContextMemory = SwrAllocDrawContextMemory;
-    out_funcs.pfnSwrEnableStatsFE          = SwrEnableStatsFE;
-    out_funcs.pfnSwrEnableStatsBE          = SwrEnableStatsBE;
-    out_funcs.pfnSwrEndFrame               = SwrEndFrame;
-    out_funcs.pfnSwrInit                   = SwrInit;
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
deleted file mode 100644
index 79e33b01677..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ /dev/null
@@ -1,772 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file api.h
- *
- * @brief API definitions
- *
- ******************************************************************************/
-
-#ifndef __SWR_API_H__
-#define __SWR_API_H__
-
-#include "common/os.h"
-
-#include <assert.h>
-#include <algorithm>
-
-#include "common/intrin.h"
-#include "common/formats.h"
-#include "core/state.h"
-
-typedef void(SWR_API* PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t data3);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Rectangle structure
-struct SWR_RECT
-{
-    int32_t xmin; ///< inclusive
-    int32_t ymin; ///< inclusive
-    int32_t xmax; ///< exclusive
-    int32_t ymax; ///< exclusive
-
-    bool operator==(const SWR_RECT& rhs)
-    {
-        return (this->ymin == rhs.ymin && this->ymax == rhs.ymax && this->xmin == rhs.xmin &&
-                this->xmax == rhs.xmax);
-    }
-
-    bool operator!=(const SWR_RECT& rhs) { return !(*this == rhs); }
-
-    SWR_RECT& Intersect(const SWR_RECT& other)
-    {
-        this->xmin = std::max(this->xmin, other.xmin);
-        this->ymin = std::max(this->ymin, other.ymin);
-        this->xmax = std::min(this->xmax, other.xmax);
-        this->ymax = std::min(this->ymax, other.ymax);
-
-        if (xmax - xmin < 0 || ymax - ymin < 0)
-        {
-            // Zero area
-            ymin = ymax = xmin = xmax = 0;
-        }
-
-        return *this;
-    }
-    SWR_RECT& operator&=(const SWR_RECT& other) { return Intersect(other); }
-
-    SWR_RECT& Union(const SWR_RECT& other)
-    {
-        this->xmin = std::min(this->xmin, other.xmin);
-        this->ymin = std::min(this->ymin, other.ymin);
-        this->xmax = std::max(this->xmax, other.xmax);
-        this->ymax = std::max(this->ymax, other.ymax);
-
-        return *this;
-    }
-
-    SWR_RECT& operator|=(const SWR_RECT& other) { return Union(other); }
-
-    void Translate(int32_t x, int32_t y)
-    {
-        xmin += x;
-        ymin += y;
-        xmax += x;
-        ymax += y;
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Function signature for load hot tiles
-/// @param hDC - handle to DRAW_CONTEXT
-/// @param dstFormat - format of the hot tile
-/// @param renderTargetIndex - render target to store, can be color, depth or stencil
-/// @param x - destination x coordinate
-/// @param y - destination y coordinate
-/// @param pDstHotTile - pointer to the hot tile surface
-typedef void(SWR_API* PFN_LOAD_TILE)(HANDLE                      hDC,
-                                     HANDLE                      hWorkerPrivateData,
-                                     SWR_FORMAT                  dstFormat,
-                                     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-                                     uint32_t                    x,
-                                     uint32_t                    y,
-                                     uint32_t                    renderTargetArrayIndex,
-                                     uint8_t*                    pDstHotTile);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Function signature for store hot tiles
-/// @param hDC - handle to DRAW_CONTEXT
-/// @param srcFormat - format of the hot tile
-/// @param renderTargetIndex - render target to store, can be color, depth or stencil
-/// @param x - destination x coordinate
-/// @param y - destination y coordinate
-/// @param pSrcHotTile - pointer to the hot tile surface
-typedef void(SWR_API* PFN_STORE_TILE)(HANDLE                      hDC,
-                                      HANDLE                      hWorkerPrivateData,
-                                      SWR_FORMAT                  srcFormat,
-                                      SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-                                      uint32_t                    x,
-                                      uint32_t                    y,
-                                      uint32_t                    renderTargetArrayIndex,
-                                      uint8_t*                    pSrcHotTile);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Function signature for clearing from the hot tiles clear value
-/// @param hPrivateContext - handle to private data
-/// @param renderTargetIndex - render target to store, can be color, depth or stencil
-/// @param x - destination x coordinate
-/// @param y - destination y coordinate
-/// @param renderTargetArrayIndex - render target array offset from arrayIndex
-/// @param pClearColor - pointer to the hot tile's clear value
-typedef void(SWR_API* PFN_CLEAR_TILE)(HANDLE                      hPrivateContext,
-                                      HANDLE                      hWorkerPrivateData,
-                                      SWR_RENDERTARGET_ATTACHMENT rtIndex,
-                                      uint32_t                    x,
-                                      uint32_t                    y,
-                                      uint32_t                    renderTargetArrayIndex,
-                                      const float*                pClearColor);
-
-typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_READ)(HANDLE   hPrivateContext,
-                                                      gfxptr_t xpAddr,
-                                                      bool*    pbNullTileAccessed,
-                                                      HANDLE   hPrivateWorkerData);
-
-typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_WRITE)(HANDLE   hPrivateContext,
-                                                       gfxptr_t xpAddr,
-                                                       bool*    pbNullTileAccessed,
-                                                       HANDLE   hPrivateWorkerData);
-
-typedef gfxptr_t(SWR_API* PFN_MAKE_GFXPTR)(HANDLE hPrivateContext, void* sysAddr);
-
-typedef HANDLE(SWR_API* PFN_CREATE_MEMORY_CONTEXT)(HANDLE hExternalMemory);
-
-typedef void(SWR_API* PFN_DESTROY_MEMORY_CONTEXT)(HANDLE hExternalMemory, HANDLE hMemoryContext);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Callback to allow driver to update their copy of streamout write offset.
-///        This is call is made for any draw operation that has streamout enabled
-///        and has updated the write offset.
-/// @param hPrivateContext - handle to private data
-/// @param soBufferSlot - buffer slot for write offset
-/// @param soWriteOffset - update value for so write offset.
-typedef void(SWR_API* PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE   hPrivateContext,
-                                                  uint32_t soBufferSlot,
-                                                  uint32_t soWriteOffset);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Callback to allow driver to update their copy of stats.
-/// @param hPrivateContext - handle to private data
-/// @param pStats - pointer to draw stats
-typedef void(SWR_API* PFN_UPDATE_STATS)(HANDLE hPrivateContext, const SWR_STATS* pStats);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Callback to allow driver to update their copy of FE stats.
-/// @note Its optimal to have a separate callback for FE stats since
-///       there is only one DC per FE thread. This means we do not have
-///       to sum up the stats across all of the workers.
-/// @param hPrivateContext - handle to private data
-/// @param pStats - pointer to draw stats
-typedef void(SWR_API* PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext, const SWR_STATS_FE* pStats);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Callback to allow driver to update StreamOut status
-/// @param hPrivateContext - handle to private data
-/// @param numPrims - number of primitives written to StreamOut buffer
-typedef void(SWR_API* PFN_UPDATE_STREAMOUT)(HANDLE hPrivateContext, uint64_t numPrims);
-
-//////////////////////////////////////////////////////////////////////////
-/// BucketManager
-/// Forward Declaration (see rdtsc_buckets.h for full definition)
-/////////////////////////////////////////////////////////////////////////
-class BucketManager;
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_THREADING_INFO
-/////////////////////////////////////////////////////////////////////////
-struct SWR_THREADING_INFO
-{
-    uint32_t BASE_NUMA_NODE;
-    uint32_t BASE_CORE;
-    uint32_t BASE_THREAD;
-    uint32_t MAX_WORKER_THREADS;
-    uint32_t MAX_NUMA_NODES;
-    uint32_t MAX_CORES_PER_NUMA_NODE;
-    uint32_t MAX_THREADS_PER_CORE;
-    bool     SINGLE_THREADED;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_API_THREADING_INFO
-/// Data used to reserve HW threads for API use
-/// API Threads are reserved from numa nodes / cores used for
-/// SWR Worker threads.  Specifying reserved threads here can reduce
-/// the total number of SWR worker threads.
-/////////////////////////////////////////////////////////////////////////
-struct SWR_API_THREADING_INFO
-{
-    uint32_t numAPIReservedThreads; // Default is 1 if SWR_API_THREADING_INFO is not sent
-    uint32_t bindAPIThread0;        // Default is true if numAPIReservedThreads is > 0,
-                                    // binds thread used in SwrCreateContext to API Reserved
-                                    // thread 0
-    uint32_t numAPIThreadsPerCore; // 0 - means use all threads per core, else clamp to this number.
-                                   // Independent of KNOB_MAX_THREADS_PER_CORE.
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_CONTEXT
-/// Forward Declaration (see context.h for full definition)
-/////////////////////////////////////////////////////////////////////////
-struct SWR_CONTEXT;
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_WORKER_PRIVATE_STATE
-/// Data used to allocate per-worker thread private data.  A pointer
-/// to this data will be passed in to each shader function.
-/// The first field of this private data must be SWR_WORKER_DATA
-/// perWorkerPrivateStateSize must be >= sizeof SWR_WORKER_DATA 
-/////////////////////////////////////////////////////////////////////////
-struct SWR_WORKER_PRIVATE_STATE
-{
-    typedef void(SWR_API* PFN_WORKER_DATA)(SWR_CONTEXT* pContext, HANDLE hWorkerPrivateData, uint32_t iWorkerNum);
-
-    size_t          perWorkerPrivateStateSize; ///< Amount of data to allocate per-worker
-    PFN_WORKER_DATA pfnInitWorkerData;         ///< Init function for worker data.  If null
-                                               ///< worker data will be initialized to 0.
-    PFN_WORKER_DATA pfnFinishWorkerData;       ///< Finish / destroy function for worker data.
-                                               ///< Can be null.
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_CREATECONTEXT_INFO
-/////////////////////////////////////////////////////////////////////////
-struct SWR_CREATECONTEXT_INFO
-{
-    // External functions (e.g. sampler) need per draw context state.
-    // Use SwrGetPrivateContextState() to access private state.
-    size_t privateStateSize;
-
-    // Optional per-worker state, can be NULL for no worker-private data
-    SWR_WORKER_PRIVATE_STATE* pWorkerPrivateState;
-
-    // Callback functions
-    PFN_LOAD_TILE                  pfnLoadTile;
-    PFN_STORE_TILE                 pfnStoreTile;
-    PFN_TRANSLATE_GFXPTR_FOR_READ  pfnTranslateGfxptrForRead;
-    PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
-    PFN_MAKE_GFXPTR                pfnMakeGfxPtr;
-    PFN_CREATE_MEMORY_CONTEXT      pfnCreateMemoryContext;
-    PFN_DESTROY_MEMORY_CONTEXT     pfnDestroyMemoryContext;
-    PFN_UPDATE_SO_WRITE_OFFSET     pfnUpdateSoWriteOffset;
-    PFN_UPDATE_STATS               pfnUpdateStats;
-    PFN_UPDATE_STATS_FE            pfnUpdateStatsFE;
-    PFN_UPDATE_STREAMOUT           pfnUpdateStreamOut;
-
-
-    // Pointer to rdtsc buckets mgr returned to the caller.
-    // Only populated when KNOB_ENABLE_RDTSC is set
-    BucketManager* pBucketMgr;
-
-    // Output: size required memory passed to for SwrSaveState / SwrRestoreState
-    size_t contextSaveSize;
-
-    // ArchRast event manager.
-    HANDLE hArEventManager;
-
-    // handle to external memory for worker data to create memory contexts
-    HANDLE hExternalMemory;
-
-    // Input (optional): Threading info that overrides any set KNOB values.
-    SWR_THREADING_INFO* pThreadInfo;
-
-    // Input (optional): Info for reserving API threads
-    SWR_API_THREADING_INFO* pApiThreadInfo;
-
-    // Input: if set to non-zero value, overrides KNOB value for maximum
-    // number of draws in flight
-    uint32_t MAX_DRAWS_IN_FLIGHT;
-
-    std::string contextName;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Create SWR Context.
-/// @param pCreateInfo - pointer to creation info.
-SWR_FUNC(HANDLE, SwrCreateContext, SWR_CREATECONTEXT_INFO* pCreateInfo);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Destroys SWR Context.
-/// @param hContext - Handle passed back from SwrCreateContext
-SWR_FUNC(void, SwrDestroyContext, HANDLE hContext);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bind current thread to an API reserved HW thread
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param apiThreadId - index of reserved HW thread to bind to.
-SWR_FUNC(void, SwrBindApiThread, HANDLE hContext, uint32_t apiThreadId);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Saves API state associated with hContext
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pOutputStateBlock - Memory block to receive API state data
-/// @param memSize - Size of memory pointed to by pOutputStateBlock
-SWR_FUNC(void, SwrSaveState, HANDLE hContext, void* pOutputStateBlock, size_t memSize);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Restores API state to hContext previously saved with SwrSaveState
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pStateBlock - Memory block to read API state data from
-/// @param memSize - Size of memory pointed to by pStateBlock
-SWR_FUNC(void, SwrRestoreState, HANDLE hContext, const void* pStateBlock, size_t memSize);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Sync cmd. Executes the callback func when all rendering up to this sync
-///        has been completed
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pfnFunc - pointer to callback function
-/// @param userData, userData2, userData3 - user data to pass back
-SWR_FUNC(void,
-         SwrSync,
-         HANDLE            hContext,
-         PFN_CALLBACK_FUNC pfnFunc,
-         uint64_t          userData,
-         uint64_t          userData2,
-         uint64_t          userData3);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Stall cmd. Stalls the backend until all previous work has been completed.
-///        Frontend work can continue to make progress
-/// @param hContext - Handle passed back from SwrCreateContext
-SWR_FUNC(void, SwrStallBE, HANDLE hContext);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Blocks until all rendering has been completed.
-/// @param hContext - Handle passed back from SwrCreateContext
-SWR_FUNC(void, SwrWaitForIdle, HANDLE hContext);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Blocks until all FE rendering has been completed.
-/// @param hContext - Handle passed back from SwrCreateContext
-SWR_FUNC(void, SwrWaitForIdleFE, HANDLE hContext);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set vertex buffer state.
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param numBuffers - Number of vertex buffer state descriptors.
-/// @param pVertexBuffers - Array of vertex buffer state descriptors.
-SWR_FUNC(void,
-         SwrSetVertexBuffers,
-         HANDLE                         hContext,
-         uint32_t                       numBuffers,
-         const SWR_VERTEX_BUFFER_STATE* pVertexBuffers);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set index buffer
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pIndexBuffer - Index buffer.
-SWR_FUNC(void, SwrSetIndexBuffer, HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set fetch shader pointer.
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pfnFetchFunc - Pointer to shader.
-SWR_FUNC(void, SwrSetFetchFunc, HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set streamout shader pointer.
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pfnSoFunc - Pointer to shader.
-/// @param streamIndex - specifies stream
-SWR_FUNC(void, SwrSetSoFunc, HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set streamout state
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pSoState - Pointer to streamout state.
-SWR_FUNC(void, SwrSetSoState, HANDLE hContext, SWR_STREAMOUT_STATE* pSoState);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set streamout buffer state
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pSoBuffer - Pointer to streamout buffer.
-/// @param slot - Slot to bind SO buffer to.
-SWR_FUNC(void, SwrSetSoBuffers, HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set vertex shader pointer.
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pfnVertexFunc - Pointer to shader.
-SWR_FUNC(void, SwrSetVertexFunc, HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set frontend state.
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pState - Pointer to state
-SWR_FUNC(void, SwrSetFrontendState, HANDLE hContext, SWR_FRONTEND_STATE* pState);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set geometry shader state.
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pState - Pointer to state
-SWR_FUNC(void, SwrSetGsState, HANDLE hContext, SWR_GS_STATE* pState);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set geometry shader
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pfnGsFunc - Pointer to geometry shader function
-SWR_FUNC(void, SwrSetGsFunc, HANDLE hContext, PFN_GS_FUNC pfnGsFunc);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set compute shader
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pfnCsFunc - Pointer to compute shader function
-/// @param totalThreadsInGroup - product of thread group dimensions.
-/// @param totalSpillFillSize - size in bytes needed for spill/fill.
-/// @param scratchSpaceSizePerInstance - size of the scratch space needed per simd instance
-/// @param numInstances - number of simd instances that are run per execution of the shader
-SWR_FUNC(void,
-         SwrSetCsFunc,
-         HANDLE      hContext,
-         PFN_CS_FUNC pfnCsFunc,
-         uint32_t    totalThreadsInGroup,
-         uint32_t    totalSpillFillSize,
-         uint32_t    scratchSpaceSizePerInstance,
-         uint32_t    numInstances);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set tessellation state.
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pState - Pointer to state
-SWR_FUNC(void, SwrSetTsState, HANDLE hContext, SWR_TS_STATE* pState);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set hull shader
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pfnFunc - Pointer to shader function
-SWR_FUNC(void, SwrSetHsFunc, HANDLE hContext, PFN_HS_FUNC pfnFunc);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set domain shader
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pfnFunc - Pointer to shader function
-SWR_FUNC(void, SwrSetDsFunc, HANDLE hContext, PFN_DS_FUNC pfnFunc);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set depth stencil state
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pState - Pointer to state.
-SWR_FUNC(void, SwrSetDepthStencilState, HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pState);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set backend state
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pState - Pointer to state.
-SWR_FUNC(void, SwrSetBackendState, HANDLE hContext, SWR_BACKEND_STATE* pState);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set depth bounds state
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pState - Pointer to state.
-SWR_FUNC(void, SwrSetDepthBoundsState, HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pState);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set pixel shader state
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pState - Pointer to state.
-SWR_FUNC(void, SwrSetPixelShaderState, HANDLE hContext, SWR_PS_STATE* pState);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set blend state
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pState - Pointer to state.
-SWR_FUNC(void, SwrSetBlendState, HANDLE hContext, SWR_BLEND_STATE* pState);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Set blend function
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param renderTarget - render target index
-/// @param pfnBlendFunc - function pointer
-SWR_FUNC(
-    void, SwrSetBlendFunc, HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrDraw
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param topology - Specifies topology for draw.
-/// @param startVertex - Specifies start vertex in vertex buffer for draw.
-/// @param primCount - Number of vertices.
-SWR_FUNC(void,
-         SwrDraw,
-         HANDLE             hContext,
-         PRIMITIVE_TOPOLOGY topology,
-         uint32_t           startVertex,
-         uint32_t           primCount);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrDrawInstanced
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param topology - Specifies topology for draw.
-/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
-/// @param numInstances - How many instances to render.
-/// @param startVertex - Specifies start vertex for draw. (vertex data)
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer
-/// (instanced data)
-SWR_FUNC(void,
-         SwrDrawInstanced,
-         HANDLE             hContext,
-         PRIMITIVE_TOPOLOGY topology,
-         uint32_t           numVertsPerInstance,
-         uint32_t           numInstances,
-         uint32_t           startVertex,
-         uint32_t           startInstance);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrDrawIndexed
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param topology - Specifies topology for draw.
-/// @param numIndices - Number of indices to read sequentially from index buffer.
-/// @param indexOffset - Starting index into index buffer.
-/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
-SWR_FUNC(void,
-         SwrDrawIndexed,
-         HANDLE             hContext,
-         PRIMITIVE_TOPOLOGY topology,
-         uint32_t           numIndices,
-         uint32_t           indexOffset,
-         int32_t            baseVertex);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrDrawIndexedInstanced
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param topology - Specifies topology for draw.
-/// @param numIndices - Number of indices to read sequentially from index buffer.
-/// @param numInstances - Number of instances to render.
-/// @param indexOffset - Starting index into index buffer.
-/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer
-/// (instanced data)
-SWR_FUNC(void,
-         SwrDrawIndexedInstanced,
-         HANDLE             hContext,
-         PRIMITIVE_TOPOLOGY topology,
-         uint32_t           numIndices,
-         uint32_t           numInstances,
-         uint32_t           indexOffset,
-         int32_t            baseVertex,
-         uint32_t           startInstance);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrInvalidateTiles
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to
-/// invalidate.
-/// @param invalidateRect - The pixel-coordinate rectangle to invalidate.  This will be expanded to
-///                         be hottile size-aligned.
-SWR_FUNC(void,
-         SwrInvalidateTiles,
-         HANDLE          hContext,
-         uint32_t        attachmentMask,
-         const SWR_RECT& invalidateRect);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrDiscardRect
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
-/// @param rect - The pixel-coordinate rectangle to discard.  Only fully-covered hottiles will be
-///               discarded.
-SWR_FUNC(void, SwrDiscardRect, HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrDispatch
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param threadGroupCountX - Number of thread groups dispatched in X direction
-/// @param threadGroupCountY - Number of thread groups dispatched in Y direction
-/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
-SWR_FUNC(void,
-         SwrDispatch,
-         HANDLE   hContext,
-         uint32_t threadGroupCountX,
-         uint32_t threadGroupCountY,
-         uint32_t threadGroupCountZ);
-
-/// Tile states, e.g. for SwrStoreTiles' postStoreTileState. @note Keep in sync with HOTTILE_STATE!
-enum SWR_TILE_STATE
-{ // NOTE(review): value 1 is not defined here; presumably used only by HOTTILE_STATE -- confirm
-    SWR_TILE_INVALID = 0, // tile is in uninitialized state and should be loaded with surface contents
-                          // before rendering
-    SWR_TILE_DIRTY    = 2, // tile contains newer data than the surface it represents
-    SWR_TILE_RESOLVED = 3, // tile is in sync with the surface it represents
-};
-
-/// @todo Add a good description for what attachments are and when and why you would use the
-/// different SWR_TILE_STATEs.
-SWR_FUNC(void,
-         SwrStoreTiles,
-         HANDLE          hContext,
-         uint32_t        attachmentMask,
-         SWR_TILE_STATE  postStoreTileState,
-         const SWR_RECT& storeRect);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear
-/// @param renderTargetArrayIndex - the RT array index to clear
-/// @param clearColor - color used for clearing render targets
-/// @param z - depth value used for clearing depth buffer
-/// @param stencil - stencil value used for clearing stencil buffer
-/// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
-SWR_FUNC(void,
-         SwrClearRenderTarget,
-         HANDLE          hContext,
-         uint32_t        attachmentMask,
-         uint32_t        renderTargetArrayIndex,
-         const float     clearColor[4],
-         float           z,
-         uint8_t         stencil,
-         const SWR_RECT& clearRect);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrSetRastState
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param pRastState - New SWR_RASTSTATE used for SwrDraw* commands
-SWR_FUNC(void, SwrSetRastState, HANDLE hContext, const SWR_RASTSTATE* pRastState);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrSetViewports
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param numViewports - number of viewports passed in
-/// @param pViewports - Specifies extents of viewport.
-/// @param pMatrices - If not specified then SWR computes a default one.
-SWR_FUNC(void,
-         SwrSetViewports,
-         HANDLE                       hContext,
-         uint32_t                     numViewports,
-         const SWR_VIEWPORT*          pViewports,
-         const SWR_VIEWPORT_MATRICES* pMatrices);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SwrSetScissorRects
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param numScissors - number of scissors passed in
-/// @param pScissors - array of scissors
-SWR_FUNC(
-    void, SwrSetScissorRects, HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Returns a pointer to the private context state for the current
-///        draw operation. This is used for external components such as the
-///        sampler.
-///
-/// @note  Client needs to resend private state prior to each draw call.
-///        Also, SWR is responsible for the private state memory.
-/// @param hContext - Handle passed back from SwrCreateContext
-SWR_FUNC(void*, SwrGetPrivateContextState, HANDLE hContext);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Clients can use this to allocate memory for draw/dispatch
-///        operations. The memory will automatically be freed once operation
-///        has completed. Client can use this to allocate binding tables,
-///        etc. needed for shader execution.
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param size - Size of allocation
-/// @param align - Alignment needed for allocation.
-SWR_FUNC(void*, SwrAllocDrawContextMemory, HANDLE hContext, uint32_t size, uint32_t align);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Enables or disables frontend (FE) stats counting
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param enable - If true then counts are incremented.
-SWR_FUNC(void, SwrEnableStatsFE, HANDLE hContext, bool enable);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Enables or disables backend (BE) stats counting
-/// @param hContext - Handle passed back from SwrCreateContext
-/// @param enable - If true then counts are incremented.
-SWR_FUNC(void, SwrEnableStatsBE, HANDLE hContext, bool enable);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Mark end of frame - used for performance profiling
-/// @param hContext - Handle passed back from SwrCreateContext
-SWR_FUNC(void, SwrEndFrame, HANDLE hContext);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Initialize swr backend and memory internal tables
-SWR_FUNC(void, SwrInit);
-
-
-struct SWR_INTERFACE // Table of function pointers: one pfnSwrXxx member per SwrXxx API entry point.
-{ // NOTE(review): the PFNSwr* types are presumably generated by the SWR_FUNC macro -- confirm
-    PFNSwrCreateContext          pfnSwrCreateContext;
-    PFNSwrDestroyContext         pfnSwrDestroyContext;
-    PFNSwrBindApiThread          pfnSwrBindApiThread;
-    PFNSwrSaveState              pfnSwrSaveState;
-    PFNSwrRestoreState           pfnSwrRestoreState;
-    PFNSwrSync                   pfnSwrSync;
-    PFNSwrStallBE                pfnSwrStallBE;
-    PFNSwrWaitForIdle            pfnSwrWaitForIdle;
-    PFNSwrWaitForIdleFE          pfnSwrWaitForIdleFE;
-    PFNSwrSetVertexBuffers       pfnSwrSetVertexBuffers;
-    PFNSwrSetIndexBuffer         pfnSwrSetIndexBuffer;
-    PFNSwrSetFetchFunc           pfnSwrSetFetchFunc;
-    PFNSwrSetSoFunc              pfnSwrSetSoFunc;
-    PFNSwrSetSoState             pfnSwrSetSoState;
-    PFNSwrSetSoBuffers           pfnSwrSetSoBuffers;
-    PFNSwrSetVertexFunc          pfnSwrSetVertexFunc;
-    PFNSwrSetFrontendState       pfnSwrSetFrontendState;
-    PFNSwrSetGsState             pfnSwrSetGsState;
-    PFNSwrSetGsFunc              pfnSwrSetGsFunc;
-    PFNSwrSetCsFunc              pfnSwrSetCsFunc;
-    PFNSwrSetTsState             pfnSwrSetTsState;
-    PFNSwrSetHsFunc              pfnSwrSetHsFunc;
-    PFNSwrSetDsFunc              pfnSwrSetDsFunc;
-    PFNSwrSetDepthStencilState   pfnSwrSetDepthStencilState;
-    PFNSwrSetBackendState        pfnSwrSetBackendState;
-    PFNSwrSetDepthBoundsState    pfnSwrSetDepthBoundsState;
-    PFNSwrSetPixelShaderState    pfnSwrSetPixelShaderState;
-    PFNSwrSetBlendState          pfnSwrSetBlendState;
-    PFNSwrSetBlendFunc           pfnSwrSetBlendFunc;
-    PFNSwrDraw                   pfnSwrDraw;
-    PFNSwrDrawInstanced          pfnSwrDrawInstanced;
-    PFNSwrDrawIndexed            pfnSwrDrawIndexed;
-    PFNSwrDrawIndexedInstanced   pfnSwrDrawIndexedInstanced;
-    PFNSwrInvalidateTiles        pfnSwrInvalidateTiles;
-    PFNSwrDiscardRect            pfnSwrDiscardRect;
-    PFNSwrDispatch               pfnSwrDispatch;
-    PFNSwrStoreTiles             pfnSwrStoreTiles;
-    PFNSwrClearRenderTarget      pfnSwrClearRenderTarget;
-    PFNSwrSetRastState           pfnSwrSetRastState;
-    PFNSwrSetViewports           pfnSwrSetViewports;
-    PFNSwrSetScissorRects        pfnSwrSetScissorRects;
-    PFNSwrGetPrivateContextState pfnSwrGetPrivateContextState;
-    PFNSwrAllocDrawContextMemory pfnSwrAllocDrawContextMemory;
-    PFNSwrEnableStatsFE          pfnSwrEnableStatsFE;
-    PFNSwrEnableStatsBE          pfnSwrEnableStatsBE;
-    PFNSwrEndFrame               pfnSwrEndFrame;
-    PFNSwrInit                   pfnSwrInit;
-}; // passed by reference to SwrGetInterface as out_funcs
-
-extern "C" { // C linkage for the interface query entry point
-typedef void(SWR_API* PFNSwrGetInterface)(SWR_INTERFACE& out_funcs); // pointer type matching SwrGetInterface
-SWR_VISIBLE void SWR_API SwrGetInterface(SWR_INTERFACE& out_funcs); // NOTE(review): presumably fills out_funcs
-}
-
-#endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
deleted file mode 100644
index 831617c213f..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ /dev/null
@@ -1,490 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file arena.h
- *
- * @brief Arena memory manager
- *        The arena is a convenient and fast way to manage allocations that are
- *        associated with an operation and can all be freed together once that
- *        operation has completed. Allocations are cheap since most of the time
- *        an allocation is simply an increment of an offset. Also, there is no need
- *        to free individual allocations. All of the arena memory can be freed at once.
- *
- ******************************************************************************/
-#pragma once
-
-#include <mutex>
-#include <algorithm>
-#include <atomic>
-#include "core/utils.h"
-
-static const size_t ARENA_BLOCK_ALIGN = 64;
-
-struct ArenaBlock // Header constructed at the start of every arena block; blocks chain via pNext.
-{
-    size_t      blockSize = 0;       // size requested from the allocator for this block (header included)
-    ArenaBlock* pNext     = nullptr; // next block in the list, or nullptr
-};
-static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, "Increase ARENA_BLOCK_ALIGN size");
-
-class DefaultAllocator // Allocates and frees ArenaBlocks directly with AlignedMalloc / AlignedFree.
-{
-public:
-    ArenaBlock* AllocateAligned(size_t size, size_t align) // returns nullptr if AlignedMalloc fails
-    {
-        SWR_ASSUME_ASSERT(size >= sizeof(ArenaBlock));
-        void*       pRaw = AlignedMalloc(size, align);
-        ArenaBlock* p    = pRaw ? new (pRaw) ArenaBlock() : nullptr; // never construct at a null address
-        if (p) { p->blockSize = size; } // header records the full allocation size
-        return p;
-    }
-
-    void Free(ArenaBlock* pMem) // releases a block via AlignedFree; nullptr is ignored
-    {
-        if (pMem)
-        {   // NOTE(review): 0xdddddddd looks like a freed-memory fill pattern guard -- confirm
-            SWR_ASSUME_ASSERT(pMem->blockSize < size_t(0xdddddddd));
-            AlignedFree(pMem);
-        }
-    }
-};
-
-// Caching Allocator for Arena
-template <uint32_t NumBucketsT = 8, uint32_t StartBucketBitT = 12>
-struct CachingAllocatorT : DefaultAllocator
-{
-    ArenaBlock* AllocateAligned(size_t size, size_t align)
-    {
-        SWR_ASSUME_ASSERT(size >= sizeof(ArenaBlock));
-        SWR_ASSUME_ASSERT(size <= uint32_t(-1));
-
-        uint32_t bucket = GetBucketId(size);
-
-        {
-            // search cached blocks
-            std::lock_guard<std::mutex> l(m_mutex);
-            ArenaBlock*                 pPrevBlock = &m_cachedBlocks[bucket];
-            ArenaBlock*                 pBlock     = SearchBlocks(pPrevBlock, size, align);
-
-            if (pBlock)
-            {
-                m_cachedSize -= pBlock->blockSize;
-                if (pBlock == m_pLastCachedBlocks[bucket])
-                {
-                    m_pLastCachedBlocks[bucket] = pPrevBlock;
-                }
-            }
-            else
-            {
-                pPrevBlock = &m_oldCachedBlocks[bucket];
-                pBlock     = SearchBlocks(pPrevBlock, size, align);
-
-                if (pBlock)
-                {
-                    m_oldCachedSize -= pBlock->blockSize;
-                    if (pBlock == m_pOldLastCachedBlocks[bucket])
-                    {
-                        m_pOldLastCachedBlocks[bucket] = pPrevBlock;
-                    }
-                }
-            }
-
-            if (pBlock)
-            {
-                assert(pPrevBlock && pPrevBlock->pNext == pBlock);
-                pPrevBlock->pNext = pBlock->pNext;
-                pBlock->pNext     = nullptr;
-
-                return pBlock;
-            }
-
-            m_totalAllocated += size;
-
-#if 0
-            {
-                static uint32_t count = 0;
-                char buf[128];
-                sprintf_s(buf, "Arena Alloc %d 0x%llx bytes - 0x%llx total\n", ++count, uint64_t(size), uint64_t(m_totalAllocated));
-                OutputDebugStringA(buf);
-            }
-#endif
-        }
-
-        if (bucket && bucket < (CACHE_NUM_BUCKETS - 1))
-        {
-            // Make all blocks in this bucket the same size
-            size = size_t(1) << (bucket + 1 + CACHE_START_BUCKET_BIT);
-        }
-
-        return this->DefaultAllocator::AllocateAligned(size, align);
-    }
-
-    void Free(ArenaBlock* pMem)
-    {
-        if (pMem)
-        {
-            std::unique_lock<std::mutex> l(m_mutex);
-            InsertCachedBlock(GetBucketId(pMem->blockSize), pMem);
-        }
-    }
-
-    void FreeOldBlocks()
-    {
-        if (!m_cachedSize)
-        {
-            return;
-        }
-        std::lock_guard<std::mutex> l(m_mutex);
-
-        bool doFree = (m_oldCachedSize > MAX_UNUSED_SIZE);
-
-        for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
-        {
-            if (doFree)
-            {
-                ArenaBlock* pBlock = m_oldCachedBlocks[i].pNext;
-                while (pBlock)
-                {
-                    ArenaBlock* pNext = pBlock->pNext;
-                    m_oldCachedSize -= pBlock->blockSize;
-                    m_totalAllocated -= pBlock->blockSize;
-                    this->DefaultAllocator::Free(pBlock);
-                    pBlock = pNext;
-                }
-                m_oldCachedBlocks[i].pNext = nullptr;
-                m_pOldLastCachedBlocks[i]  = &m_oldCachedBlocks[i];
-            }
-
-            if (m_pLastCachedBlocks[i] != &m_cachedBlocks[i])
-            {
-                if (i && i < (CACHE_NUM_BUCKETS - 1))
-                {
-                    // We know that all blocks are the same size.
-                    // Just move the list over.
-                    m_pLastCachedBlocks[i]->pNext = m_oldCachedBlocks[i].pNext;
-                    m_oldCachedBlocks[i].pNext    = m_cachedBlocks[i].pNext;
-                    m_cachedBlocks[i].pNext       = nullptr;
-                    if (m_pOldLastCachedBlocks[i]->pNext)
-                    {
-                        m_pOldLastCachedBlocks[i] = m_pLastCachedBlocks[i];
-                    }
-                    m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
-                }
-                else
-                {
-                    // The end buckets can have variable sized lists.
-                    // Insert each block based on size
-                    ArenaBlock* pBlock = m_cachedBlocks[i].pNext;
-                    while (pBlock)
-                    {
-                        ArenaBlock* pNext = pBlock->pNext;
-                        pBlock->pNext     = nullptr;
-                        m_cachedSize -= pBlock->blockSize;
-                        InsertCachedBlock<true>(i, pBlock);
-                        pBlock = pNext;
-                    }
-
-                    m_pLastCachedBlocks[i]  = &m_cachedBlocks[i];
-                    m_cachedBlocks[i].pNext = nullptr;
-                }
-            }
-        }
-
-        m_oldCachedSize += m_cachedSize;
-        m_cachedSize = 0;
-    }
-
-    CachingAllocatorT()
-    {
-        for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
-        {
-            m_pLastCachedBlocks[i]    = &m_cachedBlocks[i];
-            m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i];
-        }
-    }
-
-    ~CachingAllocatorT()
-    {
-        // Free all cached blocks
-        for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
-        {
-            ArenaBlock* pBlock = m_cachedBlocks[i].pNext;
-            while (pBlock)
-            {
-                ArenaBlock* pNext = pBlock->pNext;
-                this->DefaultAllocator::Free(pBlock);
-                pBlock = pNext;
-            }
-            pBlock = m_oldCachedBlocks[i].pNext;
-            while (pBlock)
-            {
-                ArenaBlock* pNext = pBlock->pNext;
-                this->DefaultAllocator::Free(pBlock);
-                pBlock = pNext;
-            }
-        }
-    }
-
-private:
-    static uint32_t GetBucketId(size_t blockSize)
-    {
-        uint32_t bucketId = 0;
-
-#if defined(BitScanReverseSizeT)
-        BitScanReverseSizeT((unsigned long*)&bucketId, (blockSize - 1) >> CACHE_START_BUCKET_BIT);
-        bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1);
-#endif
-
-        return bucketId;
-    }
-
-    template <bool OldBlockT = false>
-    void InsertCachedBlock(uint32_t bucketId, ArenaBlock* pNewBlock)
-    {
-        SWR_ASSUME_ASSERT(bucketId < CACHE_NUM_BUCKETS);
-
-        ArenaBlock* pPrevBlock =
-            OldBlockT ? &m_oldCachedBlocks[bucketId] : &m_cachedBlocks[bucketId];
-        ArenaBlock* pBlock = pPrevBlock->pNext;
-
-        while (pBlock)
-        {
-            if (pNewBlock->blockSize >= pBlock->blockSize)
-            {
-                // Insert here
-                break;
-            }
-            pPrevBlock = pBlock;
-            pBlock     = pBlock->pNext;
-        }
-
-        // Insert into list
-        SWR_ASSUME_ASSERT(pPrevBlock);
-        pPrevBlock->pNext = pNewBlock;
-        pNewBlock->pNext  = pBlock;
-
-        if (OldBlockT)
-        {
-            if (m_pOldLastCachedBlocks[bucketId] == pPrevBlock)
-            {
-                m_pOldLastCachedBlocks[bucketId] = pNewBlock;
-            }
-
-            m_oldCachedSize += pNewBlock->blockSize;
-        }
-        else
-        {
-            if (m_pLastCachedBlocks[bucketId] == pPrevBlock)
-            {
-                m_pLastCachedBlocks[bucketId] = pNewBlock;
-            }
-
-            m_cachedSize += pNewBlock->blockSize;
-        }
-    }
-
-    static ArenaBlock* SearchBlocks(ArenaBlock*& pPrevBlock, size_t blockSize, size_t align)
-    {
-        ArenaBlock* pBlock          = pPrevBlock->pNext;
-        ArenaBlock* pPotentialBlock = nullptr;
-        ArenaBlock* pPotentialPrev  = nullptr;
-
-        while (pBlock)
-        {
-            if (pBlock->blockSize >= blockSize)
-            {
-                if (pBlock == AlignUp(pBlock, align))
-                {
-                    if (pBlock->blockSize == blockSize)
-                    {
-                        // Won't find a better match
-                        break;
-                    }
-
-                    // We could use this as it is larger than we wanted, but
-                    // continue to search for a better match
-                    pPotentialBlock = pBlock;
-                    pPotentialPrev  = pPrevBlock;
-                }
-            }
-            else
-            {
-                // Blocks are sorted by size (biggest first)
-                // So, if we get here, there are no blocks
-                // large enough, fall through to allocation.
-                pBlock = nullptr;
-                break;
-            }
-
-            pPrevBlock = pBlock;
-            pBlock     = pBlock->pNext;
-        }
-
-        if (!pBlock)
-        {
-            // Couldn't find an exact match, use next biggest size
-            pBlock     = pPotentialBlock;
-            pPrevBlock = pPotentialPrev;
-        }
-
-        return pBlock;
-    }
-
-    // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
-    static const uint32_t CACHE_NUM_BUCKETS      = NumBucketsT;
-    static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT;
-    static const size_t   MAX_UNUSED_SIZE        = sizeof(MEGABYTE);
-
-    ArenaBlock  m_cachedBlocks[CACHE_NUM_BUCKETS];
-    ArenaBlock* m_pLastCachedBlocks[CACHE_NUM_BUCKETS];
-    ArenaBlock  m_oldCachedBlocks[CACHE_NUM_BUCKETS];
-    ArenaBlock* m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS];
-    std::mutex  m_mutex;
-
-    size_t m_totalAllocated = 0;
-
-    size_t m_cachedSize    = 0;
-    size_t m_oldCachedSize = 0;
-};
-typedef CachingAllocatorT<> CachingAllocator;
-
-template <typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)>
-class TArena
-{
-public:
-    TArena(T& in_allocator) : m_allocator(in_allocator) {}
-    TArena() : m_allocator(m_defAllocator) {}
-    ~TArena() { Reset(true); }
-
-    void* AllocAligned(size_t size, size_t align)
-    {
-        if (0 == size)
-        {
-            return nullptr;
-        }
-
-        SWR_ASSERT(align <= ARENA_BLOCK_ALIGN);
-
-        if (m_pCurBlock)
-        {
-            ArenaBlock* pCurBlock = m_pCurBlock;
-            size_t      offset    = AlignUp(m_offset, align);
-
-            if ((offset + size) <= pCurBlock->blockSize)
-            {
-                void* pMem = PtrAdd(pCurBlock, offset);
-                m_offset   = offset + size;
-                return pMem;
-            }
-
-            // Not enough memory in this block, fall through to allocate
-            // a new block
-        }
-
-        static const size_t ArenaBlockSize = BlockSizeT;
-        size_t              blockSize      = std::max(size + ARENA_BLOCK_ALIGN, ArenaBlockSize);
-
-        // Add in one BLOCK_ALIGN unit to store ArenaBlock in.
-        blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN);
-
-        ArenaBlock* pNewBlock = m_allocator.AllocateAligned(
-            blockSize, ARENA_BLOCK_ALIGN); // Arena blocks are always simd byte aligned.
-        SWR_ASSERT(pNewBlock != nullptr);
-
-        if (pNewBlock != nullptr)
-        {
-            m_offset         = ARENA_BLOCK_ALIGN;
-            pNewBlock->pNext = m_pCurBlock;
-
-            m_pCurBlock = pNewBlock;
-        }
-
-        return AllocAligned(size, align);
-    }
-
-    void* Alloc(size_t size) { return AllocAligned(size, 1); }
-
-    void* AllocAlignedSync(size_t size, size_t align)
-    {
-        void* pAlloc = nullptr;
-
-        m_mutex.lock();
-        pAlloc = AllocAligned(size, align);
-        m_mutex.unlock();
-
-        return pAlloc;
-    }
-
-    void* AllocSync(size_t size)
-    {
-        void* pAlloc = nullptr;
-
-        m_mutex.lock();
-        pAlloc = Alloc(size);
-        m_mutex.unlock();
-
-        return pAlloc;
-    }
-
-    void Reset(bool removeAll = false)
-    {
-        m_offset = ARENA_BLOCK_ALIGN;
-
-        if (m_pCurBlock)
-        {
-            ArenaBlock* pUsedBlocks = m_pCurBlock->pNext;
-            m_pCurBlock->pNext      = nullptr;
-            while (pUsedBlocks)
-            {
-                ArenaBlock* pBlock = pUsedBlocks;
-                pUsedBlocks        = pBlock->pNext;
-
-                m_allocator.Free(pBlock);
-            }
-
-            if (removeAll)
-            {
-                m_allocator.Free(m_pCurBlock);
-                m_pCurBlock = nullptr;
-            }
-        }
-    }
-
-    bool IsEmpty()
-    {
-        return (m_pCurBlock == nullptr) ||
-               (m_offset == ARENA_BLOCK_ALIGN && m_pCurBlock->pNext == nullptr);
-    }
-
-private:
-    ArenaBlock* m_pCurBlock = nullptr;
-    size_t      m_offset    = ARENA_BLOCK_ALIGN;
-
-    /// @note Mutex is only used by sync allocation functions.
-    std::mutex m_mutex;
-
-    DefaultAllocator m_defAllocator;
-    T&               m_allocator;
-};
-
-using StdArena     = TArena<DefaultAllocator>;
-using CachingArena = TArena<CachingAllocator>;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
deleted file mode 100644
index bb9d6f7dc52..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ /dev/null
@@ -1,420 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file backend.cpp
- *
- * @brief Backend handles rasterization, pixel shading and output merger
- *        operations.
- *
- ******************************************************************************/
-
-#include <smmintrin.h>
-
-#include "backend.h"
-#include "backend_impl.h"
-#include "tilemgr.h"
-#include "memory/tilingtraits.h"
-#include "core/multisample.h"
-#include "backends/gen_BackendPixelRate.hpp"
-
-#include <algorithm>
-
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Process compute work.
-/// @param pDC - pointer to draw context (dispatch).
-/// @param workerId - The unique worker ID that is assigned to this thread.
-/// @param threadGroupId - the linear index for the thread group within the dispatch.
-void ProcessComputeBE(DRAW_CONTEXT* pDC,
-                      uint32_t      workerId,
-                      uint32_t      threadGroupId,
-                      void*&        pSpillFillBuffer,
-                      void*&        pScratchSpace)
-{
-    SWR_CONTEXT* pContext = pDC->pContext;
-
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEDispatch, pDC->drawId);
-
-    const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
-    SWR_ASSERT(pTaskData != nullptr);
-
-    // Ensure spill fill memory has been allocated.
-    size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
-    if (spillFillSize && pSpillFillBuffer == nullptr)
-    {
-        pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD16_BYTES);
-    }
-
-    size_t scratchSpaceSize =
-        pDC->pState->state.scratchSpaceSizePerWarp * pDC->pState->state.scratchSpaceNumWarps;
-    if (scratchSpaceSize && pScratchSpace == nullptr)
-    {
-        pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD16_BYTES);
-    }
-
-    const API_STATE& state = GetApiState(pDC);
-
-    SWR_CS_CONTEXT csContext{0};
-    csContext.tileCounter         = threadGroupId;
-    csContext.dispatchDims[0]     = pTaskData->threadGroupCountX;
-    csContext.dispatchDims[1]     = pTaskData->threadGroupCountY;
-    csContext.dispatchDims[2]     = pTaskData->threadGroupCountZ;
-    csContext.pTGSM               = pContext->ppScratch[workerId];
-    csContext.pSpillFillBuffer    = (uint8_t*)pSpillFillBuffer;
-    csContext.pScratchSpace       = (uint8_t*)pScratchSpace;
-    csContext.scratchSpacePerWarp = pDC->pState->state.scratchSpaceSizePerWarp;
-
-    state.pfnCsFunc(GetPrivateState(pDC),
-                    pContext->threadPool.pThreadData[workerId].pWorkerPrivateData,
-                    &csContext);
-
-    UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
-    AR_EVENT(CSStats((HANDLE)&csContext.stats));
-
-    RDTSC_END(pDC->pContext->pBucketMgr, BEDispatch, 1);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Process shutdown.
-/// @param pDC - pointer to draw context (dispatch).
-/// @param workerId - The unique worker ID that is assigned to this thread.
-/// @param threadGroupId - the linear index for the thread group within the dispatch.
-void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
-{
-    // Dummy function
-}
-
-void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
-{
-    uint32_t x, y;
-    MacroTileMgr::getTileIndices(macroTile, x, y);
-    SWR_ASSERT(x == 0 && y == 0);
-}
-
-void ProcessStoreTileBE(DRAW_CONTEXT*               pDC,
-                        uint32_t                    workerId,
-                        uint32_t                    macroTile,
-                        STORE_TILES_DESC*           pDesc,
-                        SWR_RENDERTARGET_ATTACHMENT attachment)
-{
-    SWR_CONTEXT* pContext           = pDC->pContext;
-    HANDLE       hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
-
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEStoreTiles, pDC->drawId);
-
-    SWR_FORMAT srcFormat;
-    switch (attachment)
-    {
-    case SWR_ATTACHMENT_COLOR0:
-    case SWR_ATTACHMENT_COLOR1:
-    case SWR_ATTACHMENT_COLOR2:
-    case SWR_ATTACHMENT_COLOR3:
-    case SWR_ATTACHMENT_COLOR4:
-    case SWR_ATTACHMENT_COLOR5:
-    case SWR_ATTACHMENT_COLOR6:
-    case SWR_ATTACHMENT_COLOR7:
-        srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
-        break;
-    case SWR_ATTACHMENT_DEPTH:
-        srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT;
-        break;
-    case SWR_ATTACHMENT_STENCIL:
-        srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT;
-        break;
-    default:
-        SWR_INVALID("Unknown attachment: %d", attachment);
-        srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
-        break;
-    }
-
-    uint32_t x, y;
-    MacroTileMgr::getTileIndices(macroTile, x, y);
-
-    // Only need to store the hottile if it's been rendered to...
-    HOTTILE* pHotTile =
-        pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
-    if (pHotTile)
-    {
-        // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
-        if (pHotTile->state == HOTTILE_CLEAR)
-        {
-            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
-            SWR_ASSERT(pfnClearTiles != nullptr);
-
-            pfnClearTiles(pDC,
-                          hWorkerPrivateData,
-                          attachment,
-                          macroTile,
-                          pHotTile->renderTargetArrayIndex,
-                          pHotTile->clearData,
-                          pDesc->rect);
-        }
-
-        if (pHotTile->state == HOTTILE_DIRTY ||
-            pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
-        {
-            int32_t destX = KNOB_MACROTILE_X_DIM * x;
-            int32_t destY = KNOB_MACROTILE_Y_DIM * y;
-
-            pContext->pfnStoreTile(pDC,
-                                   hWorkerPrivateData,
-                                   srcFormat,
-                                   attachment,
-                                   destX,
-                                   destY,
-                                   pHotTile->renderTargetArrayIndex,
-                                   pHotTile->pBuffer);
-        }
-
-        if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED) 
-        {
-            if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY &&
-                  pHotTile->state == HOTTILE_RESOLVED))
-            {
-                pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
-            }
-        }
-    }
-    RDTSC_END(pDC->pContext->pBucketMgr, BEStoreTiles, 1);
-}
-
-void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
-{
-    STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pData;
-
-    unsigned long rt   = 0;
-    uint32_t      mask = pDesc->attachmentMask;
-    while (_BitScanForward(&rt, mask))
-    {
-        mask &= ~(1 << rt);
-        ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
-    }
-}
-
-void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
-                                     uint32_t      workerId,
-                                     uint32_t      macroTile,
-                                     void*         pData)
-{
-    DISCARD_INVALIDATE_TILES_DESC* pDesc    = (DISCARD_INVALIDATE_TILES_DESC*)pData;
-    SWR_CONTEXT*                   pContext = pDC->pContext;
-
-    const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
-
-    for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
-    {
-        if (pDesc->attachmentMask & (1 << i))
-        {
-            HOTTILE* pHotTile =
-                pContext->pHotTileMgr->GetHotTileNoLoad(pContext,
-                                                        pDC,
-                                                        macroTile,
-                                                        (SWR_RENDERTARGET_ATTACHMENT)i,
-                                                        pDesc->createNewTiles,
-                                                        numSamples);
-            if (pHotTile)
-            {
-                HOTTILE_STATE newState = (HOTTILE_STATE)pDesc->newTileState;;
-                if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_CLEAR)
-                {
-                    if (newState == HOTTILE_INVALID)
-                    {
-                        // This is OK for APIs that explicitly allow discards
-                        // (for e.g. depth / stencil data)
-                        //SWR_INVALID("Discarding valid data!");
-                    }
-                }
-                pHotTile->state = newState;
-            }
-        }
-    }
-}
-
-template <uint32_t sampleCountT>
-void BackendNullPS(DRAW_CONTEXT*        pDC,
-                   uint32_t             workerId,
-                   uint32_t             x,
-                   uint32_t             y,
-                   SWR_TRIANGLE_DESC&   work,
-                   RenderOutputBuffers& renderBuffers)
-{
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BENullBackend, pDC->drawId);
-    ///@todo: handle center multisample pattern
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
-
-    const API_STATE& state = GetApiState(pDC);
-
-    BarycentricCoeffs coeffs;
-    SetupBarycentricCoeffs(&coeffs, work);
-
-    uint8_t *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
-
-    SWR_PS_CONTEXT psContext;
-    // skip SetupPixelShaderContext(&psContext, ...); // not needed here
-
-    RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);
-
-    simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
-    const simdscalar           dy        = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
-    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
-    {
-        simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
-        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
-        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
-        {
-            // iterate over active samples
-            unsigned long sample     = 0;
-            uint32_t      sampleMask = state.blendState.sampleMask;
-            while (_BitScanForward(&sample, sampleMask))
-            {
-                sampleMask &= ~(1 << sample);
-
-                simdmask coverageMask = work.coverageMask[sample] & MASK;
-
-                if (coverageMask)
-                {
-                    // offset depth/stencil buffers current sample
-                    uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
-                    uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
-                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-                    {
-                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
-                                      "Unsupported depth hot tile format");
-
-                        const simdscalar z =
-                            _simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
-
-                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
-                    }
-
-                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
-
-                    // calculate per sample positions
-                    psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
-                    psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));
-
-                    CalcSampleBarycentrics(coeffs, psContext);
-
-                    // interpolate and quantize z
-                    psContext.vZ = vplaneps(coeffs.vZa,
-                                            coeffs.vZb,
-                                            coeffs.vZc,
-                                            psContext.vI.sample,
-                                            psContext.vJ.sample);
-                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
-                    RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
-
-                    // interpolate user clip distance if available
-                    if (state.backendState.clipDistanceMask)
-                    {
-                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
-                                                             work.pUserClipBuffer,
-                                                             psContext.vI.sample,
-                                                             psContext.vJ.sample);
-                    }
-
-                    simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
-                    simdscalar stencilPassMask = vCoverageMask;
-
-                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
-                    simdscalar depthPassMask = DepthStencilTest(&state,
-                                                                work.triFlags.frontFacing,
-                                                                work.triFlags.viewportIndex,
-                                                                psContext.vZ,
-                                                                pDepthSample,
-                                                                vCoverageMask,
-                                                                pStencilSample,
-                                                                &stencilPassMask);
-                    AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask),
-                                                         _simd_movemask_ps(stencilPassMask),
-                                                         _simd_movemask_ps(vCoverageMask)));
-                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
-                                      &state.depthStencilState,
-                                      work.triFlags.frontFacing,
-                                      psContext.vZ,
-                                      pDepthSample,
-                                      depthPassMask,
-                                      vCoverageMask,
-                                      pStencilSample,
-                                      stencilPassMask);
-                    RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
-
-                    uint32_t statMask  = _simd_movemask_ps(depthPassMask);
-                    uint32_t statCount = _mm_popcnt_u32(statMask);
-                    UPDATE_STAT_BE(DepthPassCount, statCount);
-                }
-
-            Endtile:
-                ATTR_UNUSED;
-                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer +=
-                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
-            vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
-        }
-
-        vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
-    }
-
-    RDTSC_END(pDC->pContext->pBucketMgr, BENullBackend, 0);
-}
-
-PFN_CLEAR_TILES  gClearTilesTable[NUM_SWR_FORMATS] = {};
-PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
-PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid
-                                     [2]                           // canEarlyZ
-    = {};
-PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
-                                       [SWR_INPUT_COVERAGE_COUNT][2]   // centroid
-                                       [2]                             // forcedSampleCount
-                                       [2]                             // canEarlyZ
-    = {};
-PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT]
-                                        [2] // centroid
-                                        [2] // canEarlyZ
-    = {};
-
-void InitBackendFuncTables()
-{
-    InitBackendPixelRate();
-    InitBackendSingleFuncTable(gBackendSingleSample);
-    InitBackendSampleFuncTable(gBackendSampleRateTable);
-
-    gBackendNullPs[SWR_MULTISAMPLE_1X]  = &BackendNullPS<SWR_MULTISAMPLE_1X>;
-    gBackendNullPs[SWR_MULTISAMPLE_2X]  = &BackendNullPS<SWR_MULTISAMPLE_2X>;
-    gBackendNullPs[SWR_MULTISAMPLE_4X]  = &BackendNullPS<SWR_MULTISAMPLE_4X>;
-    gBackendNullPs[SWR_MULTISAMPLE_8X]  = &BackendNullPS<SWR_MULTISAMPLE_8X>;
-    gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS<SWR_MULTISAMPLE_16X>;
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
deleted file mode 100644
index c9eb6c259e3..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file backend.h
- *
- * @brief Backend handles rasterization, pixel shading and output merger
- *        operations.
- *
- ******************************************************************************/
-#pragma once
-
-#include "common/os.h"
-#include "core/context.h"
-#include "core/multisample.h"
-#include "depthstencil.h"
-#include "rdtsc_core.h"
-
-void ProcessComputeBE(DRAW_CONTEXT* pDC,
-                      uint32_t      workerId,
-                      uint32_t      threadGroupId,
-                      void*&        pSpillFillBuffer,
-                      void*&        pScratchSpace);
-void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
-void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
-void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
-void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
-                                     uint32_t      workerId,
-                                     uint32_t      macroTile,
-                                     void*         pData);
-void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
-
-typedef void (*PFN_CLEAR_TILES)(DRAW_CONTEXT*,
-                                HANDLE                      hWorkerData,
-                                SWR_RENDERTARGET_ATTACHMENT rt,
-                                uint32_t,
-                                uint32_t,
-                                uint32_t[4],
-                                const SWR_RECT& rect);
-
-extern PFN_CLEAR_TILES  gClearTilesTable[NUM_SWR_FORMATS];
-extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
-extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2]     // centroid
-                                            [2];                              // canEarlyZ
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
-                                              [SWR_INPUT_COVERAGE_COUNT][2]   // centroid
-                                              [2]                             // forcedSampleCount
-                                              [2]                             // canEarlyZ
-    ;
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
-                                               [SWR_INPUT_COVERAGE_COUNT][2] // centroid
-                                               [2];                          // canEarlyZ
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
deleted file mode 100644
index e772306faec..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
+++ /dev/null
@@ -1,308 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file backend_clear.cpp
- *
- * @brief Backend handles rasterization, pixel shading and output merger
- *        operations.
- *
- ******************************************************************************/
-
-#include <smmintrin.h>
-
-#include "backend.h"
-#include "backend_impl.h"
-#include "tilemgr.h"
-#include "memory/tilingtraits.h"
-#include "core/multisample.h"
-
-#include <algorithm>
-
-// Fills one raster tile's worth of hot tile memory, starting at pTileBuffer,
-// with the already-packed per-component clear value.
-template <SWR_FORMAT format>
-void ClearRasterTile(uint8_t* pTileBuffer, simd16vector& value)
-{
-    typedef FormatTraits<format> Traits;
-
-    // Stores a single component through FormatTraits::storeSOA, then moves the
-    // write cursor past the bytes that component occupies for one SIMD16.
-    auto storeComponent = [&](int32_t c)
-    {
-        Traits::storeSOA(c, pTileBuffer, value.v[c]);
-
-        pTileBuffer += (KNOB_SIMD16_WIDTH * Traits::GetBPC(c) / 8);
-    };
-
-    // Number of SIMD16 tiles needed to cover the raster tile.
-    const uint32_t simd16TileCount =
-        (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
-
-    uint32_t tilesLeft = simd16TileCount;
-    while (tilesLeft != 0)
-    {
-        // write every component of the format for this SIMD16 tile
-        UnrollerL<0, Traits::numComps, 1>::step(storeComponent);
-        --tilesLeft;
-    }
-}
-
-// Clears the part of one macrotile's hot tile that lies inside `rect`.
-//
-// `clear` holds up to four raw 32-bit clear values (RGBA float/uint32). They are
-// converted to the hot tile format once and then written to every raster tile
-// (and every sample of it) that the clipped rectangle touches. The hot tile is
-// marked HOTTILE_DIRTY afterwards.
-template <SWR_FORMAT format>
-INLINE void ClearMacroTile(DRAW_CONTEXT*               pDC,
-                           HANDLE                      hWorkerPrivateData,
-                           SWR_RENDERTARGET_ATTACHMENT rt,
-                           uint32_t                    macroTile,
-                           uint32_t                    renderTargetArrayIndex,
-                           uint32_t                    clear[4],
-                           const SWR_RECT&             rect)
-{
-    typedef FormatTraits<format> Traits;
-
-    // Build the SIMD16 clear value: broadcast each component, scale and convert
-    // normalized components to integers, pack, and store in swizzled order.
-    simd16vector vClear;
-    for (uint32_t c = 0; c < Traits::numComps; ++c)
-    {
-        simd16scalar vValue = _simd16_load1_ps((const float*)&clear[c]);
-
-        if (Traits::isNormalized(c))
-        {
-            const simd16scalar vScaled =
-                _simd16_mul_ps(vValue, _simd16_set1_ps(Traits::fromFloat(c)));
-            vValue = _simd16_castsi_ps(_simd16_cvtps_epi32(vScaled));
-        }
-        vValue = Traits::pack(c, vValue);
-
-        vClear.v[Traits::swizzle(c)] = vValue;
-    }
-
-    uint32_t macroX, macroY;
-    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
-
-    // Begin with the full extent of this macrotile ...
-    SWR_RECT tileRect = {
-        KNOB_MACROTILE_X_DIM * int32_t(macroX),
-        KNOB_MACROTILE_Y_DIM * int32_t(macroY),
-        KNOB_MACROTILE_X_DIM * int32_t(macroX + 1),
-        KNOB_MACROTILE_Y_DIM * int32_t(macroY + 1),
-    };
-
-    // ... intersect it with the requested clear rectangle ...
-    tileRect &= rect;
-
-    // ... and rebase it so that the hot tile origin is (0, 0).
-    tileRect.Translate(-int32_t(macroX) * KNOB_MACROTILE_X_DIM,
-                       -int32_t(macroY) * KNOB_MACROTILE_Y_DIM);
-
-    // Maximums become inclusive so the shifts below yield the last touched tile.
-    tileRect.xmax -= 1;
-    tileRect.ymax -= 1;
-
-    // Pixel coordinates -> raster tile indices.
-    tileRect.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
-    tileRect.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
-    tileRect.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
-    tileRect.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
-
-    const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
-
-    // Byte strides: one sample of a raster tile, one complete raster tile (all
-    // samples), and one row of raster tiles across the macrotile.
-    const uint32_t bytesPerTileSample = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * Traits::bpp / 8;
-    const uint32_t bytesPerTile =
-        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (Traits::bpp / 8)) * numSamples;
-    const uint32_t bytesPerTileRow = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * bytesPerTile;
-    const uint32_t pitch           = (Traits::bpp * KNOB_MACROTILE_X_DIM / 8);
-
-    HOTTILE* pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext,
-                                                               pDC,
-                                                               hWorkerPrivateData,
-                                                               macroTile,
-                                                               rt,
-                                                               true,
-                                                               numSamples,
-                                                               renderTargetArrayIndex);
-
-    // Byte offset of the first raster tile to clear inside the hot tile buffer.
-    const uint32_t firstTileOffset =
-        (ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, Traits::bpp>>(
-            pitch, tileRect.xmin, tileRect.ymin)) *
-        numSamples;
-
-    // Walk every raster tile inside the clipped rectangle, row by row.
-    uint8_t* pRowCursor = pHotTile->pBuffer + firstTileOffset;
-    for (int32_t tileY = tileRect.ymin; tileY <= tileRect.ymax; ++tileY)
-    {
-        uint8_t* pDst = pRowCursor;
-        for (int32_t tileX = tileRect.xmin; tileX <= tileRect.xmax; ++tileX)
-        {
-            // the samples of one raster tile are stored back to back
-            int32_t samplesLeft = numSamples;
-            while (samplesLeft > 0)
-            {
-                ClearRasterTile<format>(pDst, vClear);
-                pDst += bytesPerTileSample;
-                --samplesLeft;
-            }
-        }
-        pRowCursor += bytesPerTileRow;
-    }
-
-    pHotTile->state = HOTTILE_DIRTY;
-}
-
-// Backend work item that clears the attachments selected by a CLEAR_DESC on a
-// single macrotile.
-//
-//   pDC       - draw context that owns the clear
-//   workerId  - index into the thread pool's per-worker data
-//   macroTile - id of the macrotile to clear
-//   pUserData - pointer to the CLEAR_DESC describing the clear
-//
-// With KNOB_FAST_CLEAR the hot tiles are only tagged HOTTILE_CLEAR and the clear
-// value is recorded in HOTTILE::clearData. Otherwise the hot tile contents are
-// overwritten right away through the routines in gClearTilesTable.
-void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
-{
-    SWR_CONTEXT* pContext           = pDC->pContext;
-    HANDLE       hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
-    CLEAR_DESC*  pClear             = (CLEAR_DESC*)pUserData;
-
-    if (KNOB_FAST_CLEAR)
-    {
-        SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
-        uint32_t              numSamples  = GetNumSamples(sampleCount);
-
-        SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
-
-        RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEClear, pDC->drawId);
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
-        {
-            // visit every color attachment whose bit is set in the mask
-            unsigned long rt   = 0;
-            uint32_t      mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
-            while (_BitScanForward(&rt, mask))
-            {
-                mask &= ~(1 << rt);
-
-                HOTTILE* pHotTile =
-                    pContext->pHotTileMgr->GetHotTile(pContext,
-                                                      pDC,
-                                                      hWorkerPrivateData,
-                                                      macroTile,
-                                                      (SWR_RENDERTARGET_ATTACHMENT)rt,
-                                                      true,
-                                                      numSamples,
-                                                      pClear->renderTargetArrayIndex);
-
-                // All we want to do here is to mark the hot tile as being in a "needs clear" state.
-                // The clear color is stored as its raw 32-bit pattern.
-                pHotTile->clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
-                pHotTile->clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
-                pHotTile->clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
-                pHotTile->clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);
-                pHotTile->state        = HOTTILE_CLEAR;
-            }
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
-        {
-            HOTTILE* pHotTile      = pContext->pHotTileMgr->GetHotTile(pContext,
-                                                                  pDC,
-                                                                  hWorkerPrivateData,
-                                                                  macroTile,
-                                                                  SWR_ATTACHMENT_DEPTH,
-                                                                  true,
-                                                                  numSamples,
-                                                                  pClear->renderTargetArrayIndex);
-            pHotTile->clearData[0] = *(uint32_t*)&pClear->clearDepth;
-            pHotTile->state        = HOTTILE_CLEAR;
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
-        {
-            HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext,
-                                                                  pDC,
-                                                                  hWorkerPrivateData,
-                                                                  macroTile,
-                                                                  SWR_ATTACHMENT_STENCIL,
-                                                                  true,
-                                                                  numSamples,
-                                                                  pClear->renderTargetArrayIndex);
-
-            pHotTile->clearData[0] = pClear->clearStencil;
-            pHotTile->state        = HOTTILE_CLEAR;
-        }
-
-        RDTSC_END(pDC->pContext->pBucketMgr, BEClear, 1);
-    }
-    else
-    {
-        // Legacy clear: write the clear value into the hot tiles immediately.
-        RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEClear, pDC->drawId);
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
-        {
-            uint32_t clearData[4];
-            clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
-            clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
-            clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
-            clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);
-
-            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
-            SWR_ASSERT(pfnClearTiles != nullptr);
-
-            // visit every color attachment whose bit is set in the mask
-            unsigned long rt   = 0;
-            uint32_t      mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
-            while (_BitScanForward(&rt, mask))
-            {
-                mask &= ~(1 << rt);
-
-                pfnClearTiles(pDC,
-                              hWorkerPrivateData,
-                              (SWR_RENDERTARGET_ATTACHMENT)rt,
-                              macroTile,
-                              pClear->renderTargetArrayIndex,
-                              clearData,
-                              pClear->rect);
-            }
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
-        {
-            // only element 0 is filled in for the depth clear
-            uint32_t clearData[4];
-            clearData[0]                  = *(uint32_t*)&pClear->clearDepth;
-            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
-            SWR_ASSERT(pfnClearTiles != nullptr);
-
-            pfnClearTiles(pDC,
-                          hWorkerPrivateData,
-                          SWR_ATTACHMENT_DEPTH,
-                          macroTile,
-                          pClear->renderTargetArrayIndex,
-                          clearData,
-                          pClear->rect);
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
-        {
-            // only element 0 is filled in for the stencil clear
-            uint32_t clearData[4];
-            clearData[0]                  = pClear->clearStencil;
-            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
-            // Same guard as the color and depth paths: the table entry is null
-            // when no clear routine is registered for the hot tile format.
-            SWR_ASSERT(pfnClearTiles != nullptr);
-
-            pfnClearTiles(pDC,
-                          hWorkerPrivateData,
-                          SWR_ATTACHMENT_STENCIL,
-                          macroTile,
-                          pClear->renderTargetArrayIndex,
-                          clearData,
-                          pClear->rect);
-        }
-
-        RDTSC_END(pDC->pContext->pBucketMgr, BEClear, 1);
-    }
-}
-
-// Resets gClearTilesTable and registers the macrotile clear routine for each
-// format that has one; all other entries stay null.
-void InitClearTilesTable()
-{
-    for (PFN_CLEAR_TILES& pfnEntry : gClearTilesTable)
-    {
-        pfnEntry = nullptr;
-    }
-
-    gClearTilesTable[R8_UINT]            = ClearMacroTile<R8_UINT>;
-    gClearTilesTable[R32_FLOAT]          = ClearMacroTile<R32_FLOAT>;
-    gClearTilesTable[R8G8B8A8_UNORM]     = ClearMacroTile<R8G8B8A8_UNORM>;
-    gClearTilesTable[B8G8R8A8_UNORM]     = ClearMacroTile<B8G8R8A8_UNORM>;
-    gClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
deleted file mode 100644
index 868419c3e4f..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
+++ /dev/null
@@ -1,1300 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file backend_impl.h
- *
- * @brief Backend handles rasterization, pixel shading and output merger
- *        operations.
- *
- ******************************************************************************/
-#pragma once
-
-#include "tilemgr.h"
-#include "state.h"
-#include "context.h"
-
-
-// Table initializers. Each takes the destination table by reference, so the
-// array bounds are part of the signature.
-// NOTE(review): presumably these fill every entry with a backend function
-// specialization; the definitions are not in this header.
-void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]);
-void InitBackendSampleFuncTable(
-    PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]);
-
-// Forward declaration only; no body is provided at this point in the header.
-static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs,
-                                          SWR_PS_CONTEXT&          psContext);
-
-
-// Identifies a family of backend functions.
-// NOTE(review): the names suggest single-sample, MSAA per-pixel and MSAA
-// per-sample shading; confirm against the code that consumes this enum.
-enum SWR_BACKEND_FUNCS
-{
-    SWR_BACKEND_SINGLE_SAMPLE,
-    SWR_BACKEND_MSAA_PIXEL_RATE,
-    SWR_BACKEND_MSAA_SAMPLE_RATE,
-    SWR_BACKEND_FUNCS_MAX, // number of enumerators above; not a backend itself
-};
-
-// Per-lane pixel offsets for an 8-wide SIMD. Lanes 0-3 cover x = 0..1 and
-// lanes 4-7 cover x = 2..3, each over y = 0..1, i.e. two 2x2 quads side by side.
-// The "UL" tables hold the integer pixel coordinate; the "Center" tables are the
-// same values plus 0.5.
-#if KNOB_SIMD_WIDTH == 8
-static const __m256 vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
-static const __m256 vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
-static const __m256 vULOffsetsX     = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-static const __m256 vULOffsetsY     = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
-// Eight low bits set: one bit per SIMD lane. Only defined for the 8-wide case.
-#define MASK 0xff
-#endif
-
-// Returns a per-lane mask of the pixels rejected by user clip distances.
-//
-//   clipMask        - bit set of enabled clip distances; only its population
-//                     count is used here
-//   pUserClipBuffer - three floats (A, B, C) per enabled clip distance
-//   vI, vJ          - per-lane interpolation coordinates passed to vplaneps
-//
-// A lane is flagged when any interpolated clip distance is negative or NaN.
-static INLINE simdmask ComputeUserClipMask(uint8_t           clipMask,
-                                           float*            pUserClipBuffer,
-                                           simdscalar const& vI,
-                                           simdscalar const& vJ)
-{
-    const uint32_t clipDistanceCount = _mm_popcnt_u32(clipMask);
-    simdscalar     vCulled           = _simd_setzero_ps();
-
-    float* pCoeffs = pUserClipBuffer;
-    for (uint32_t plane = 0; plane < clipDistanceCount; ++plane, pCoeffs += 3)
-    {
-        // broadcast this clip distance's three triangle values to all lanes
-        simdscalar vA = _simd_broadcast_ss(pCoeffs + 0);
-        simdscalar vB = _simd_broadcast_ss(pCoeffs + 1);
-        simdscalar vC = _simd_broadcast_ss(pCoeffs + 2);
-
-        // interpolated clip distance for every lane
-        simdscalar vDistance = vplaneps(vA, vB, vC, vI, vJ);
-
-        // "0 not-less-or-equal distance" (unordered compares true):
-        // the lane is culled when distance < 0 or distance is NaN
-        simdscalar vLaneCulled = _simd_cmp_ps(_simd_setzero_ps(), vDistance, _CMP_NLE_UQ);
-
-        vCulled = _simd_or_ps(vCulled, vLaneCulled);
-    }
-
-    return _simd_movemask_ps(vCulled);
-}
-
-// Byte offset of sample `sampleNum` within a color raster tile.
-// sampleNum must be below 16 (checked by assert).
-INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-{
-    // bytes taken by one sample of a raster tile in the color hot tile format
-    static const uint32_t bytesPerSample =
-        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8);
-
-    assert(sampleNum < 16);
-    return bytesPerSample * sampleNum;
-}
-
-// Byte offset of sample `sampleNum` within a depth raster tile.
-// sampleNum must be below 16 (checked by assert).
-INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-{
-    // bytes taken by one sample of a raster tile in the depth hot tile format
-    static const uint32_t bytesPerSample =
-        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8);
-
-    assert(sampleNum < 16);
-    return bytesPerSample * sampleNum;
-}
-
-// Byte offset of sample `sampleNum` within a stencil raster tile.
-// sampleNum must be below 16 (checked by assert).
-INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-{
-    // bytes taken by one sample of a raster tile in the stencil hot tile format
-    static const uint32_t bytesPerSample =
-        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8);
-
-    assert(sampleNum < 16);
-    return bytesPerSample * sampleNum;
-}
-
-// Transposes coverage from "one mask per sample, one bit per pixel" into
-// "one mask per pixel, one bit per sample" for a 4x2 block of pixels.
-//
-// All work happens in the constructors, so constructing a temporary performs
-// the conversion. Only KNOB_SIMD_WIDTH == 8 is supported (see the asserts; the
-// MASK macro used below is also only defined for that width).
-template <typename T, uint32_t InputCoverage>
-struct generateInputCoverage
-{
-    // Array form: writes one sample mask per pixel into inputMask.
-    //   coverageMask - per-sample coverage, one 64-bit entry per sample
-    //   inputMask    - receives the per-pixel sample masks
-    //   sampleMask   - ANDed into every result unless T::bForcedSampleCount
-    INLINE generateInputCoverage(const uint64_t* const coverageMask,
-                                 uint32_t (&inputMask)[KNOB_SIMD_WIDTH],
-                                 const uint32_t sampleMask)
-    {
-        // will need to update for avx512
-        assert(KNOB_SIMD_WIDTH == 8);
-
-        simdscalari mask[2];
-        simdscalari sampleCoverage[2];
-
-        if (T::bIsCenterPattern)
-        {
-            // center coverage is the same for all samples; just broadcast to the sample slots
-            uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
-            if (T::MultisampleT::numSamples == 1)
-            {
-                sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
-            }
-            else if (T::MultisampleT::numSamples == 2)
-            {
-                sampleCoverage[0] =
-                    _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
-            }
-            else if (T::MultisampleT::numSamples == 4)
-            {
-                sampleCoverage[0] = _simd_set_epi32(
-                    0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
-            }
-            else if (T::MultisampleT::numSamples == 8)
-            {
-                sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
-            }
-            else if (T::MultisampleT::numSamples == 16)
-            {
-                sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
-                sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
-            }
-        }
-        else
-        {
-            simdscalari src    = _simd_set1_epi32(0);
-            // index1 is only assigned (and only read) for the 16-sample case
-            simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
-
-            // mask[] enables one gather lane per sample that exists
-            if (T::MultisampleT::numSamples == 1)
-            {
-                mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
-            }
-            else if (T::MultisampleT::numSamples == 2)
-            {
-                mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
-            }
-            else if (T::MultisampleT::numSamples == 4)
-            {
-                mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
-            }
-            else if (T::MultisampleT::numSamples == 8)
-            {
-                mask[0] = _simd_set1_epi32(-1);
-            }
-            else if (T::MultisampleT::numSamples == 16)
-            {
-                mask[0] = _simd_set1_epi32(-1);
-                mask[1] = _simd_set1_epi32(-1);
-                index1  = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
-            }
-
-            // The gathers use a scale of 8 bytes, so lane i reads the low
-            // 32 bits of the 64-bit coverageMask entry selected by its index.
-            // gather coverage for samples 0-7
-            sampleCoverage[0] =
-                _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src),
-                                                            (const float*)coverageMask,
-                                                            index0,
-                                                            _mm256_castsi256_ps(mask[0]),
-                                                            8));
-            if (T::MultisampleT::numSamples > 8)
-            {
-                // gather coverage for samples 8-15
-                sampleCoverage[1] =
-                    _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src),
-                                                                (const float*)coverageMask,
-                                                                index1,
-                                                                _mm256_castsi256_ps(mask[1]),
-                                                                8));
-            }
-        }
-
-        // Byte-shuffle control, listed from the most to the least significant
-        // byte. In each 128-bit lane the four lowest destination bytes take
-        // source bytes 0x0, 0x4, 0x8 and 0xC (the low byte of each 32-bit
-        // element); the -1 entries yield zero.
-        mask[0] = _mm256_set_epi8(-1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  0xC,
-                                  0x8,
-                                  0x4,
-                                  0x0,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  -1,
-                                  0xC,
-                                  0x8,
-                                  0x4,
-                                  0x0);
-        // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
-        simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
-
-        simdscalari packedCoverage1;
-        if (T::MultisampleT::numSamples > 8)
-        {
-            // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit
-            // lane
-            packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
-        }
-
-        // Two implementations of the same packing step follow: the
-        // KNOB_ARCH_AVX path uses float shuffles and blends, the other path
-        // uses the integer cross-lane permute and blend intrinsics.
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
-        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
-        simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
-        simdscalar  shufRes = _mm256_shuffle_ps(
-            _mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-        packedCoverage0 = _mm256_castps_si256(
-            _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
-
-        simdscalari packedSampleCoverage;
-        if (T::MultisampleT::numSamples > 8)
-        {
-            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-            hiToLow         = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
-            shufRes         = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow),
-                                        _mm256_castsi256_ps(hiToLow),
-                                        _MM_SHUFFLE(1, 1, 0, 1));
-            shufRes         = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
-            packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(
-                _mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
-            packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(
-                _mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
-        }
-        else
-        {
-            packedSampleCoverage = packedCoverage0;
-        }
-#else
-        simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
-        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
-        packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
-
-        simdscalari packedSampleCoverage;
-        if (T::MultisampleT::numSamples > 8)
-        {
-            permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
-            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-            packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
-
-            // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
-            packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
-        }
-        else
-        {
-            packedSampleCoverage = packedCoverage0;
-        }
-#endif
-
-        // movemask_epi8 collects the top bit of every byte, and each byte holds
-        // one sample's 8-pixel coverage, so the first pass extracts pixel 7.
-        // Every pass then shifts left by one bit to expose the next lower
-        // pixel, which is why i counts down.
-        for (int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
-        {
-            // convert packed sample coverage masks into single coverage masks for all samples for
-            // each pixel in the 4x2
-            inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
-
-            if (!T::bForcedSampleCount)
-            {
-                // input coverage has to be anded with sample mask if MSAA isn't forced on
-                inputMask[i] &= sampleMask;
-            }
-
-            // shift to the next pixel in the 4x2
-            packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
-        }
-    }
-
-    // SIMD form: computes the per-pixel masks through the array form above
-    // (by constructing a temporary) and packs them into inputCoverage, with
-    // pixel i in lane i.
-    INLINE generateInputCoverage(const uint64_t* const coverageMask,
-                                 simdscalar&           inputCoverage,
-                                 const uint32_t        sampleMask)
-    {
-        uint32_t inputMask[KNOB_SIMD_WIDTH];
-        generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
-        inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7],
-                                                        inputMask[6],
-                                                        inputMask[5],
-                                                        inputMask[4],
-                                                        inputMask[3],
-                                                        inputMask[2],
-                                                        inputMask[1],
-                                                        inputMask[0]));
-    }
-};
-
-template <typename T>
-struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
-{
-    // Inner-conservative input coverage: the low bits of coverageMask[0] carry one flag per
-    // pixel of the SIMD tile. Neither overload below reads sampleMask.
-
-    // SIMD overload: each lane of inputCoverage receives integer 1 when that lane's bit is set
-    // in coverageMask[0] and integer 0 otherwise (integer bits stored in a float vector).
-    INLINE generateInputCoverage(const uint64_t* const coverageMask,
-                                 simdscalar&           inputCoverage,
-                                 const uint32_t        sampleMask)
-    {
-        // the per-lane bit table below is written out for 8 lanes; update for avx512
-        assert(KNOB_SIMD_WIDTH == 8);
-
-        const simdscalari vNone    = _simd_setzero_si();
-        const simdscalari vLaneBit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-
-        // keep only each lane's own bit, then compare against zero to build the blend selector
-        const simdscalari vOwnBit  = _simd_and_si(_simd_set1_epi32(coverageMask[0]), vLaneBit);
-        const simdscalari vCovered = _simd_cmplt_epi32(vNone, vOwnBit);
-
-        inputCoverage = _simd_castsi_ps(_simd_blendv_epi32(vNone, _simd_set1_epi32(1), vCovered));
-    }
-
-    // Scalar overload: inputMask[i] becomes (1 << numSamples) - 1 when bit i of
-    // (coverageMask[0] & MASK) is set, and 0 otherwise.
-    INLINE generateInputCoverage(const uint64_t* const coverageMask,
-                                 uint32_t (&inputMask)[KNOB_SIMD_WIDTH],
-                                 const uint32_t sampleMask)
-    {
-        static const uint32_t AllSamplesCovered = (1 << T::MultisampleT::numSamples) - 1;
-        const uint32_t        pixelBits         = (coverageMask[0] & MASK);
-
-        for (int pixel = 0; pixel < KNOB_SIMD_WIDTH; ++pixel)
-        {
-            // a set conservative bit marks every sample of that pixel as covered
-            const bool pixelCovered = (((1 << pixel) & pixelBits) > 0);
-            inputMask[pixel]        = pixelCovered ? AllSamplesCovered : 0;
-        }
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Centroid behaves exactly as follows:
-// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center
-//     (even if the sample pattern does not happen to have a sample location there).
-// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample
-//     index, where sample coverage is after ANDing the coverage with the SampleMask Rasterizer
-//     State.
-// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to
-//     fill out 2x2 pixel stamps, the attribute is evaluated as follows:
-//     (a) If the SampleMask Rasterizer State is a subset of the samples in the pixel, then the
-//         first sample covered by the SampleMask Rasterizer State is the evaluation point.
-//     (b) Otherwise (full SampleMask), the pixel center is the evaluation point.
-//
-// CalcCentroidPos writes psContext.vX.centroid / psContext.vY.centroid for the pixels of one SIMD
-// tile according to the rules above.
-//   psContext      - reads vX.center / vY.center, writes vX.centroid / vY.centroid
-//   samplePos      - per-sample X/Y offsets, looked up by sample index
-//   coverageMask   - coverage handed to generateInputCoverage<T, T::InputCoverage>
-//   sampleMask     - SampleMask Rasterizer State; only the low numSamples bits are meaningful
-//   vXSamplePosUL,
-//   vYSamplePosUL  - per-lane position that sample offsets are added to
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template <typename T>
-INLINE void CalcCentroidPos(SWR_PS_CONTEXT&            psContext,
-                            const SWR_MULTISAMPLE_POS& samplePos,
-                            const uint64_t* const      coverageMask,
-                            const uint32_t             sampleMask,
-                            simdscalar const&          vXSamplePosUL,
-                            simdscalar const&          vYSamplePosUL)
-{
-    uint32_t inputMask[KNOB_SIMD_WIDTH];
-    generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
-
-    // Case (2) - partially covered pixel
-
-    // scan for first covered sample per pixel in the 4x2 span; a pixel without any covered
-    // sample keeps index 0 here and is replaced by case (1)/(3) further down
-    unsigned long sampleNum[KNOB_SIMD_WIDTH];
-    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; i++)
-    {
-        sampleNum[i] = 0;
-        if (inputMask[i] > 0)
-        {
-            _BitScanForward(&sampleNum[i], inputMask[i]);
-        }
-    }
-
-    // look up and set the sample offsets from UL pixel corner for first covered sample
-    simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
-                                       samplePos.X(sampleNum[6]),
-                                       samplePos.X(sampleNum[5]),
-                                       samplePos.X(sampleNum[4]),
-                                       samplePos.X(sampleNum[3]),
-                                       samplePos.X(sampleNum[2]),
-                                       samplePos.X(sampleNum[1]),
-                                       samplePos.X(sampleNum[0]));
-
-    simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
-                                       samplePos.Y(sampleNum[6]),
-                                       samplePos.Y(sampleNum[5]),
-                                       samplePos.Y(sampleNum[4]),
-                                       samplePos.Y(sampleNum[3]),
-                                       samplePos.Y(sampleNum[2]),
-                                       samplePos.Y(sampleNum[1]),
-                                       samplePos.Y(sampleNum[0]));
-    // add sample offset to UL pixel corner
-    vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
-    vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
-
-    // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
-    static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
-    simdscalari              vInputCoveragei   = _simd_set_epi32(inputMask[7],
-                                                  inputMask[6],
-                                                  inputMask[5],
-                                                  inputMask[4],
-                                                  inputMask[3],
-                                                  inputMask[2],
-                                                  inputMask[1],
-                                                  inputMask[0]);
-    simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
-
-    static const simdscalari vZero = _simd_setzero_si();
-    const simdscalari vSampleMask  = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
-    simdscalari       vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
-    simdscalari       vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
-    simdscalari       vCase3b           = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
-
-    simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
-
-    // set the centroid position based on results from above
-    psContext.vX.centroid =
-        _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
-    psContext.vY.centroid =
-        _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
-
-    // Case (3a) No samples covered and partial sample mask
-    simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
-
-    // Only bits that name a real sample may be scanned: sampleMask is a full 32 bit value, and a
-    // set bit at or above numSamples would otherwise yield an index outside the sample pattern.
-    // A mask without any valid bit falls back to sample 0.
-    const uint32_t validSampleMask = sampleMask & ((1u << T::MultisampleT::numSamples) - 1);
-    unsigned long  firstCoveredSampleMaskSample = 0;
-    if (validSampleMask > 0)
-    {
-        _BitScanForward(&firstCoveredSampleMaskSample, validSampleMask);
-    }
-
-    simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
-
-    // samplePos holds offsets, so rebase onto the UL position exactly as case (2) does;
-    // blending the bare offset would mix a relative value into absolute centroid positions
-    vXSample = _simd_add_ps(vXSamplePosUL,
-                            _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample)));
-    vYSample = _simd_add_ps(vYSamplePosUL,
-                            _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample)));
-
-    // blend in case 3a pixel locations
-    psContext.vX.centroid =
-        _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
-    psContext.vY.centroid =
-        _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
-}
-
-// Evaluates barycentric I/J and the interpolated 1/w at the centroid position already stored in
-// psContext.vX.centroid / psContext.vY.centroid, writing the .centroid members of vI, vJ and
-// vOneOverW. vXSamplePosUL and vYSamplePosUL are accepted but not read.
-INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs,
-                                     SWR_PS_CONTEXT&          psContext,
-                                     const simdscalar&        vXSamplePosUL,
-                                     const simdscalar&        vYSamplePosUL)
-{
-    const simdscalar& vX = psContext.vX.centroid;
-    const simdscalar& vY = psContext.vY.centroid;
-
-    // plane equations for I and J, each scaled by the reciprocal determinant
-    const simdscalar vI =
-        _simd_mul_ps(vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, vX, vY), coeffs.vRecipDet);
-    const simdscalar vJ =
-        _simd_mul_ps(vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, vX, vY), coeffs.vRecipDet);
-
-    psContext.vI.centroid = vI;
-    psContext.vJ.centroid = vJ;
-
-    // 1/w is a plane over the freshly computed (I, J)
-    psContext.vOneOverW.centroid =
-        vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, vI, vJ);
-}
-
-// Returns a per-lane bit mask with a bit set for every lane of z that lies inside the closed
-// interval [minz, maxz].
-INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const& z, float minz, float maxz)
-{
-    const simdscalar vAboveMin = _simd_cmpge_ps(z, _simd_set1_ps(minz));
-    const simdscalar vBelowMax = _simd_cmple_ps(z, _simd_set1_ps(maxz));
-    const simdscalar vInBounds = _simd_and_ps(vAboveMin, vBelowMax);
-
-    return _simd_movemask_ps(vInBounds);
-}
-
-// Number of samples the output merger runs at:
-//   - normal MSAA (no forced sample count): the rasterizer sample count
-//   - forced sample count above 1X: the RT has to be single sample, so 1
-//   - forced sample count of 1X: the sample count of the RT (blendSampleCount)
-template <typename T>
-INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
-{
-    if (T::bForcedSampleCount)
-    {
-        if (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X)
-        {
-            return 1;
-        }
-        if (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X)
-        {
-            return GetNumSamples(blendSampleCount);
-        }
-    }
-
-    // rasterizer and OM are running at the same sample count
-    return T::MultisampleT::numSamples;
-}
-
-// Broadcasts the scalar plane coefficients of a triangle (I, J, Z, 1/w and the reciprocal
-// determinant) from work into the SIMD members of *coeffs.
-inline void SetupBarycentricCoeffs(BarycentricCoeffs* coeffs, const SWR_TRIANGLE_DESC& work)
-{
-    BarycentricCoeffs& out = *coeffs;
-
-    // barycentric I plane
-    out.vIa = _simd_broadcast_ss(&work.I[0]);
-    out.vIb = _simd_broadcast_ss(&work.I[1]);
-    out.vIc = _simd_broadcast_ss(&work.I[2]);
-
-    // barycentric J plane
-    out.vJa = _simd_broadcast_ss(&work.J[0]);
-    out.vJb = _simd_broadcast_ss(&work.J[1]);
-    out.vJc = _simd_broadcast_ss(&work.J[2]);
-
-    // depth plane
-    out.vZa = _simd_broadcast_ss(&work.Z[0]);
-    out.vZb = _simd_broadcast_ss(&work.Z[1]);
-    out.vZc = _simd_broadcast_ss(&work.Z[2]);
-
-    out.vRecipDet = _simd_broadcast_ss(&work.recipDet);
-
-    // 1/w plane
-    out.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
-    out.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
-    out.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
-}
-
-// Copies buffer pointers out of renderBuffers:
-//   pColorBuffer[i]  - set for every render target i whose bit is set in colorHotTileMask
-//   *pDepthBuffer    - set when pDepthBuffer is non-null
-//   *pStencilBuffer  - set when pStencilBuffer is non-null
-inline void SetupRenderBuffers(uint8_t*             pColorBuffer[SWR_NUM_RENDERTARGETS],
-                               uint8_t**            pDepthBuffer,
-                               uint8_t**            pStencilBuffer,
-                               uint32_t             colorHotTileMask,
-                               RenderOutputBuffers& renderBuffers)
-{
-    // consume the mask one set bit at a time
-    for (unsigned long index = 0; _BitScanForward(&index, colorHotTileMask);
-         colorHotTileMask &= ~(1 << index))
-    {
-        assert(index < SWR_NUM_RENDERTARGETS);
-        pColorBuffer[index] = renderBuffers.pColor[index];
-    }
-
-    if (pDepthBuffer != nullptr)
-    {
-        *pDepthBuffer = renderBuffers.pDepth;
-    }
-
-    if (pStencilBuffer != nullptr)
-    {
-        *pStencilBuffer = renderBuffers.pStencil;
-    }
-}
-
-// Sets the state of the color hot tile to HOTTILE_DIRTY for every render target slot whose bit is
-// set in the API state's colorHottileEnable mask.
-INLINE void SetRenderHotTilesDirty(DRAW_CONTEXT* pDC, RenderOutputBuffers& renderBuffers)
-{
-    uint32_t      pendingSlots = GetApiState(pDC).colorHottileEnable;
-    unsigned long slot         = 0;
-
-    // peel enabled slots off a local copy of the mask until none remain
-    while (_BitScanForward(&slot, pendingSlots))
-    {
-        renderBuffers.pColorHotTile[slot]->state = HOTTILE_DIRTY;
-        pendingSlots &= ~(1 << slot);
-    }
-}
-
-// Fills the per-triangle fields of *psContext from the triangle descriptor and the sample
-// pattern; sampleIndex starts at 0 and rasterizerSampleCount comes from T::MultisampleT.
-template <typename T>
-void SetupPixelShaderContext(SWR_PS_CONTEXT*            psContext,
-                             const SWR_MULTISAMPLE_POS& samplePos,
-                             SWR_TRIANGLE_DESC&         work)
-{
-    SWR_PS_CONTEXT& ctx = *psContext;
-
-    // attribute sources
-    ctx.pAttribs      = work.pAttribs;
-    ctx.pPerspAttribs = work.pPerspAttribs;
-
-    // triangle flags
-    ctx.frontFace              = work.triFlags.frontFacing;
-    ctx.renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
-    ctx.viewportIndex          = work.triFlags.viewportIndex;
-
-    // Ia/Ib/Ic and Ja/Jb/Jc are kept so the shader can reevaluate i/j/k for pull attribs
-    ctx.I        = work.I;
-    ctx.J        = work.J;
-    ctx.recipDet = work.recipDet;
-    ctx.pRecipW  = work.pRecipW;
-
-    // sample pattern, taken from the runtime sample positions
-    ctx.pSamplePosX           = samplePos.X();
-    ctx.pSamplePosY           = samplePos.Y();
-    ctx.rasterizerSampleCount = T::MultisampleT::numSamples;
-    ctx.sampleIndex           = 0;
-}
-
-// Fills the centroid members of *psContext.
-//   IsSingleSample      - centroid X/Y/I/J/1/w are copied from the pixel center values
-//   T::bCentroidPos     - centroid X/Y are computed (pixel center for the center pattern,
-//                         CalcCentroidPos otherwise) and I/J/1/w are evaluated there
-//   otherwise           - only centroid X/Y are written, copied from the sample position
-template <typename T, bool IsSingleSample>
-void CalcCentroid(SWR_PS_CONTEXT*            psContext,
-                  const SWR_MULTISAMPLE_POS& samplePos,
-                  const BarycentricCoeffs&   coeffs,
-                  const uint64_t* const      coverageMask,
-                  uint32_t                   sampleMask)
-{
-    // A dedicated flag is used because testing T::MultisampleT::numSamples == 1 doesn't cut it:
-    // the centroid positions are still different.
-    if (IsSingleSample)
-    {
-        // for 1x case, centroid is pixel center
-        psContext->vX.centroid        = psContext->vX.center;
-        psContext->vY.centroid        = psContext->vY.center;
-        psContext->vI.centroid        = psContext->vI.center;
-        psContext->vJ.centroid        = psContext->vJ.center;
-        psContext->vOneOverW.centroid = psContext->vOneOverW.center;
-        return;
-    }
-
-    if (!T::bCentroidPos)
-    {
-        psContext->vX.centroid = psContext->vX.sample;
-        psContext->vY.centroid = psContext->vY.sample;
-        return;
-    }
-
-    ///@todo: don't need to generate input coverage 2x if input coverage and centroid
-    if (T::bIsCenterPattern)
-    {
-        const simdscalar vHalf = _simd_set1_ps(0.5f);
-        psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, vHalf);
-        psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, vHalf);
-    }
-    else
-    {
-        // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate
-        // coverage 2X'..
-        CalcCentroidPos<T>(
-            *psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
-    }
-
-    CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
-}
-
-template <typename T> // functor running the per-sample depth/stencil test for one SIMD of pixels
-struct PixelRateZTestLoop
-{
-    PixelRateZTestLoop(DRAW_CONTEXT*            DC, // constructor only captures its arguments; no work is done here
-                       uint32_t                 _workerId,
-                       const SWR_TRIANGLE_DESC& Work,
-                       const BarycentricCoeffs& Coeffs,
-                       const API_STATE&         apiState,
-                       uint8_t*&                depthBuffer, // bound by reference to the caller's pointer
-                       uint8_t*&                stencilBuffer, // bound by reference to the caller's pointer
-                       const uint8_t            ClipDistanceMask) :
-        pDC(DC),
-        workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
-        samplePos(state.rastState.samplePositions), clipDistanceMask(ClipDistanceMask),
-        pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
-
-    INLINE // returns the summed count of lanes that passed depth over all coverage samples
-    uint32_t operator()(simdscalar&        activeLanes, // in/out: ANDed with "any sample passed depth" on return
-                        SWR_PS_CONTEXT&    psContext, // per-sample position and I/J/1/w members are overwritten
-                        const CORE_BUCKETS BEDepthBucket, // only referenced by the commented-out RDTSC bucket below
-                        uint32_t           currentSimdIn8x8 = 0) // selects the byte of each sample's coverage mask
-    {
-
-        uint32_t   statCount            = 0;
-        simdscalar anyDepthSamplePassed = _simd_setzero_ps();
-        for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
-        {
-            const uint8_t* pCoverageMask = (uint8_t*)&work.coverageMask[sample]; // view the 64 bit mask as bytes
-            vCoverageMask[sample] =
-                _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK));
-
-            if (!_simd_movemask_ps(vCoverageMask[sample])) // no covered active lane for this sample
-            {
-                vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] =
-                    _simd_setzero_ps();
-                continue;
-            }
-
-            // offset depth/stencil buffers current sample
-            uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
-            uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
-            if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-            {
-                static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
-                              "Unsupported depth hot tile format");
-
-                const simdscalar z = _simd_load_ps(reinterpret_cast<const float*>(pDepthSample)); // depth currently stored in the buffer
-
-                const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                vCoverageMask[sample] = // drop lanes whose stored depth is outside [minz, maxz]
-                    _simd_and_ps(vCoverageMask[sample],
-                                 _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz)));
-            }
-
-            RDTSC_BEGIN(psContext.pBucketManager, BEBarycentric, pDC->drawId);
-
-            // calculate per sample positions
-            psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
-            psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
-
-            // calc I & J per sample
-            CalcSampleBarycentrics(coeffs, psContext);
-
-            if (psState.writesODepth)
-            {
-                {
-                    // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
-                    vZ[sample] = psContext.vZ;
-                }
-            }
-            else
-            {
-                vZ[sample] = vplaneps( // interpolate Z from the depth plane at this sample's I/J
-                    coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
-                vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
-            }
-
-            RDTSC_END(psContext.pBucketManager, BEBarycentric, 0);
-
-            ///@todo: perspective correct vs non-perspective correct clipping?
-            // if clip distances are enabled, we need to interpolate for each sample
-            if (clipDistanceMask)
-            {
-                uint8_t clipMask = ComputeUserClipMask(clipDistanceMask,
-                                                       work.pUserClipBuffer,
-                                                       psContext.vI.sample,
-                                                       psContext.vJ.sample);
-
-                vCoverageMask[sample] = // lanes flagged in clipMask are removed from coverage
-                    _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask));
-            }
-
-            // ZTest for this sample
-            ///@todo Need to uncomment out this bucket.
-            // RDTSC_BEGIN(psContext.pBucketManager, BEDepthBucket, pDC->drawId);
-            depthPassMask[sample]   = vCoverageMask[sample];
-            stencilPassMask[sample] = vCoverageMask[sample];
-            depthPassMask[sample]   = DepthStencilTest(&state,
-                                                     work.triFlags.frontFacing,
-                                                     work.triFlags.viewportIndex,
-                                                     vZ[sample],
-                                                     pDepthSample,
-                                                     vCoverageMask[sample],
-                                                     pStencilSample,
-                                                     &stencilPassMask[sample]);
-            // RDTSC_END(psContext.pBucketManager, BEDepthBucket, 0);
-
-            // early-exit if no pixels passed depth or earlyZ is forced on
-            if (psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
-            {
-                DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
-                                  &state.depthStencilState,
-                                  work.triFlags.frontFacing,
-                                  vZ[sample],
-                                  pDepthSample,
-                                  depthPassMask[sample],
-                                  vCoverageMask[sample],
-                                  pStencilSample,
-                                  stencilPassMask[sample]);
-
-                if (!_simd_movemask_ps(depthPassMask[sample])) // nothing passed: sample adds nothing to the totals
-                {
-                    continue;
-                }
-            }
-            anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
-            uint32_t statMask    = _simd_movemask_ps(depthPassMask[sample]);
-            statCount += _mm_popcnt_u32(statMask); // one count per passing lane of this sample
-        }
-
-        activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes); // keep lanes where at least one sample passed
-        // return number of samples that passed depth and coverage
-        return statCount;
-    }
-
-    // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
-    simdscalar vZ[T::MultisampleT::numCoverageSamples];
-    simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
-    simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
-    simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
-
-private:
-    // functor inputs
-    DRAW_CONTEXT* pDC;
-    uint32_t      workerId; // stored by the constructor; not read anywhere in this struct
-
-    const SWR_TRIANGLE_DESC&   work;
-    const BarycentricCoeffs&   coeffs;
-    const API_STATE&           state;
-    const SWR_PS_STATE&        psState;
-    const SWR_MULTISAMPLE_POS& samplePos;
-    const uint8_t              clipDistanceMask;
-    uint8_t*&                  pDepthBuffer; // reference to the caller's depth buffer pointer
-    uint8_t*&                  pStencilBuffer; // reference to the caller's stencil buffer pointer
-};
-
-// Evaluates barycentric I/J and the interpolated 1/w at the pixel center position stored in
-// psContext.vX.center / psContext.vY.center, writing the .center members of vI, vJ and vOneOverW.
-INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT& psContext)
-{
-    const simdscalar& vX = psContext.vX.center;
-    const simdscalar& vY = psContext.vY.center;
-
-    // plane equations for I and J, each scaled by the reciprocal determinant
-    const simdscalar vI =
-        _simd_mul_ps(vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, vX, vY), coeffs.vRecipDet);
-    const simdscalar vJ =
-        _simd_mul_ps(vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, vX, vY), coeffs.vRecipDet);
-
-    psContext.vI.center = vI;
-    psContext.vJ.center = vJ;
-
-    // 1/w is a plane over the freshly computed (I, J)
-    psContext.vOneOverW.center =
-        vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, vI, vJ);
-}
-
-// Evaluates barycentric I/J and the interpolated 1/w at the sample position stored in
-// psContext.vX.sample / psContext.vY.sample, writing the .sample members of vI, vJ and vOneOverW.
-static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs,
-                                          SWR_PS_CONTEXT&          psContext)
-{
-    const simdscalar& vX = psContext.vX.sample;
-    const simdscalar& vY = psContext.vY.sample;
-
-    // plane equations for I and J, each scaled by the reciprocal determinant
-    const simdscalar vI =
-        _simd_mul_ps(vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, vX, vY), coeffs.vRecipDet);
-    const simdscalar vJ =
-        _simd_mul_ps(vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, vX, vY), coeffs.vRecipDet);
-
-    psContext.vI.sample = vI;
-    psContext.vJ.sample = vJ;
-
-    // 1/w is a plane over the freshly computed (I, J)
-    psContext.vOneOverW.sample =
-        vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, vI, vJ);
-}
-
-// Merge Output to 8x2 SIMD16 Tile Format: for every render target set in renderTargetMask, run the optional blend function on the PS output and mask-store the enabled channels
-INLINE void OutputMerger8x2(DRAW_CONTEXT*   pDC,
-                            SWR_PS_CONTEXT& psContext, // supplies shaded[] outputs and oMask
-                            uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], // per render target color base pointers
-                            uint32_t               sample, // sample whose raster tile color offset is used
-                            const SWR_BLEND_STATE* pBlendState,
-                            const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], // entries may be nullptr (blend skipped)
-                            simdscalar&       coverageMask, // handed to the blend function through pMask
-                            simdscalar const& depthPassMask,
-                            uint32_t          renderTargetMask, // one bit per render target to process
-                            bool              useAlternateOffset, // when set, the color offset advances by sizeof(simdscalar)
-                            uint32_t          workerId) // not read in this function
-{
-    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
-    uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
-
-    if (useAlternateOffset)
-    {
-        rasterTileColorOffset += sizeof(simdscalar);
-    }
-
-    simdvector blendSrc; // destination color read back from the tile
-    simdvector blendOut; // blend result that gets stored
-
-    unsigned long rt;
-    while (_BitScanForward(&rt, renderTargetMask))
-    {
-        renderTargetMask &= ~(1 << rt); // consume this render target's bit
-
-        const SWR_RENDER_TARGET_BLEND_STATE* pRTBlend = &pBlendState->renderTarget[rt];
-
-        simdscalar* pColorSample;
-        bool        hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || // true when at least one channel is writable
-                             !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue;
-        if (hotTileEnable)
-        {
-            pColorSample = reinterpret_cast<simdscalar*>(pColorBase[rt] + rasterTileColorOffset);
-            blendSrc[0]  = pColorSample[0]; // channels sit at every other simdscalar: 0, 2, 4, 6
-            blendSrc[1]  = pColorSample[2];
-            blendSrc[2]  = pColorSample[4];
-            blendSrc[3]  = pColorSample[6];
-        }
-        else
-        {
-            pColorSample = nullptr; // never dereferenced: every store below is guarded by a writeDisable flag
-        }
-
-        SWR_BLEND_CONTEXT blendContext = {0};
-        {
-            // pfnBlendFunc may not update all channels.  Initialize with PS output.
-            /// TODO: move this into the blend JIT.
-            blendOut = psContext.shaded[rt];
-
-            blendContext.pBlendState = pBlendState;
-            blendContext.src         = &psContext.shaded[rt];
-            blendContext.src1        = &psContext.shaded[1];
-            blendContext.src0alpha   = reinterpret_cast<simdvector*>(&psContext.shaded[0].w);
-            blendContext.sampleNum   = sample;
-            blendContext.pDst        = &blendSrc; // NOTE(review): blendSrc is not initialized when hotTileEnable is false - confirm the blend function does not read it then
-            blendContext.result      = &blendOut;
-            blendContext.oMask       = &psContext.oMask;
-            blendContext.pMask       = reinterpret_cast<simdscalari*>(&coverageMask);
-
-            // Blend outputs and update coverage mask for alpha test
-            if (pfnBlendFunc[rt] != nullptr)
-            {
-                pfnBlendFunc[rt](&blendContext);
-            }
-        }
-
-        // Track alpha events
-        AR_EVENT(
-            AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended));
-
-        // final write mask: lanes set in both coverageMask and depthPassMask
-        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
-
-        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
-        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
-                      "Unsupported hot tile format");
-
-        // store with color mask; each channel is written only when its writeDisable flag is clear
-        if (!pRTBlend->writeDisableRed)
-        {
-            _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[0]), outputMask, blendOut.x);
-        }
-        if (!pRTBlend->writeDisableGreen)
-        {
-            _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[2]), outputMask, blendOut.y);
-        }
-        if (!pRTBlend->writeDisableBlue)
-        {
-            _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[4]), outputMask, blendOut.z);
-        }
-        if (!pRTBlend->writeDisableAlpha)
-        {
-            _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[6]), outputMask, blendOut.w);
-        }
-    }
-}
-
-template <typename T>
-void BackendPixelRate(DRAW_CONTEXT*        pDC,
-                      uint32_t             workerId,
-                      uint32_t             x,
-                      uint32_t             y,
-                      SWR_TRIANGLE_DESC&   work,
-                      RenderOutputBuffers& renderBuffers)
-{
-    ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the
-    /// backend
-
-
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelRateBackend, pDC->drawId);
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
-
-    const API_STATE& state = GetApiState(pDC);
-
-    BarycentricCoeffs coeffs;
-    SetupBarycentricCoeffs(&coeffs, work);
-
-    SWR_CONTEXT* pContext    = pDC->pContext;
-    void*        pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
-
-    SWR_PS_CONTEXT             psContext;
-    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
-    SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
-    uint8_t *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(psContext.pColorBuffer,
-                       &pDepthBuffer,
-                       &pStencilBuffer,
-                       state.colorHottileEnable,
-                       renderBuffers);
-
-    bool isTileDirty = false;
-
-    RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);
-
-    PixelRateZTestLoop<T> PixelRateZTest(pDC,
-                                         workerId,
-                                         work,
-                                         coeffs,
-                                         state,
-                                         pDepthBuffer,
-                                         pStencilBuffer,
-                                         state.backendState.clipDistanceMask);
-
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
-    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
-    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
-    {
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
-        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
-        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
-        {
-            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-
-
-            simdscalar activeLanes;
-            if (!(work.anyCoveredSamples & MASK))
-            {
-                goto Endtile;
-            };
-            activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK);
-
-            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
-            {
-                const uint64_t* pCoverageMask =
-                    (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-                        ? &work.innerCoverageMask
-                        : &work.coverageMask[0];
-
-                generateInputCoverage<T, T::InputCoverage>(
-                    pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
-            }
-
-            RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
-
-            CalcPixelBarycentrics(coeffs, psContext);
-
-            CalcCentroid<T, false>(
-                &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
-            RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
-
-            if (T::bForcedSampleCount)
-            {
-                // candidate pixels (that passed coverage) will cause shader invocation if any bits
-                // in the samplemask are set
-                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(
-                    _simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
-                activeLanes                  = _simd_and_ps(activeLanes, vSampleMask);
-            }
-
-            // Early-Z?
-            if (T::bCanEarlyZ && !T::bForcedSampleCount)
-            {
-                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
-                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
-                AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
-            }
-
-            // if we have no covered samples that passed depth at this point, go to next tile
-            if (!_simd_movemask_ps(activeLanes))
-            {
-                goto Endtile;
-            };
-
-            if (state.psState.usesSourceDepth)
-            {
-                RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
-                // interpolate and quantize z
-                psContext.vZ = vplaneps(
-                    coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
-                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-                RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
-            }
-
-            // pixels that are currently active
-            psContext.activeMask = _simd_castps_si(activeLanes);
-            psContext.oMask      = T::MultisampleT::FullSampleMask();
-
-            // execute pixel shader
-            RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
-            state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
-            RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
-
-            // update stats
-            UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
-            AR_EVENT(PSStats((HANDLE)&psContext.stats));
-
-            // update active lanes to remove any discarded or oMask'd pixels
-            activeLanes = _simd_castsi_ps(_simd_and_si(
-                psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
-            if (!_simd_movemask_ps(activeLanes))
-            {
-                goto Endtile;
-            };
-
-            isTileDirty = true;
-
-            // late-Z
-            if (!T::bCanEarlyZ && !T::bForcedSampleCount)
-            {
-                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
-                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
-                AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
-            }
-
-            // if we have no covered samples that passed depth at this point, skip OM and go to next
-            // tile
-            if (!_simd_movemask_ps(activeLanes))
-            {
-                goto Endtile;
-            };
-
-            // output merger
-            // loop over all samples, broadcasting the results of the PS to all passing pixels
-            for (uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount);
-                 sample++)
-            {
-                RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
-                // center pattern does a single coverage/depth/stencil test, standard pattern tests
-                // all samples
-                uint32_t   coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
-                simdscalar coverageMask, depthMask;
-                if (T::bForcedSampleCount)
-                {
-                    coverageMask = depthMask = activeLanes;
-                }
-                else
-                {
-                    coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
-                    depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
-                    if (!_simd_movemask_ps(depthMask))
-                    {
-                        // stencil should already have been written in early/lateZ tests
-                        RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
-                        continue;
-                    }
-                }
-
-                // broadcast the results of the PS to all passing pixels
-
-                OutputMerger8x2(pDC,
-                                psContext,
-                                psContext.pColorBuffer,
-                                sample,
-                                &state.blendState,
-                                state.pfnBlendFunc,
-                                coverageMask,
-                                depthMask,
-                                state.psState.renderTargetMask,
-                                useAlternateOffset,
-                                workerId);
-
-
-                if (!state.psState.forceEarlyZ && !T::bForcedSampleCount)
-                {
-                    uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
-                    uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
-                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
-                                      &state.depthStencilState,
-                                      work.triFlags.frontFacing,
-                                      PixelRateZTest.vZ[coverageSampleNum],
-                                      pDepthSample,
-                                      depthMask,
-                                      coverageMask,
-                                      pStencilSample,
-                                      PixelRateZTest.stencilPassMask[coverageSampleNum]);
-                }
-                RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
-            }
-        Endtile:
-            RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
-
-            for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
-            {
-                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-            if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-            {
-                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-            work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-
-            if (useAlternateOffset)
-            {
-                unsigned long rt;
-                uint32_t rtMask = state.colorHottileEnable;
-                while (_BitScanForward(&rt, rtMask))
-                {
-                    rtMask &= ~(1 << rt);
-                    psContext.pColorBuffer[rt] +=
-                        (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-                }
-            }
-
-            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer +=
-                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
-            RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
-
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL, dx);
-            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
-        }
-
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL, dy);
-        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
-    }
-
-    if (isTileDirty)
-    {
-        SetRenderHotTilesDirty(pDC, renderBuffers);
-    }
-
-    RDTSC_END(pDC->pContext->pBucketMgr, BEPixelRateBackend, 0);
-}
-
-template <uint32_t sampleCountT = SWR_MULTISAMPLE_1X, // cast to SWR_MULTISAMPLE_COUNT for MultisampleT
-          uint32_t isCenter     = 0,                  // 1 turns on bIsCenterPattern
-          uint32_t coverage     = 0,                  // exposed unchanged as InputCoverage
-          uint32_t centroid     = 0,                  // 1 turns on bCentroidPos
-          uint32_t forced       = 0,                  // 1 turns on bForcedSampleCount
-          uint32_t canEarlyZ    = 0                   // 1 turns on bCanEarlyZ
-          >
-struct SwrBackendTraits // compile-time backend configuration; backend templates read it as T::<member>
-{
-    static const bool     bIsCenterPattern   = (isCenter == 1); // flags compare against exactly 1,
-    static const uint32_t InputCoverage      = coverage;
-    static const bool     bCentroidPos       = (centroid == 1); // so any other value (not just 0)
-    static const bool     bForcedSampleCount = (forced == 1);   // leaves the flag false
-    static const bool     bCanEarlyZ         = (canEarlyZ == 1);
-    typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
deleted file mode 100644
index 7881d36ddb9..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
+++ /dev/null
@@ -1,454 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file backend_sample.cpp
- *
- * @brief Backend handles rasterization, pixel shading and output merger
- *        operations.
- *
- ******************************************************************************/
-
-#include <smmintrin.h>
-
-#include "backend.h"
-#include "backend_impl.h"
-#include "tilemgr.h"
-#include "memory/tilingtraits.h"
-#include "core/multisample.h"
-
-#include <algorithm>
-
-template <typename T> // sample-rate backend: runs the pixel shader once per covered sample of each SIMD tile
-void BackendSampleRate(DRAW_CONTEXT*        pDC,           // draw context: source of API state and bucket manager
-                       uint32_t             workerId,      // indexes threadPool.pThreadData; also given to OutputMerger8x2
-                       uint32_t             x,             // start x of the KNOB_TILE_X_DIM-wide region walked below
-                       uint32_t             y,             // start y of the KNOB_TILE_Y_DIM-high region walked below
-                       SWR_TRIANGLE_DESC&   work,          // triangle data; its coverage masks are shifted in place
-                       RenderOutputBuffers& renderBuffers) // given to SetupRenderBuffers / SetRenderHotTilesDirty
-{ // T:: members used here: InputCoverage, bCanEarlyZ, MultisampleT::numSamples
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESampleRateBackend, pDC->drawId);
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
-
-    void* pWorkerData      = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
-    const API_STATE& state = GetApiState(pDC);
-
-    BarycentricCoeffs coeffs;
-    SetupBarycentricCoeffs(&coeffs, work);
-
-    SWR_PS_CONTEXT             psContext;
-    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
-    SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
-    uint8_t *pDepthBuffer, *pStencilBuffer; // cursors advanced once per SIMD tile at Endtile
-    SetupRenderBuffers(psContext.pColorBuffer,
-                       &pDepthBuffer,
-                       &pStencilBuffer,
-                       state.colorHottileEnable,
-                       renderBuffers);
-
-    bool isTileDirty = false; // set once any lane is still active after the pixel shader
-
-    RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);
-
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
-    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
-    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
-    { // x positions restart from x for every row of SIMD tiles
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
-        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
-        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
-        {
-            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); // gates the color pointer advance at Endtile
-
-
-            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) // compile-time constant per instantiation
-            {
-                const uint64_t* pCoverageMask =
-                    (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-                        ? &work.innerCoverageMask
-                        : &work.coverageMask[0];
-
-                generateInputCoverage<T, T::InputCoverage>(
-                    pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
-            }
-
-            RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
-
-            CalcPixelBarycentrics(coeffs, psContext);
-
-            CalcCentroid<T, false>(
-                &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
-            RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
-
-            for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
-            { // every path through this body shifts work.coverageMask[sample] exactly once
-                simdmask coverageMask = work.coverageMask[sample] & MASK; // presumably the current tile's bits - confirm MASK
-
-                if (coverageMask)
-                {
-                    // offset depth/stencil buffers current sample
-                    uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
-                    uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
-                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-                    { // depth bounds test uses the depth values already stored at pDepthSample
-                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
-                                      "Unsupported depth hot tile format");
-
-                        const simdscalar z =
-                            _simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
-
-                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
-                    }
-
-                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
-
-                    // calculate per sample positions
-                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
-                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
-
-                    CalcSampleBarycentrics(coeffs, psContext);
-
-                    // interpolate and quantize z
-                    psContext.vZ = vplaneps(coeffs.vZa,
-                                            coeffs.vZb,
-                                            coeffs.vZc,
-                                            psContext.vI.sample,
-                                            psContext.vJ.sample);
-                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
-                    RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
-
-                    // interpolate user clip distance if available
-                    if (state.backendState.clipDistanceMask)
-                    { // lanes reported by ComputeUserClipMask are removed from coverage
-                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
-                                                             work.pUserClipBuffer,
-                                                             psContext.vI.sample,
-                                                             psContext.vJ.sample);
-                    }
-
-                    simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
-                    simdscalar depthPassMask   = vCoverageMask; // overwritten by whichever depth test runs
-                    simdscalar stencilPassMask = vCoverageMask;
-
-                    // Early-Z?
-                    if (T::bCanEarlyZ)
-                    {
-                        RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
-                        depthPassMask = DepthStencilTest(&state,
-                                                         work.triFlags.frontFacing,
-                                                         work.triFlags.viewportIndex,
-                                                         psContext.vZ,
-                                                         pDepthSample,
-                                                         vCoverageMask,
-                                                         pStencilSample,
-                                                         &stencilPassMask);
-                        AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
-                                                                 _simd_movemask_ps(stencilPassMask),
-                                                                 _simd_movemask_ps(vCoverageMask)));
-                        RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
-
-                        // if earlyZ is forced on or no samples passed depth, do the depth/stencil write here
-                        if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
-                        {
-                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
-                                              &state.depthStencilState,
-                                              work.triFlags.frontFacing,
-                                              psContext.vZ,
-                                              pDepthSample,
-                                              depthPassMask,
-                                              vCoverageMask,
-                                              pStencilSample,
-                                              stencilPassMask);
-
-                            if (!_simd_movemask_ps(depthPassMask)) // only this case skips the shader
-                            { // shift here because continue bypasses the shift at the loop bottom
-                                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-                                continue;
-                            }
-                        }
-                    }
-
-                    psContext.sampleIndex = sample;
-                    psContext.activeMask  = _simd_castps_si(vCoverageMask);
-
-                    // execute pixel shader
-                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
-                    state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
-                    RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
-
-                    // update stats
-                    UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
-                    AR_EVENT(PSStats((HANDLE)&psContext.stats));
-
-                    vCoverageMask = _simd_castsi_ps(psContext.activeMask); // presumably minus shader-killed lanes - confirm
-
-                    if (_simd_movemask_ps(vCoverageMask))
-                    {
-                        isTileDirty = true;
-                    }
-
-                    // late-Z
-                    if (!T::bCanEarlyZ) // same compile-time flag, so exclusive with the early-Z block above
-                    {
-                        RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId);
-                        depthPassMask = DepthStencilTest(&state,
-                                                         work.triFlags.frontFacing,
-                                                         work.triFlags.viewportIndex,
-                                                         psContext.vZ,
-                                                         pDepthSample,
-                                                         vCoverageMask,
-                                                         pStencilSample,
-                                                         &stencilPassMask);
-                        AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
-                                                                _simd_movemask_ps(stencilPassMask),
-                                                                _simd_movemask_ps(vCoverageMask)));
-                        RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0);
-
-                        if (!_simd_movemask_ps(depthPassMask))
-                        {
-                            // need to call depth/stencil write for stencil write
-                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
-                                              &state.depthStencilState,
-                                              work.triFlags.frontFacing,
-                                              psContext.vZ,
-                                              pDepthSample,
-                                              depthPassMask,
-                                              vCoverageMask,
-                                              pStencilSample,
-                                              stencilPassMask);
-
-                            work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); // continue skips the loop-bottom shift
-                            continue;
-                        }
-                    }
-
-                    uint32_t statMask  = _simd_movemask_ps(depthPassMask);
-                    uint32_t statCount = _mm_popcnt_u32(statMask);
-                    UPDATE_STAT_BE(DepthPassCount, statCount);
-
-                    // output merger
-                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
-
-                    OutputMerger8x2(pDC,
-                                    psContext,
-                                    psContext.pColorBuffer,
-                                    sample,
-                                    &state.blendState,
-                                    state.pfnBlendFunc,
-                                    vCoverageMask,
-                                    depthPassMask,
-                                    state.psState.renderTargetMask,
-                                    useAlternateOffset,
-                                    workerId);
-
-                    // do final depth write after all pixel kills
-                    if (!state.psState.forceEarlyZ) // forced earlyZ is written in the early-Z block instead
-                    {
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
-                                          &state.depthStencilState,
-                                          work.triFlags.frontFacing,
-                                          psContext.vZ,
-                                          pDepthSample,
-                                          depthPassMask,
-                                          vCoverageMask,
-                                          pStencilSample,
-                                          stencilPassMask);
-                    }
-                    RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
-                }
-                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); // drop this tile's bits
-            }
-
-        Endtile: // no goto in this function targets this label
-            ATTR_UNUSED; // presumably marks the label as intentionally unused - confirm macro definition
-
-            RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
-
-            if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-            {
-                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-            if (useAlternateOffset) // color pointers move only on these iterations
-            {
-                unsigned long rt;
-                uint32_t rtMask = state.colorHottileEnable;
-                while (_BitScanForward(&rt, rtMask)) // visit each set bit of colorHottileEnable
-                {
-                    rtMask &= ~(1 << rt); // clear the bit just handled
-                    psContext.pColorBuffer[rt] +=
-                        (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-                }
-            }
-
-            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer +=
-                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
-            RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
-
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL, dx);
-            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
-        }
-
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL, dy);
-        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
-    }
-
-    if (isTileDirty)
-    {
-        SetRenderHotTilesDirty(pDC, renderBuffers);
-    }
-
-    RDTSC_END(pDC->pContext->pBucketMgr, BESampleRateBackend, 0);
-}
-
-// Recursive template used to auto-nest conditionals.  Each GetFunc overload turns one runtime
-// argument into a compile-time entry appended to ArgsT, then recurses on the remaining arguments.
-template <uint32_t... ArgsT>
-struct BEChooserSampleRate
-{
-    // Last Arg Terminator: ArgsT is complete, so pick the backend instantiation
-    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
-    {
-        switch (tArg)
-        {
-        case SWR_BACKEND_MSAA_SAMPLE_RATE: // the only backend kind this chooser returns a function for
-            return BackendSampleRate<SwrBackendTraits<ArgsT...>>;
-            break;
-        case SWR_BACKEND_SINGLE_SAMPLE:
-        case SWR_BACKEND_MSAA_PIXEL_RATE:
-            SWR_ASSERT(0 && "Invalid backend func\n");
-            return nullptr;
-            break;
-        default:
-            SWR_ASSERT(0 && "Invalid backend func\n");
-            return nullptr;
-            break;
-        }
-    }
-
-    // Recursively parse args: input coverage mode
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
-    {
-        switch (tArg)
-        {
-        case SWR_INPUT_COVERAGE_NONE:
-            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
-                remainingArgs...);
-            break;
-        case SWR_INPUT_COVERAGE_NORMAL:
-            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
-                remainingArgs...);
-            break;
-        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
-            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
-                remainingArgs...);
-            break;
-        default: // unknown value: assert, then continue as SWR_INPUT_COVERAGE_NONE
-            SWR_ASSERT(0 && "Invalid sample pattern\n"); // NOTE(review): text says "sample pattern", switch is on input coverage
-            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
-                remainingArgs...);
-            break;
-        }
-    }
-
-    // Recursively parse args: sample count
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
-    {
-        switch (tArg)
-        {
-        case SWR_MULTISAMPLE_1X:
-            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
-            break;
-        case SWR_MULTISAMPLE_2X:
-            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
-            break;
-        case SWR_MULTISAMPLE_4X:
-            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
-            break;
-        case SWR_MULTISAMPLE_8X:
-            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
-            break;
-        case SWR_MULTISAMPLE_16X:
-            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
-            break;
-        default: // unknown value: assert, then continue as SWR_MULTISAMPLE_1X
-            SWR_ASSERT(0 && "Invalid sample count\n");
-            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
-            break;
-        }
-    }
-
-    // Recursively parse args: a bool flag is appended as 1 or 0
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
-    {
-        if (tArg == true)
-        {
-            return BEChooserSampleRate<ArgsT..., 1>::GetFunc(remainingArgs...);
-        }
-
-        return BEChooserSampleRate<ArgsT..., 0>::GetFunc(remainingArgs...);
-    }
-};
-
-void InitBackendSampleFuncTable( // stores one BEChooserSampleRate<>::GetFunc result in every table slot
-    PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
-{ // table index order: [sampleCount][inputCoverage][centroid][canEarlyZ]
-    for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT;
-         sampleCount++)
-    {
-        for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
-        {
-            for (uint32_t centroid = 0; centroid < 2; centroid++)
-            {
-                for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
-                {
-                    table[sampleCount][inputCoverage][centroid][canEarlyZ] =
-                        BEChooserSampleRate<>::GetFunc( // argument order follows SwrBackendTraits' template parameters
-                            (SWR_MULTISAMPLE_COUNT)sampleCount,
-                            false, // isCenter slot: always 0 for this table
-                            (SWR_INPUT_COVERAGE)inputCoverage,
-                            (centroid > 0),
-                            false, // forced slot: always 0 for this table
-                            (canEarlyZ > 0),
-                            (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); // terminator: selects BackendSampleRate
-                }
-            }
-        }
-    }
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
deleted file mode 100644
index 06f78c4b88a..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
+++ /dev/null
@@ -1,428 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file backend.cpp
- *
- * @brief Backend handles rasterization, pixel shading and output merger
- *        operations.
- *
- ******************************************************************************/
-
-#include <smmintrin.h>
-
-#include "backend.h"
-#include "backend_impl.h"
-#include "tilemgr.h"
-#include "memory/tilingtraits.h"
-#include "core/multisample.h"
-
-#include <algorithm>
-
-template <typename T>
-void BackendSingleSample(DRAW_CONTEXT*        pDC,
-                         uint32_t             workerId,
-                         uint32_t             x,
-                         uint32_t             y,
-                         SWR_TRIANGLE_DESC&   work,
-                         RenderOutputBuffers& renderBuffers)
-{
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESingleSampleBackend, pDC->drawId);
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
-
-    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
-
-    const API_STATE& state = GetApiState(pDC);
-
-    BarycentricCoeffs coeffs;
-    SetupBarycentricCoeffs(&coeffs, work);
-
-    SWR_PS_CONTEXT             psContext;
-    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
-    SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
-    uint8_t *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(psContext.pColorBuffer,
-                       &pDepthBuffer,
-                       &pStencilBuffer,
-                       state.colorHottileEnable,
-                       renderBuffers);
-
-    // Indicates backend rendered something to the color buffer
-    bool isTileDirty = false;
-
-    RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 1);
-
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
-    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
-    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
-    {
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
-        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
-        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
-        {
-            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-
-
-            simdmask coverageMask = work.coverageMask[0] & MASK;
-
-            if (coverageMask)
-            {
-                if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-                {
-                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
-                                  "Unsupported depth hot tile format");
-
-                    const simdscalar z =
-                        _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer));
-
-                    const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                    const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                    coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
-                }
-
-                if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
-                {
-                    const uint64_t* pCoverageMask =
-                        (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-                            ? &work.innerCoverageMask
-                            : &work.coverageMask[0];
-
-                    generateInputCoverage<T, T::InputCoverage>(
-                        pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
-                }
-
-                RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
-
-                CalcPixelBarycentrics(coeffs, psContext);
-
-                CalcCentroid<T, true>(
-                    &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
-                // interpolate and quantize z
-                psContext.vZ = vplaneps(
-                    coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
-                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
-                RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 1);
-
-                // interpolate user clip distance if available
-                if (state.backendState.clipDistanceMask)
-                {
-                    coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
-                                                         work.pUserClipBuffer,
-                                                         psContext.vI.center,
-                                                         psContext.vJ.center);
-                }
-
-                simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
-                simdscalar depthPassMask   = vCoverageMask;
-                simdscalar stencilPassMask = vCoverageMask;
-
-                // Early-Z?
-                if (T::bCanEarlyZ)
-                {
-                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
-                    depthPassMask = DepthStencilTest(&state,
-                                                     work.triFlags.frontFacing,
-                                                     work.triFlags.viewportIndex,
-                                                     psContext.vZ,
-                                                     pDepthBuffer,
-                                                     vCoverageMask,
-                                                     pStencilBuffer,
-                                                     &stencilPassMask);
-                    AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
-                                                               _simd_movemask_ps(stencilPassMask),
-                                                               _simd_movemask_ps(vCoverageMask)));
-                    RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
-
-                    // early-exit if no pixels passed depth or earlyZ is forced on
-                    if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
-                    {
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
-                                          &state.depthStencilState,
-                                          work.triFlags.frontFacing,
-                                          psContext.vZ,
-                                          pDepthBuffer,
-                                          depthPassMask,
-                                          vCoverageMask,
-                                          pStencilBuffer,
-                                          stencilPassMask);
-
-                        if (!_simd_movemask_ps(depthPassMask))
-                        {
-                            goto Endtile;
-                        }
-                    }
-                }
-
-                psContext.sampleIndex = 0;
-                psContext.activeMask  = _simd_castps_si(vCoverageMask);
-
-                // execute pixel shader
-                RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
-                state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
-                RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
-
-                // update stats
-                UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
-                AR_EVENT(PSStats((HANDLE)&psContext.stats));
-
-                vCoverageMask = _simd_castsi_ps(psContext.activeMask);
-
-                if (_simd_movemask_ps(vCoverageMask))
-                {
-                    isTileDirty = true;
-                }
-
-                // late-Z
-                if (!T::bCanEarlyZ)
-                {
-                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId);
-                    depthPassMask = DepthStencilTest(&state,
-                                                     work.triFlags.frontFacing,
-                                                     work.triFlags.viewportIndex,
-                                                     psContext.vZ,
-                                                     pDepthBuffer,
-                                                     vCoverageMask,
-                                                     pStencilBuffer,
-                                                     &stencilPassMask);
-                    AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
-                                                              _simd_movemask_ps(stencilPassMask),
-                                                              _simd_movemask_ps(vCoverageMask)));
-                    RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0);
-
-                    if (!_simd_movemask_ps(depthPassMask))
-                    {
-                        // need to call depth/stencil write for stencil write
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
-                                          &state.depthStencilState,
-                                          work.triFlags.frontFacing,
-                                          psContext.vZ,
-                                          pDepthBuffer,
-                                          depthPassMask,
-                                          vCoverageMask,
-                                          pStencilBuffer,
-                                          stencilPassMask);
-                        goto Endtile;
-                    }
-                }
-                else
-                {
-                    // for early z, consolidate discards from shader
-                    // into depthPassMask
-                    depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
-                }
-
-                uint32_t statMask  = _simd_movemask_ps(depthPassMask);
-                uint32_t statCount = _mm_popcnt_u32(statMask);
-                UPDATE_STAT_BE(DepthPassCount, statCount);
-
-                // output merger
-                RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
-
-                OutputMerger8x2(pDC,
-                                psContext,
-                                psContext.pColorBuffer,
-                                0,
-                                &state.blendState,
-                                state.pfnBlendFunc,
-                                vCoverageMask,
-                                depthPassMask,
-                                state.psState.renderTargetMask,
-                                useAlternateOffset,
-                                workerId);
-
-                // do final depth write after all pixel kills
-                if (!state.psState.forceEarlyZ)
-                {
-                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
-                                      &state.depthStencilState,
-                                      work.triFlags.frontFacing,
-                                      psContext.vZ,
-                                      pDepthBuffer,
-                                      depthPassMask,
-                                      vCoverageMask,
-                                      pStencilBuffer,
-                                      stencilPassMask);
-                }
-                RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
-            }
-
-        Endtile:
-            RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
-
-            work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-            {
-                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-            if (useAlternateOffset)
-            {
-                unsigned long rt;
-                uint32_t rtMask = state.colorHottileEnable;
-                while (_BitScanForward(&rt, rtMask))
-                {
-                    rtMask &= ~(1 << rt);
-                    psContext.pColorBuffer[rt] +=
-                        (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-                }
-            }
-
-            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer +=
-                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
-            RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
-
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL, dx);
-            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
-        }
-
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL, dy);
-        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
-    }
-
-    if (isTileDirty)
-    {
-        SetRenderHotTilesDirty(pDC, renderBuffers);
-    }
-
-    RDTSC_END(pDC->pContext->pBucketMgr, BESingleSampleBackend, 0);
-}
-
-// Recursive template used to auto-nest conditionals.  Converts dynamic enum function
-// arguments to static template arguments.
-template <uint32_t... ArgsT>
-struct BEChooserSingleSample
-{
-    // Last Arg Terminator
-    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
-    {
-        switch (tArg)
-        {
-        case SWR_BACKEND_SINGLE_SAMPLE:
-            return BackendSingleSample<SwrBackendTraits<ArgsT...>>;
-            break;
-        case SWR_BACKEND_MSAA_PIXEL_RATE:
-        case SWR_BACKEND_MSAA_SAMPLE_RATE:
-        default:
-            SWR_ASSERT(0 && "Invalid backend func\n");
-            return nullptr;
-            break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
-    {
-        switch (tArg)
-        {
-        case SWR_INPUT_COVERAGE_NONE:
-            return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
-                remainingArgs...);
-            break;
-        case SWR_INPUT_COVERAGE_NORMAL:
-            return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
-                remainingArgs...);
-            break;
-        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
-            return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
-                remainingArgs...);
-            break;
-        default:
-            SWR_ASSERT(0 && "Invalid sample pattern\n");
-            return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
-                remainingArgs...);
-            break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
-    {
-        switch (tArg)
-        {
-        case SWR_MULTISAMPLE_1X:
-            return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
-            break;
-        case SWR_MULTISAMPLE_2X:
-            return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
-            break;
-        case SWR_MULTISAMPLE_4X:
-            return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
-            break;
-        case SWR_MULTISAMPLE_8X:
-            return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
-            break;
-        case SWR_MULTISAMPLE_16X:
-            return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
-            break;
-        default:
-            SWR_ASSERT(0 && "Invalid sample count\n");
-            return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
-            break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
-    {
-        if (tArg == true)
-        {
-            return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
-        }
-
-        return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
-    }
-};
-
-void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
-{
-    for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
-    {
-        for (uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
-        {
-            for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
-            {
-                table[inputCoverage][isCentroid][canEarlyZ] =
-                    BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X,
-                                                     false,
-                                                     (SWR_INPUT_COVERAGE)inputCoverage,
-                                                     (isCentroid > 0),
-                                                     false,
-                                                     (canEarlyZ > 0),
-                                                     SWR_BACKEND_SINGLE_SAMPLE);
-            }
-        }
-    }
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/backends/meson.build b/src/gallium/drivers/swr/rasterizer/core/backends/meson.build
deleted file mode 100644
index d64715dc8be..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/backends/meson.build
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright © 2017-2018 Intel Corporation
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-
-files_swr_common += custom_target(
-  'gen_backend_pixel',
-  input : swr_gen_backends_py,
-  output : [
-    'gen_BackendPixelRate0.cpp', 'gen_BackendPixelRate1.cpp',
-    'gen_BackendPixelRate2.cpp', 'gen_BackendPixelRate3.cpp',
-    'gen_BackendPixelRate.hpp',
-  ],
-  command : [
-    prog_python, '@INPUT@',
-    '--outdir', '@OUTDIR@',
-    '--dim', '5', '2', '3', '2', '2', '2',
-    '--numfiles', '4',
-    '--cpp', '--hpp',
-  ],
-  depend_files : [ swr_gen_backend_files, swr_gen_header_init_files ],
-)
-
-files_swr_common += custom_target(
-  'gen_backend_raster',
-  input : swr_gen_backends_py,
-  output : [
-    'gen_rasterizer0.cpp', 'gen_rasterizer1.cpp',
-    'gen_rasterizer2.cpp', 'gen_rasterizer3.cpp',
-    'gen_rasterizer.hpp',
-  ],
-  command : [
-    prog_python, '@INPUT@',
-    '--outdir', '@OUTDIR@',
-    '--rast',
-    '--dim', '5', '2', '2', '3', '5', '2',
-    '--numfiles', '4',
-    '--cpp', '--hpp',
-  ],
-  depend_files : [ swr_gen_rasterizer_files, swr_gen_header_init_files ],
-)
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
deleted file mode 100644
index 36732289d76..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ /dev/null
@@ -1,1976 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file binner.cpp
- *
- * @brief Implementation for the macrotile binner
- *
- ******************************************************************************/
-
-#include "binner.h"
-#include "context.h"
-#include "frontend.h"
-#include "conservativeRast.h"
-#include "pa.h"
-#include "rasterizer.h"
-#include "rdtsc_core.h"
-#include "tilemgr.h"
-
-// Function Prototype
-template <typename SIMD_T, uint32_t SIMD_WIDTH>
-void BinPostSetupLinesImpl(DRAW_CONTEXT*          pDC,
-                           PA_STATE&              pa,
-                           uint32_t               workerId,
-                           Vec4<SIMD_T>           prim[],
-                           Float<SIMD_T>          recipW[],
-                           uint32_t               primMask,
-                           Integer<SIMD_T> const& primID,
-                           Integer<SIMD_T> const& viewportIdx,
-                           Integer<SIMD_T> const& rtIdx);
-
-template <typename SIMD_T, uint32_t SIMD_WIDTH>
-void BinPostSetupPointsImpl(DRAW_CONTEXT*          pDC,
-                            PA_STATE&              pa,
-                            uint32_t               workerId,
-                            Vec4<SIMD_T>           prim[],
-                            uint32_t               primMask,
-                            Integer<SIMD_T> const& primID,
-                            Integer<SIMD_T> const& viewportIdx,
-                            Integer<SIMD_T> const& rtIdx);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Processes attributes for the backend based on linkage mask and
-///        linkage map.  Essentially just doing an SOA->AOS conversion and pack.
-/// @param pDC - Draw context
-/// @param pa - Primitive Assembly state
-/// @param linkageMask - Specifies which VS outputs are routed to PS.
-/// @param pLinkageMap - maps VS attribute slot to PS slot
-/// @param triIndex - Triangle to process attributes for
-/// @param pBuffer - Output result
-template <typename NumVertsT,
-          typename IsSwizzledT,
-          typename HasConstantInterpT,
-          typename IsDegenerate>
-INLINE void ProcessAttributes(
-    DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t triIndex, uint32_t primId, float* pBuffer)
-{
-    static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
-    const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
-    // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
-    uint32_t constantInterpMask =
-        IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
-    const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
-    const PRIMITIVE_TOPOLOGY topo  = pa.binTopology;
-
-    static const float constTable[3][4] = {
-        {0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 1.0f}, {1.0f, 1.0f, 1.0f, 1.0f}};
-
-    for (uint32_t i = 0; i < backendState.numAttributes; ++i)
-    {
-        uint32_t inputSlot;
-        if (IsSwizzledT::value)
-        {
-            SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
-            inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
-        }
-        else
-        {
-            inputSlot = backendState.vertexAttribOffset + i;
-        }
-
-        simd4scalar attrib[3]; // triangle attribs (always 4 wide)
-        float*      pAttribStart = pBuffer;
-
-        if (HasConstantInterpT::value || IsDegenerate::value)
-        {
-            if (CheckBit(constantInterpMask, i))
-            {
-                uint32_t              vid;
-                uint32_t              adjustedTriIndex;
-                static const uint32_t tristripProvokingVertex[]   = {0, 2, 1};
-                static const int32_t  quadProvokingTri[2][4]      = {{0, 0, 0, 1}, {0, -1, 0, 0}};
-                static const uint32_t quadProvokingVertex[2][4]   = {{0, 1, 2, 2}, {0, 1, 1, 2}};
-                static const int32_t  qstripProvokingTri[2][4]    = {{0, 0, 0, 1}, {-1, 0, 0, 0}};
-                static const uint32_t qstripProvokingVertex[2][4] = {{0, 1, 2, 1}, {0, 0, 2, 1}};
-
-                switch (topo)
-                {
-                case TOP_QUAD_LIST:
-                    adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
-                    vid              = quadProvokingVertex[triIndex & 1][provokingVertex];
-                    break;
-                case TOP_QUAD_STRIP:
-                    adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
-                    vid              = qstripProvokingVertex[triIndex & 1][provokingVertex];
-                    break;
-                case TOP_TRIANGLE_STRIP:
-                    adjustedTriIndex = triIndex;
-                    vid =
-                        (triIndex & 1) ? tristripProvokingVertex[provokingVertex] : provokingVertex;
-                    break;
-                default:
-                    adjustedTriIndex = triIndex;
-                    vid              = provokingVertex;
-                    break;
-                }
-
-                pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
-
-                for (uint32_t i = 0; i < NumVertsT::value; ++i)
-                {
-                    SIMD128::store_ps(pBuffer, attrib[vid]);
-                    pBuffer += 4;
-                }
-            }
-            else
-            {
-                pa.AssembleSingle(inputSlot, triIndex, attrib);
-
-                for (uint32_t i = 0; i < NumVertsT::value; ++i)
-                {
-                    SIMD128::store_ps(pBuffer, attrib[i]);
-                    pBuffer += 4;
-                }
-            }
-        }
-        else
-        {
-            pa.AssembleSingle(inputSlot, triIndex, attrib);
-
-            for (uint32_t i = 0; i < NumVertsT::value; ++i)
-            {
-                SIMD128::store_ps(pBuffer, attrib[i]);
-                pBuffer += 4;
-            }
-        }
-
-        // pad out the attrib buffer to 3 verts to ensure the triangle
-        // interpolation code in the pixel shader works correctly for the
-        // 3 topologies - point, line, tri.  This effectively zeros out the
-        // effect of the missing vertices in the triangle interpolation.
-        for (uint32_t v = NumVertsT::value; v < 3; ++v)
-        {
-            SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
-            pBuffer += 4;
-        }
-
-        // check for constant source overrides
-        if (IsSwizzledT::value)
-        {
-            uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
-            if (mask)
-            {
-                unsigned long comp;
-                while (_BitScanForward(&comp, mask))
-                {
-                    mask &= ~(1 << comp);
-
-                    float constantValue = 0.0f;
-                    switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
-                    {
-                    case SWR_CONSTANT_SOURCE_CONST_0000:
-                    case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
-                    case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
-                        constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
-                        break;
-                    case SWR_CONSTANT_SOURCE_PRIM_ID:
-                        constantValue = *(float*)&primId;
-                        break;
-                    }
-
-                    // apply constant value to all 3 vertices
-                    for (uint32_t v = 0; v < 3; ++v)
-                    {
-                        pAttribStart[comp + v * 4] = constantValue;
-                    }
-                }
-            }
-        }
-    }
-}
-
-typedef void (*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
-
-struct ProcessAttributesChooser
-{
-    typedef PFN_PROCESS_ATTRIBUTES FuncType;
-
-    template <typename... ArgsB>
-    static FuncType GetFunc()
-    {
-        return ProcessAttributes<ArgsB...>;
-    }
-};
-
-PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts,
-                                                bool     IsSwizzled,
-                                                bool     HasConstantInterp,
-                                                bool     IsDegenerate = false)
-{
-    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(
-        IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Processes enabled user clip distances. Loads the active clip
-///        distances from the PA, sets up barycentric equations, and
-///        stores the results to the output buffer
-/// @tparam NumVerts - number of vertices per primitive (1 to 3)
-/// @param state - backend state; supplies clipDistanceMask (one bit per
-///                enabled clip distance) and vertexClipCullOffset (attribute
-///                slot of the first clip distances)
-/// @param pa - Primitive Assembly state
-/// @param primIndex - primitive index to process
-/// @param pRecipW - NumVerts per-vertex factors each clip distance is
-///                  multiplied by (reciprocal W, going by the name)
-/// @param pUserClipBuffer - buffer to store results; NumVerts floats are
-///                          written per enabled clip distance
-template <uint32_t NumVerts>
-void ProcessUserClipDist(const SWR_BACKEND_STATE& state,
-                         PA_STATE&                pa,
-                         uint32_t                 primIndex,
-                         float*                   pRecipW,
-                         float*                   pUserClipBuffer)
-{
-    // primClipDist below holds exactly 3 assembled vertices
-    static_assert(NumVerts >= 1 && NumVerts <= 3, "ProcessUserClipDist supports 1 to 3 vertices");
-
-    unsigned long clipDist;
-    uint32_t      clipDistMask = state.clipDistanceMask;
-    while (_BitScanForward(&clipDist, clipDistMask))
-    {
-        // clear the bit just found; the unsigned literal keeps the shift defined for bit 31
-        clipDistMask &= ~(1u << clipDist);
-
-        // four clip distances share one attribute slot, one per component
-        uint32_t clipSlot = clipDist >> 2;
-        uint32_t clipComp = clipDist & 0x3;
-        uint32_t clipAttribSlot =
-            clipSlot == 0 ? state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
-
-        simd4scalar primClipDist[3];
-        pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
-
-        // pull the selected component out of each assembled vertex
-        float vertClipDist[NumVerts];
-        for (uint32_t e = 0; e < NumVerts; ++e)
-        {
-            OSALIGNSIMD(float) aVertClipDist[4];
-            SIMD128::store_ps(aVertClipDist, primClipDist[e]);
-            vertClipDist[e] = aVertClipDist[clipComp];
-        }
-
-        // setup plane equations for barycentric interpolation in the backend:
-        // every vertex but the last is stored relative to the last one, and the
-        // last vertex's value is stored as-is
-        float baryCoeff[NumVerts];
-        float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
-        for (uint32_t e = 0; e < NumVerts - 1; ++e)
-        {
-            baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
-        }
-        baryCoeff[NumVerts - 1] = last;
-
-        for (uint32_t e = 0; e < NumVerts; ++e)
-        {
-            *(pUserClipBuffer++) = baryCoeff[e];
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief 8-wide overload: hands the three source vectors and the
-///        destination array straight to vTranspose3x8.
-/// @param dst - receives 8 simd4scalar results
-/// @param src0 - first source vector
-/// @param src1 - second source vector
-/// @param src2 - third source vector
-INLINE
-void TransposeVertices(simd4scalar (&dst)[8],
-                       const simdscalar& src0,
-                       const simdscalar& src1,
-                       const simdscalar& src2)
-{
-    vTranspose3x8(dst, src0, src1, src2);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief 16-wide overload: views the 16 simd4scalar destination entries as
-///        4 simd16scalar entries and calls vTranspose4x16, passing an
-///        all-zero vector as the fourth source.
-/// @param dst - receives 16 simd4scalar results
-/// @param src0 - first source vector
-/// @param src1 - second source vector
-/// @param src2 - third source vector
-INLINE
-void TransposeVertices(simd4scalar (&dst)[16],
-                       const simd16scalar& src0,
-                       const simd16scalar& src1,
-                       const simd16scalar& src2)
-{
-    // same storage, reinterpreted as four 16-wide vectors
-    simd16scalar(&dstWide)[4] = reinterpret_cast<simd16scalar(&)[4]>(dst);
-
-    vTranspose4x16(dstWide, src0, src1, src2, _simd16_setzero_ps());
-}
-
-#if KNOB_ENABLE_EARLY_RAST
-
-// Early-rast tile dimensions, derived from the tile shift amounts
-#define ER_SIMD_TILE_X_DIM (1 << ER_SIMD_TILE_X_SHIFT)
-#define ER_SIMD_TILE_Y_DIM (1 << ER_SIMD_TILE_Y_SHIFT)
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Per-SIMD-width helper for the early rasterizer. The primary
-///        template is intentionally empty; each supported SIMD type
-///        provides InitShiftCntrl() through a specialization.
-template <typename SIMD_T>
-struct EarlyRastHelper
-{
-};
-
-/// 8-wide specialization.
-template <>
-struct EarlyRastHelper<SIMD256>
-{
-    /// @return the per-lane shift counts that EarlyRasterizer applies to the
-    ///         broadcast cwTrisMask (values 24..31, one per lane)
-    static SIMD256::Integer InitShiftCntrl()
-    {
-        const SIMD256::Integer vShiftCounts = SIMD256::set_epi32(24, 25, 26, 27, 28, 29, 30, 31);
-        return vShiftCounts;
-    }
-};
-
-#if USE_SIMD16_FRONTEND
-/// 16-wide specialization, only built with the SIMD16 frontend.
-template <>
-struct EarlyRastHelper<SIMD512>
-{
-    /// @return the per-lane shift counts that EarlyRasterizer applies to the
-    ///         broadcast cwTrisMask (values 16..31, one per lane)
-    static SIMD512::Integer InitShiftCntrl()
-    {
-        const SIMD512::Integer vShiftCounts =
-            SIMD512::set_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
-        return vShiftCounts;
-    }
-};
-
-#endif
-//////////////////////////////////////////////////////////////////////////
-/// @brief Early Rasterizer (ER); triangles that fit small (e.g. 4x4) tile
-///        (ER tile) can be rasterized as early as in binner to check if
-///        they cover any pixels. If not - the triangles can be
-///        culled in binner.
-///
-/// @tparam SIMD_T - SIMD implementation used for all vector operations
-/// @tparam SIMD_WIDTH - not referenced in the body
-/// @tparam CT - not referenced in the body
-/// @param pDC - draw context; used here only for the RDTSC_EVENT counters
-/// @param er_bbox - coordinates of ER tile for each triangle
-/// @param vAi - A coefficients of triangle edges; modified in place
-///              (negated for the lanes selected by cwTrisMask)
-/// @param vBi - B coefficients of triangle edges; modified in place
-///              (negated for the lanes selected by cwTrisMask)
-/// @param vXi - X coordinates of triangle vertices
-/// @param vYi - Y coordinates of triangle vertices
-/// @param cwTrisMask - mask indicating CW triangles, one bit per SIMD lane
-/// @param triMask - mask for valid SIMD lanes (triangles)
-/// @param oneTileMask - defines triangles for ER to work on
-///                      (tris that fit into ER tile)
-/// @return triMask with the bits cleared for every triangle in oneTileMask
-///         that had no lit pixel in its ER tile
-template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
-uint32_t SIMDCALL EarlyRasterizer(DRAW_CONTEXT*       pDC,
-                                  SIMDBBOX_T<SIMD_T>& er_bbox,
-                                  Integer<SIMD_T> (&vAi)[3],
-                                  Integer<SIMD_T> (&vBi)[3],
-                                  Integer<SIMD_T> (&vXi)[3],
-                                  Integer<SIMD_T> (&vYi)[3],
-                                  uint32_t cwTrisMask,
-                                  uint32_t triMask,
-                                  uint32_t oneTileMask)
-{
-    // step to pixel center of top-left pixel of the triangle bbox
-    Integer<SIMD_T> vTopLeftX =
-        SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
-    vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
-
-    Integer<SIMD_T> vTopLeftY =
-        SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
-    vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
-
-    // negate A and B for CW tris
-    Integer<SIMD_T> vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1));
-    Integer<SIMD_T> vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1));
-    Integer<SIMD_T> vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1));
-    Integer<SIMD_T> vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1));
-    Integer<SIMD_T> vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1));
-    Integer<SIMD_T> vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1));
-
-    RDTSC_EVENT(pDC->pContext->pBucketMgr,
-                FEEarlyRastEnter,
-                _mm_popcnt_u32(oneTileMask & triMask),
-                0);
-
-    // Turn the scalar cwTrisMask into a per-lane blend mask: broadcast it, then
-    // shift each lane left by its own count from InitShiftCntrl().
-    // NOTE(review): presumably this moves each lane's own bit into the sign bit,
-    // which is what blendv_ps selects on - confirm against the SIMD lib.
-    Integer<SIMD_T> vShiftCntrl = EarlyRastHelper<SIMD_T>::InitShiftCntrl();
-    Integer<SIMD_T> vCwTris     = SIMD_T::set1_epi32(cwTrisMask);
-    Integer<SIMD_T> vMask       = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);
-
-    // overwrite the caller's coefficients with the negated ones where vMask selects
-    vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(
-        SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask)));
-    vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(
-        SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask)));
-    vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(
-        SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask)));
-    vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(
-        SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask)));
-    vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(
-        SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask)));
-    vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(
-        SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask)));
-
-    // evaluate edge equations at top-left pixel
-    Integer<SIMD_T> vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
-    Integer<SIMD_T> vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]);
-    Integer<SIMD_T> vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]);
-
-    Integer<SIMD_T> vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]);
-    Integer<SIMD_T> vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]);
-    Integer<SIMD_T> vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]);
-
-    Integer<SIMD_T> vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0);
-    Integer<SIMD_T> vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1);
-    Integer<SIMD_T> vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2);
-
-    Integer<SIMD_T> vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0);
-    Integer<SIMD_T> vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1);
-    Integer<SIMD_T> vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2);
-
-    Integer<SIMD_T> vEdge0 = SIMD_T::add_epi32(vAX0, vBY0);
-    Integer<SIMD_T> vEdge1 = SIMD_T::add_epi32(vAX1, vBY1);
-    Integer<SIMD_T> vEdge2 = SIMD_T::add_epi32(vAX2, vBY2);
-
-    // arithmetic shift right by FIXED_POINT_SHIFT, keeping the sign
-    vEdge0 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge0);
-    vEdge1 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge1);
-    vEdge2 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge2);
-
-    // top left rule: edges that qualify below get their value lowered by 1
-    Integer<SIMD_T> vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1));
-    Integer<SIMD_T> vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1));
-    Integer<SIMD_T> vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));
-
-    // vA < 0
-    vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(
-        SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0])));
-    vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(
-        SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1])));
-    vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(
-        SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2])));
-
-    // vA == 0 && vB < 0
-    Integer<SIMD_T> vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
-    Integer<SIMD_T> vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si());
-    Integer<SIMD_T> vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si());
-
-    // where vA == 0 the compare result is ANDed with vB, leaving vB's bits as the blend mask
-    vCmp0 = SIMD_T::and_si(vCmp0, vBi[0]);
-    vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]);
-    vCmp2 = SIMD_T::and_si(vCmp2, vBi[2]);
-
-    vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(
-        SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0)));
-    vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(
-        SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1)));
-    vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(
-        SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2)));
-
-    // Each per-pixel mask below is the bitwise AND of the three edge values, so
-    // its top bit is set only when all three edge values have their top bit set.
-    // Adding/subtracting vBi steps one pixel in Y, adding vAi steps one pixel in X.
-#if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4
-    // Unrolled 4x4 path: walks the tile column by column in a serpentine order
-    // (down, right, up, right, down, right, up), visiting all 16 pixels.
-    // Go down
-    // coverage pixel 0
-    Integer<SIMD_T> vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
-    vMask0                 = SIMD_T::and_si(vMask0, vEdge2);
-
-    // coverage pixel 1
-    Integer<SIMD_T> vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
-    Integer<SIMD_T> vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
-    Integer<SIMD_T> vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
-    Integer<SIMD_T> vMask1  = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask1                  = SIMD_T::and_si(vMask1, vEdge2N);
-
-    // coverage pixel 2
-    vEdge0N                = SIMD_T::add_epi32(vEdge0N, vBi[0]);
-    vEdge1N                = SIMD_T::add_epi32(vEdge1N, vBi[1]);
-    vEdge2N                = SIMD_T::add_epi32(vEdge2N, vBi[2]);
-    Integer<SIMD_T> vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask2                 = SIMD_T::and_si(vMask2, vEdge2N);
-
-    // coverage pixel 3
-    vEdge0N                = SIMD_T::add_epi32(vEdge0N, vBi[0]);
-    vEdge1N                = SIMD_T::add_epi32(vEdge1N, vBi[1]);
-    vEdge2N                = SIMD_T::add_epi32(vEdge2N, vBi[2]);
-    Integer<SIMD_T> vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask3                 = SIMD_T::and_si(vMask3, vEdge2N);
-
-    // One step to the right and then up
-
-    // coverage pixel 4
-    vEdge0N                = SIMD_T::add_epi32(vEdge0N, vAi[0]);
-    vEdge1N                = SIMD_T::add_epi32(vEdge1N, vAi[1]);
-    vEdge2N                = SIMD_T::add_epi32(vEdge2N, vAi[2]);
-    Integer<SIMD_T> vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask4                 = SIMD_T::and_si(vMask4, vEdge2N);
-
-    // coverage pixel 5
-    vEdge0N                = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
-    vEdge1N                = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
-    vEdge2N                = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
-    Integer<SIMD_T> vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask5                 = SIMD_T::and_si(vMask5, vEdge2N);
-
-    // coverage pixel 6
-    vEdge0N                = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
-    vEdge1N                = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
-    vEdge2N                = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
-    Integer<SIMD_T> vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask6                 = SIMD_T::and_si(vMask6, vEdge2N);
-
-    // coverage pixel 7
-    vEdge0N                = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
-    vEdge1N                = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
-    vEdge2N                = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
-    Integer<SIMD_T> vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask7                 = SIMD_T::and_si(vMask7, vEdge2N);
-
-    // combined coverage of the first two columns
-    Integer<SIMD_T> vLit1 = SIMD_T::or_si(vMask0, vMask1);
-    vLit1                 = SIMD_T::or_si(vLit1, vMask2);
-    vLit1                 = SIMD_T::or_si(vLit1, vMask3);
-    vLit1                 = SIMD_T::or_si(vLit1, vMask4);
-    vLit1                 = SIMD_T::or_si(vLit1, vMask5);
-    vLit1                 = SIMD_T::or_si(vLit1, vMask6);
-    vLit1                 = SIMD_T::or_si(vLit1, vMask7);
-
-    // Step to the right and go down again
-    // (vMask0..vMask7 are reused for the pixels of the last two columns)
-
-    // coverage pixel 0
-    vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
-    vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
-    vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
-    vMask0  = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask0  = SIMD_T::and_si(vMask0, vEdge2N);
-
-    // coverage pixel 1
-    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
-    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
-    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
-    vMask1  = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask1  = SIMD_T::and_si(vMask1, vEdge2N);
-
-    // coverage pixel 2
-    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
-    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
-    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
-    vMask2  = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask2  = SIMD_T::and_si(vMask2, vEdge2N);
-
-    // coverage pixel 3
-    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
-    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
-    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
-    vMask3  = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask3  = SIMD_T::and_si(vMask3, vEdge2N);
-
-    // And for the last time - to the right and up
-
-    // coverage pixel 4
-    vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
-    vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
-    vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
-    vMask4  = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask4  = SIMD_T::and_si(vMask4, vEdge2N);
-
-    // coverage pixel 5
-    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
-    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
-    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
-    vMask5  = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask5  = SIMD_T::and_si(vMask5, vEdge2N);
-
-    // coverage pixel 6
-    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
-    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
-    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
-    vMask6  = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask6  = SIMD_T::and_si(vMask6, vEdge2N);
-
-    // coverage pixel 7
-    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
-    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
-    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
-    vMask7  = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask7  = SIMD_T::and_si(vMask7, vEdge2N);
-
-    // combined coverage of the last two columns
-    Integer<SIMD_T> vLit2 = SIMD_T::or_si(vMask0, vMask1);
-    vLit2                 = SIMD_T::or_si(vLit2, vMask2);
-    vLit2                 = SIMD_T::or_si(vLit2, vMask3);
-    vLit2                 = SIMD_T::or_si(vLit2, vMask4);
-    vLit2                 = SIMD_T::or_si(vLit2, vMask5);
-    vLit2                 = SIMD_T::or_si(vLit2, vMask6);
-    vLit2                 = SIMD_T::or_si(vLit2, vMask7);
-
-    Integer<SIMD_T> vLit = SIMD_T::or_si(vLit1, vLit2);
-
-#else
-    // Generic algorithm sweeping in row by row order
-    Integer<SIMD_T> vRowMask[ER_SIMD_TILE_Y_DIM];
-
-    Integer<SIMD_T> vEdge0N = vEdge0;
-    Integer<SIMD_T> vEdge1N = vEdge1;
-    Integer<SIMD_T> vEdge2N = vEdge2;
-
-    for (uint32_t row = 0; row < ER_SIMD_TILE_Y_DIM; row++)
-    {
-        // Store edge values at the beginning of the row
-        Integer<SIMD_T> vRowEdge0 = vEdge0N;
-        Integer<SIMD_T> vRowEdge1 = vEdge1N;
-        Integer<SIMD_T> vRowEdge2 = vEdge2N;
-
-        Integer<SIMD_T> vColMask[ER_SIMD_TILE_X_DIM];
-
-        // test each pixel of the row, stepping in X by adding vAi
-        for (uint32_t col = 0; col < ER_SIMD_TILE_X_DIM; col++)
-        {
-            vColMask[col] = SIMD_T::and_si(vEdge0N, vEdge1N);
-            vColMask[col] = SIMD_T::and_si(vColMask[col], vEdge2N);
-
-            vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
-            vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
-            vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
-        }
-        // OR the row's pixel masks together
-        vRowMask[row] = vColMask[0];
-        for (uint32_t col = 1; col < ER_SIMD_TILE_X_DIM; col++)
-        {
-            vRowMask[row] = SIMD_T::or_si(vRowMask[row], vColMask[col]);
-        }
-        // Restore values and go to the next row
-        vEdge0N = vRowEdge0;
-        vEdge1N = vRowEdge1;
-        vEdge2N = vRowEdge2;
-
-        vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
-        vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
-        vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
-    }
-
-    // compress all masks
-    Integer<SIMD_T> vLit = vRowMask[0];
-    for (uint32_t row = 1; row < ER_SIMD_TILE_Y_DIM; row++)
-    {
-        vLit = SIMD_T::or_si(vLit, vRowMask[row]);
-    }
-
-#endif
-    // Check which triangles have any pixel lit
-    // NOTE(review): movemask_ps presumably gathers one bit per lane from the
-    // top bit of vLit - confirm against the SIMD lib.
-    uint32_t maskLit   = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit));
-    // only triangles ER was asked to work on (oneTileMask) can be culled here
-    uint32_t maskUnlit = ~maskLit & oneTileMask;
-
-    uint32_t oldTriMask = triMask;
-    triMask &= ~maskUnlit;
-
-    // report the exit event only when at least one triangle was culled
-    if (triMask ^ oldTriMask)
-    {
-        RDTSC_EVENT(pDC->pContext->pBucketMgr,
-                    FEEarlyRastExit,
-                    _mm_popcnt_u32(triMask & oneTileMask),
-                    0);
-    }
-    return triMask;
-}
-
-#endif // Early rasterizer
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
-///        culling, viewport transform, etc.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Each thread has a unique id.
-/// @param tri - Contains triangle position data for SIMDs worth of triangles.
-/// @param primID - Primitive ID for each triangle.
-/// @param viewportIdx - viewport array index for each triangle.
-/// @tparam CT - ConservativeRastFETraits
-template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
-void SIMDCALL BinTrianglesImpl(DRAW_CONTEXT*          pDC,
-                               PA_STATE&              pa,
-                               uint32_t               workerId,
-                               Vec4<SIMD_T>           tri[3],
-                               uint32_t               triMask,
-                               Integer<SIMD_T> const& primID,
-                               Integer<SIMD_T> const& viewportIdx,
-                               Integer<SIMD_T> const& rtIdx)
-{
-    const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
-
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEBinTriangles, pDC->drawId);
-
-    const API_STATE&          state     = GetApiState(pDC);
-    const SWR_RASTSTATE&      rastState = state.rastState;
-    const SWR_FRONTEND_STATE& feState   = state.frontendState;
-
-    MacroTileMgr* pTileMgr = pDC->pTileMgr;
-
-    Float<SIMD_T> vRecipW0 = SIMD_T::set1_ps(1.0f);
-    Float<SIMD_T> vRecipW1 = SIMD_T::set1_ps(1.0f);
-    Float<SIMD_T> vRecipW2 = SIMD_T::set1_ps(1.0f);
-
-    if (feState.vpTransformDisable)
-    {
-        // RHW is passed in directly when VP transform is disabled
-        vRecipW0 = tri[0].v[3];
-        vRecipW1 = tri[1].v[3];
-        vRecipW2 = tri[2].v[3];
-    }
-    else
-    {
-        // Perspective divide
-        vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
-        vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
-        vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
-
-        tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
-        tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
-        tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
-
-        tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
-        tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
-        tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
-
-        tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
-        tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
-        tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
-
-        // Viewport transform to screen space coords
-        if (pa.viewportArrayActive)
-        {
-            viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
-        }
-        else
-        {
-            viewportTransform<3>(tri, state.vpMatrices);
-        }
-    }
-
-    // Adjust for pixel center location
-    Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
-
-    tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
-    tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
-
-    tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
-    tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
-
-    tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
-    tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
-
-    // Set vXi, vYi to required fixed point precision
-    Integer<SIMD_T> vXi[3], vYi[3];
-    FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
-
-    // triangle setup
-    Integer<SIMD_T> vAi[3], vBi[3];
-    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
-
-    // determinant
-    Integer<SIMD_T> vDet[2];
-    calcDeterminantIntVertical(vAi, vBi, vDet);
-
-    // cull zero area
-    uint32_t maskLo =
-        SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
-    uint32_t maskHi =
-        SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
-
-    uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
-
-    // don't cull degenerate triangles if we're conservatively rasterizing
-    uint32_t origTriMask = triMask;
-    if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
-    {
-        triMask &= ~cullZeroAreaMask;
-    }
-
-    // determine front winding tris
-    // CW  +det
-    // CCW det < 0;
-    // 0 area triangles are marked as backfacing regardless of winding order,
-    // which is required behavior for conservative rast and wireframe rendering
-    uint32_t frontWindingTris;
-    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
-    {
-        maskLo = SIMD_T::movemask_pd(
-            SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
-        maskHi = SIMD_T::movemask_pd(
-            SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
-    }
-    else
-    {
-        maskLo = SIMD_T::movemask_pd(
-            SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
-        maskHi = SIMD_T::movemask_pd(
-            SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
-    }
-    frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
-
-    // cull
-    uint32_t cullTris;
-    switch ((SWR_CULLMODE)rastState.cullMode)
-    {
-    case SWR_CULLMODE_BOTH:
-        cullTris = 0xffffffff;
-        break;
-    case SWR_CULLMODE_NONE:
-        cullTris = 0x0;
-        break;
-    case SWR_CULLMODE_FRONT:
-        cullTris = frontWindingTris;
-        break;
-        // 0 area triangles are marked as backfacing, which is required behavior for conservative
-        // rast
-    case SWR_CULLMODE_BACK:
-        cullTris = ~frontWindingTris;
-        break;
-    default:
-        SWR_INVALID("Invalid cull mode: %d", rastState.cullMode);
-        cullTris = 0x0;
-        break;
-    }
-
-    triMask &= ~cullTris;
-
-    if (origTriMask ^ triMask)
-    {
-        RDTSC_EVENT(pDC->pContext->pBucketMgr,
-                    FECullZeroAreaAndBackface,
-                    _mm_popcnt_u32(origTriMask ^ triMask),
-                    0);
-    }
-
-    AR_EVENT(CullInfoEvent(pDC->drawId, cullZeroAreaMask, cullTris, origTriMask));
-
-    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
-    // compute per tri backface
-    uint32_t        frontFaceMask  = frontWindingTris;
-    uint32_t*       pPrimID        = (uint32_t*)&primID;
-    const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx;
-    uint32_t        triIndex       = 0;
-
-    uint32_t      edgeEnable;
-    PFN_WORK_FUNC pfnWork;
-    if (CT::IsConservativeT::value)
-    {
-        // determine which edges of the degenerate tri, if any, are valid to rasterize.
-        // used to call the appropriate templated rasterizer function
-        if (cullZeroAreaMask > 0)
-        {
-            // e0 = v1-v0
-            const Integer<SIMD_T> x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
-            const Integer<SIMD_T> y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
-
-            uint32_t e0Mask =
-                SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
-
-            // e1 = v2-v1
-            const Integer<SIMD_T> x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
-            const Integer<SIMD_T> y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
-
-            uint32_t e1Mask =
-                SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
-
-            // e2 = v0-v2
-            // if v0 == v1 & v1 == v2, v0 == v2
-            uint32_t e2Mask = e0Mask & e1Mask;
-            SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
-
-            // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
-            // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
-            e0Mask = pdep_u32(e0Mask, 0x00249249);
-
-            // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
-            e1Mask = pdep_u32(e1Mask, 0x00492492);
-
-            // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
-            e2Mask = pdep_u32(e2Mask, 0x00924924);
-
-            edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
-        }
-        else
-        {
-            edgeEnable = 0x00FFFFFF;
-        }
-    }
-    else
-    {
-        // degenerate triangles won't be sent to rasterizer; just enable all edges
-        pfnWork = GetRasterizerFunc(rastState.sampleCount,
-                                    rastState.bIsCenterPattern,
-                                    (rastState.conservativeRast > 0),
-                                    (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage,
-                                    EdgeValToEdgeState(ALL_EDGES_VALID),
-                                    (state.scissorsTileAligned == false));
-    }
-
-    SIMDBBOX_T<SIMD_T> bbox;
-
-    if (!triMask)
-    {
-        goto endBinTriangles;
-    }
-
-    // Calc bounding box of triangles
-    calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
-
-    // determine if triangle falls between pixel centers and discard
-    // only discard for non-MSAA case and when conservative rast is disabled
-    // (xmin + 127) & ~255
-    // (xmax + 128) & ~255
-    if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
-        (!CT::IsConservativeT::value))
-    {
-        origTriMask = triMask;
-
-        int cullCenterMask;
-
-        {
-            Integer<SIMD_T> xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
-            xmin                 = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
-            Integer<SIMD_T> xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
-            xmax                 = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
-
-            Integer<SIMD_T> vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
-
-            Integer<SIMD_T> ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
-            ymin                 = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
-            Integer<SIMD_T> ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
-            ymax                 = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
-
-            Integer<SIMD_T> vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
-
-            vMaskV         = SIMD_T::or_si(vMaskH, vMaskV);
-            cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
-        }
-
-        triMask &= ~cullCenterMask;
-
-        if (origTriMask ^ triMask)
-        {
-            RDTSC_EVENT(pDC->pContext->pBucketMgr,
-                        FECullBetweenCenters,
-                        _mm_popcnt_u32(origTriMask ^ triMask),
-                        0);
-        }
-    }
-
-    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is
-    // exclusive. Gather the AOS effective scissor rects based on the per-prim VP index.
-    /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-    {
-        Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
-        if (pa.viewportArrayActive)
-
-        {
-            GatherScissors(&state.scissorsInFixedPoint[0],
-                           pViewportIndex,
-                           scisXmin,
-                           scisYmin,
-                           scisXmax,
-                           scisYmax);
-        }
-        else // broadcast fast path for non-VPAI case.
-        {
-            scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
-            scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
-            scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
-            scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
-        }
-
-        // Make triangle bbox inclusive
-        bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
-        bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
-
-        bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
-        bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
-        bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
-        bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
-    }
-
-    if (CT::IsConservativeT::value)
-    {
-        // in the case where a degenerate triangle is on a scissor edge, we need to make sure the
-        // primitive bbox has some area. Bump the xmax/ymax edges out
-
-        Integer<SIMD_T> topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
-        bbox.ymax                       = SIMD_T::blendv_epi32(
-            bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
-
-        Integer<SIMD_T> leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
-        bbox.xmax                       = SIMD_T::blendv_epi32(
-            bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
-    }
-
-    // Cull tris completely outside scissor
-    {
-        Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
-        Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
-        Integer<SIMD_T> maskOutsideScissorXY =
-            SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
-        uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
-        triMask                     = triMask & ~maskOutsideScissor;
-    }
-
-#if KNOB_ENABLE_EARLY_RAST
-    if (rastState.sampleCount == SWR_MULTISAMPLE_1X && !CT::IsConservativeT::value)
-    {
-        // Try early rasterization - culling small triangles which do not cover any pixels
-
-        // convert to ER tiles
-        SIMDBBOX_T<SIMD_T> er_bbox;
-
-        er_bbox.xmin =
-            SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmin);
-        er_bbox.xmax =
-            SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmax);
-        er_bbox.ymin =
-            SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin);
-        er_bbox.ymax =
-            SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax);
-
-        Integer<SIMD_T> vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax);
-        Integer<SIMD_T> vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax);
-
-        // Take only triangles that fit into ER tile
-        uint32_t oneTileMask =
-            triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY)));
-
-        if (oneTileMask)
-        {
-            // determine CW tris (det > 0)
-            uint32_t maskCwLo = SIMD_T::movemask_pd(
-                SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
-            uint32_t maskCwHi = SIMD_T::movemask_pd(
-                SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
-            uint32_t cwTrisMask = maskCwLo | (maskCwHi << (SIMD_WIDTH / 2));
-
-            // Try early rasterization
-            triMask = EarlyRasterizer<SIMD_T, SIMD_WIDTH, CT>(
-                pDC, er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask);
-
-            if (!triMask)
-            {
-                RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
-                return;
-            }
-        }
-    }
-#endif
-
-endBinTriangles:
-
-
-    if (!triMask)
-    {
-        RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
-        return;
-    }
-
-    // Send surviving triangles to the line or point binner based on fill mode
-    if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
-    {
-        // Simple non-conformant wireframe mode, useful for debugging
-        // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
-        Vec4<SIMD_T>  line[2];
-        Float<SIMD_T> recipW[2];
-
-        line[0]   = tri[0];
-        line[1]   = tri[1];
-        recipW[0] = vRecipW0;
-        recipW[1] = vRecipW1;
-
-        BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
-            pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
-
-        line[0]   = tri[1];
-        line[1]   = tri[2];
-        recipW[0] = vRecipW1;
-        recipW[1] = vRecipW2;
-
-        BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
-            pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
-
-        line[0]   = tri[2];
-        line[1]   = tri[0];
-        recipW[0] = vRecipW2;
-        recipW[1] = vRecipW0;
-
-        BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
-            pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
-
-        RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
-        return;
-    }
-    else if (rastState.fillMode == SWR_FILLMODE_POINT)
-    {
-        // Bin 3 points
-        BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
-            pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx, rtIdx);
-        BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
-            pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx);
-        BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
-            pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx);
-
-        RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
-        return;
-    }
-
-    // Convert triangle bbox to macrotile units.
-    bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
-    bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
-    bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
-    bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
-
-    OSALIGNSIMD16(uint32_t)
-    aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
-
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin);
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax);
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin);
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax);
-
-    // transpose verts needed for backend
-    /// @todo modify BE to take non-transformed verts
-    OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
-    OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
-    OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
-    OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
-
-    TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
-    TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
-    TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
-    TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
-
-    // scan remaining valid triangles and bin each separately
-    while (_BitScanForward((unsigned long*)&triIndex, triMask))
-    {
-        uint32_t linkageCount     = state.backendState.numAttributes;
-        uint32_t numScalarAttribs = linkageCount * 4;
-
-        BE_WORK work;
-        work.type = DRAW;
-
-        bool isDegenerate;
-        if (CT::IsConservativeT::value)
-        {
-            // only rasterize valid edges if we have a degenerate primitive
-            int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
-            work.pfnWork =
-                GetRasterizerFunc(rastState.sampleCount,
-                                  rastState.bIsCenterPattern,
-                                  (rastState.conservativeRast > 0),
-                                  (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage,
-                                  EdgeValToEdgeState(triEdgeEnable),
-                                  (state.scissorsTileAligned == false));
-
-            // Degenerate triangles are required to be constant interpolated
-            isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
-        }
-        else
-        {
-            isDegenerate = false;
-            work.pfnWork = pfnWork;
-        }
-
-        // Select attribute processor
-        PFN_PROCESS_ATTRIBUTES pfnProcessAttribs =
-            GetProcessAttributesFunc(3,
-                                     state.backendState.swizzleEnable,
-                                     state.backendState.constantInterpolationMask,
-                                     isDegenerate);
-
-        TRIANGLE_WORK_DESC& desc = work.desc.tri;
-
-        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
-        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
-        desc.triFlags.viewportIndex          = pViewportIndex[triIndex];
-
-        auto pArena = pDC->pArena;
-        SWR_ASSERT(pArena != nullptr);
-
-        // store active attribs
-        float* pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
-        desc.pAttribs   = pAttribs;
-        desc.numAttribs = linkageCount;
-        pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
-
-        // store triangle vertex data
-        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
-
-        SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
-        SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
-        SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
-        SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
-
-        // store user clip distances
-        if (state.backendState.clipDistanceMask)
-        {
-            uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
-            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
-            ProcessUserClipDist<3>(
-                state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
-        }
-
-        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
-        {
-            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
-            {
-#if KNOB_ENABLE_TOSS_POINTS
-                if (!KNOB_TOSS_SETUP_TRIS)
-#endif
-                {
-                    pTileMgr->enqueue(x, y, &work);
-                }
-            }
-        }
-
-        triMask &= ~(1 << triIndex);
-    }
-
-    RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SIMD256 entry point for triangle binning. Forwards every
-///        argument unchanged to BinTrianglesImpl instantiated with
-///        SIMD256 and KNOB_SIMD_WIDTH.
-/// @tparam CT - traits type handed through to BinTrianglesImpl, which
-///         consults CT::IsConservativeT::value.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id.
-/// @param tri - position data for the three vertices of each triangle.
-/// @param triMask - bit mask of active triangles, one bit per SIMD lane.
-/// @param primID - Primitive ID for each triangle.
-/// @param viewportIdx - viewport array index for each triangle.
-/// @param rtIdx - render target array index for each triangle.
-template <typename CT>
-void BinTriangles(DRAW_CONTEXT*      pDC,
-                  PA_STATE&          pa,
-                  uint32_t           workerId,
-                  simdvector         tri[3],
-                  uint32_t           triMask,
-                  simdscalari const& primID,
-                  simdscalari const& viewportIdx,
-                  simdscalari const& rtIdx)
-{
-    BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(
-        pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
-}
-
-#if USE_SIMD16_FRONTEND
-//////////////////////////////////////////////////////////////////////////
-/// @brief SIMD512 entry point for triangle binning (only compiled when
-///        USE_SIMD16_FRONTEND is set). Forwards every argument unchanged
-///        to BinTrianglesImpl instantiated with SIMD512 and
-///        KNOB_SIMD16_WIDTH.
-/// @tparam CT - traits type handed through to BinTrianglesImpl, which
-///         consults CT::IsConservativeT::value.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id.
-/// @param tri - position data for the three vertices of each triangle.
-/// @param triMask - bit mask of active triangles, one bit per SIMD lane.
-/// @param primID - Primitive ID for each triangle.
-/// @param viewportIdx - viewport array index for each triangle.
-/// @param rtIdx - render target array index for each triangle.
-template <typename CT>
-void SIMDCALL BinTriangles_simd16(DRAW_CONTEXT*        pDC,
-                                  PA_STATE&            pa,
-                                  uint32_t             workerId,
-                                  simd16vector         tri[3],
-                                  uint32_t             triMask,
-                                  simd16scalari const& primID,
-                                  simd16scalari const& viewportIdx,
-                                  simd16scalari const& rtIdx)
-{
-    BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(
-        pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
-}
-
-#endif
-//////////////////////////////////////////////////////////////////////////
-/// @brief Chooser type handed to TemplateArgUnroller by
-///        GetBinTrianglesFunc. GetFunc<ArgsB...>() yields the
-///        BinTriangles instantiation whose traits type is
-///        ConservativeRastFETraits<ArgsB...>.
-struct FEBinTrianglesChooser
-{
-    // Function-pointer type returned by GetFunc.
-    typedef PFN_PROCESS_PRIMS FuncType;
-
-    template <typename... ArgsB>
-    static FuncType GetFunc()
-    {
-        return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Selector for the correct templated BinTriangles function.
-/// @param IsConservative - runtime flag that TemplateArgUnroller turns
-///        into the template argument(s) of FEBinTrianglesChooser::GetFunc;
-///        presumably selects the conservative-rasterization variant
-///        (TODO confirm against TemplateArgUnroller).
-/// @return pointer to the chosen BinTriangles instantiation.
-PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
-{
-    return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
-}
-
-#if USE_SIMD16_FRONTEND
-//////////////////////////////////////////////////////////////////////////
-/// @brief Chooser type handed to TemplateArgUnroller by
-///        GetBinTrianglesFunc_simd16. GetFunc<ArgsB...>() yields the
-///        BinTriangles_simd16 instantiation whose traits type is
-///        ConservativeRastFETraits<ArgsB...>.
-struct FEBinTrianglesChooser_simd16
-{
-    // Function-pointer type returned by GetFunc.
-    typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
-
-    template <typename... ArgsB>
-    static FuncType GetFunc()
-    {
-        return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Selector for the correct templated BinTriangles_simd16 function.
-/// @param IsConservative - runtime flag that TemplateArgUnroller turns
-///        into the template argument(s) of
-///        FEBinTrianglesChooser_simd16::GetFunc; presumably selects the
-///        conservative-rasterization variant (TODO confirm against
-///        TemplateArgUnroller).
-/// @return pointer to the chosen BinTriangles_simd16 instantiation.
-PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
-{
-    return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
-}
-
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin a SIMD's worth of already set-up points to the backend.
-///        Positions are converted to fixed point, then one of two paths
-///        is taken:
-///        - CanUseSimplePoints(pDC): each point is enqueued to exactly one
-///          macrotile with RasterizeSimplePoint as the work function.
-///        - otherwise: each point is bloated by half its point size into a
-///          bounding box, intersected with the scissor, and enqueued to
-///          every macrotile the box touches with RasterizeTriPoint.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id (not referenced in this function).
-/// @param prim - point position data; only prim[0] is read.
-/// @param primMask - bit mask of active points, one bit per SIMD lane.
-/// @param primID - Primitive ID for each point.
-/// @param viewportIdx - viewport array index for each point.
-/// @param rtIdx - render target array index for each point.
-template <typename SIMD_T, uint32_t SIMD_WIDTH>
-void BinPostSetupPointsImpl(DRAW_CONTEXT*          pDC,
-                            PA_STATE&              pa,
-                            uint32_t               workerId,
-                            Vec4<SIMD_T>           prim[],
-                            uint32_t               primMask,
-                            Integer<SIMD_T> const& primID,
-                            Integer<SIMD_T> const& viewportIdx,
-                            Integer<SIMD_T> const& rtIdx)
-{
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEBinPoints, pDC->drawId);
-
-    Vec4<SIMD_T>& primVerts = prim[0];
-
-    const API_STATE&     state          = GetApiState(pDC);
-    const SWR_RASTSTATE& rastState      = state.rastState;
-    // per-lane scalar view of the SIMD viewport indices
-    const uint32_t*      pViewportIndex = (uint32_t*)&viewportIdx;
-
-    // Select attribute processor
-    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(
-        1, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
-
-    // convert to fixed point
-    Integer<SIMD_T> vXi, vYi;
-
-    vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
-    vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
-
-    if (CanUseSimplePoints(pDC))
-    {
-        // adjust for ymin-xmin rule
-        vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
-        vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
-
-        // cull points off the ymin-xmin edge of the viewport
-        primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
-        primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
-
-        // compute macro tile coordinates
-        Integer<SIMD_T> macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
-        Integer<SIMD_T> macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
-
-        OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
-
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMacroX), macroX);
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMacroY), macroY);
-
-        // compute raster tile coordinates
-        Integer<SIMD_T> rasterX =
-            SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
-        Integer<SIMD_T> rasterY =
-            SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
-
-        // compute raster tile relative x,y for coverage mask
-        Integer<SIMD_T> tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
-        Integer<SIMD_T> tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
-
-        Integer<SIMD_T> tileRelativeX =
-            SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
-        Integer<SIMD_T> tileRelativeY =
-            SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
-
-        // spill the SIMD results to scalar arrays so the per-point loop below can index them
-        OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
-        OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
-
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileRelativeX), tileRelativeX);
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileRelativeY), tileRelativeY);
-
-        OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
-        OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
-
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileAlignedX), tileAlignedX);
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileAlignedY), tileAlignedY);
-
-        OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
-        SIMD_T::store_ps(reinterpret_cast<float*>(aZ), primVerts.z);
-
-        // store render target array index
-        const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
-
-        uint32_t* pPrimID   = (uint32_t*)&primID;
-        uint32_t  primIndex = 0;
-
-        const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
-
-        // scan remaining valid points and bin each separately
-        while (_BitScanForward((unsigned long*)&primIndex, primMask))
-        {
-            uint32_t linkageCount     = backendState.numAttributes;
-            uint32_t numScalarAttribs = linkageCount * 4;
-
-            BE_WORK work;
-            work.type = DRAW;
-
-            TRIANGLE_WORK_DESC& desc = work.desc.tri;
-
-            // points are always front facing
-            desc.triFlags.frontFacing            = 1;
-            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
-            desc.triFlags.viewportIndex          = pViewportIndex[primIndex];
-
-            work.pfnWork = RasterizeSimplePoint;
-
-            auto pArena = pDC->pArena;
-            SWR_ASSERT(pArena != nullptr);
-
-            // store attributes
-            float* pAttribs =
-                (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
-            desc.pAttribs   = pAttribs;
-            desc.numAttribs = linkageCount;
-
-            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
-
-            // store raster tile aligned x, y, perspective correct z
-            // (x and y are written as raw uint32_t bit patterns into the float buffer)
-            float* pTriBuffer        = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
-            desc.pTriBuffer          = pTriBuffer;
-            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
-            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
-            *pTriBuffer              = aZ[primIndex];
-
-            uint32_t tX = aTileRelativeX[primIndex];
-            uint32_t tY = aTileRelativeY[primIndex];
-
-            // pack the relative x,y into the coverageMask, the rasterizer will
-            // generate the true coverage mask from it
-            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
-
-            // bin it
-            MacroTileMgr* pTileMgr = pDC->pTileMgr;
-#if KNOB_ENABLE_TOSS_POINTS
-            if (!KNOB_TOSS_SETUP_TRIS)
-#endif
-            {
-                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
-            }
-
-            // clear the bit just handled so the next scan finds the next active point
-            primMask &= ~(1 << primIndex);
-        }
-    }
-    else
-    {
-        // non simple points need to be potentially binned to multiple macro tiles
-        Float<SIMD_T> vPointSize;
-
-        if (rastState.pointParam)
-        {
-            // point size comes from the assembled VERTEX_SGV_SLOT attribute
-            Vec4<SIMD_T> size[3];
-            pa.Assemble(VERTEX_SGV_SLOT, size);
-            vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
-        }
-        else
-        {
-            // same point size for every lane, taken from the raster state
-            vPointSize = SIMD_T::set1_ps(rastState.pointSize);
-        }
-
-        // bloat point to bbox
-        SIMDBBOX_T<SIMD_T> bbox;
-
-        bbox.xmin = bbox.xmax = vXi;
-        bbox.ymin = bbox.ymax = vYi;
-
-        Float<SIMD_T>   vHalfWidth  = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
-        Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
-
-        bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
-        bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
-        bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
-        bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
-
-        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge
-        // is exclusive. Gather the AOS effective scissor rects based on the per-prim VP index.
-        /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-        {
-            Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
-
-            if (pa.viewportArrayActive)
-            {
-                GatherScissors(&state.scissorsInFixedPoint[0],
-                               pViewportIndex,
-                               scisXmin,
-                               scisYmin,
-                               scisXmax,
-                               scisYmax);
-            }
-            else // broadcast fast path for non-VPAI case.
-            {
-                scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
-                scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
-                scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
-                scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
-            }
-
-            bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
-            bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
-            bbox.xmax =
-                SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
-            bbox.ymax =
-                SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
-        }
-
-        // Cull bloated points completely outside scissor
-        // (after the intersection above, min > max on either axis means an empty box)
-        Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
-        Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
-        Integer<SIMD_T> maskOutsideScissorXY =
-            SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
-        uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
-        primMask                    = primMask & ~maskOutsideScissor;
-
-        // Convert bbox to macrotile units.
-        bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
-        bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
-        bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
-        bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
-
-        // per-lane macrotile extents, spilled to scalar arrays for the loop below
-        OSALIGNSIMD16(uint32_t)
-        aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
-
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin);
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax);
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin);
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax);
-
-        // store render target array index
-        const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
-
-        OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
-        SIMD_T::store_ps(reinterpret_cast<float*>(aPointSize), vPointSize);
-
-        uint32_t* pPrimID = (uint32_t*)&primID;
-
-        OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
-        OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
-        OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
-
-        SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsX), primVerts.x);
-        SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsY), primVerts.y);
-        SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsZ), primVerts.z);
-
-        // scan remaining valid prims and bin each separately
-        const SWR_BACKEND_STATE& backendState = state.backendState;
-        uint32_t                 primIndex;
-        while (_BitScanForward((unsigned long*)&primIndex, primMask))
-        {
-            uint32_t linkageCount     = backendState.numAttributes;
-            uint32_t numScalarAttribs = linkageCount * 4;
-
-            BE_WORK work;
-            work.type = DRAW;
-
-            TRIANGLE_WORK_DESC& desc = work.desc.tri;
-
-            desc.triFlags.frontFacing            = 1;
-            desc.triFlags.pointSize              = aPointSize[primIndex];
-            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
-            desc.triFlags.viewportIndex          = pViewportIndex[primIndex];
-
-            work.pfnWork = RasterizeTriPoint;
-
-            auto pArena = pDC->pArena;
-            SWR_ASSERT(pArena != nullptr);
-
-            // store active attribs
-            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
-            desc.numAttribs = linkageCount;
-            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
-
-            // store point vertex data
-            float* pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
-            desc.pTriBuffer   = pTriBuffer;
-            *pTriBuffer++     = aPrimVertsX[primIndex];
-            *pTriBuffer++     = aPrimVertsY[primIndex];
-            *pTriBuffer       = aPrimVertsZ[primIndex];
-
-            // store user clip distances
-            if (backendState.clipDistanceMask)
-            {
-                uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask);
-                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
-                float dists[8];
-                float one = 1.0f;
-                ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
-                // expand each distance into a 3-float record: two zeros followed by the distance
-                for (uint32_t i = 0; i < numClipDist; i++)
-                {
-                    desc.pUserClipBuffer[3 * i + 0] = 0.0f;
-                    desc.pUserClipBuffer[3 * i + 1] = 0.0f;
-                    desc.pUserClipBuffer[3 * i + 2] = dists[i];
-                }
-            }
-
-            // enqueue the same work item to every macrotile covered by the bbox
-            MacroTileMgr* pTileMgr = pDC->pTileMgr;
-            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
-            {
-                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
-                {
-#if KNOB_ENABLE_TOSS_POINTS
-                    if (!KNOB_TOSS_SETUP_TRIS)
-#endif
-                    {
-                        pTileMgr->enqueue(x, y, &work);
-                    }
-                }
-            }
-
-            // clear the bit just handled so the next scan finds the next active point
-            primMask &= ~(1 << primIndex);
-        }
-    }
-
-    RDTSC_END(pDC->pContext->pBucketMgr, FEBinPoints, 1);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin SIMD points to the backend. Unless the frontend state has
-///        vpTransformDisable set, performs the perspective divide and the
-///        viewport transform on prim[0]; then adds the pixel-location
-///        offset to x and y and hands off to BinPostSetupPointsImpl.
-///        prim[0] is modified in place.
-///        NOTE(review): the original comment said "Only supports point
-///        size of 1", but BinPostSetupPointsImpl also has a path for
-///        larger points -- confirm whether that restriction still holds.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Each thread has a unique id.
-/// @param prim - Contains point position data for SIMDs worth of points.
-/// @param primMask - bit mask of active points, one bit per SIMD lane.
-/// @param primID - Primitive ID for each point.
-/// @param viewportIdx - Viewport Array Index for each point.
-/// @param rtIdx - Render Target Array Index for each point.
-template <typename SIMD_T, uint32_t SIMD_WIDTH>
-void BinPointsImpl(DRAW_CONTEXT*          pDC,
-                   PA_STATE&              pa,
-                   uint32_t               workerId,
-                   Vec4<SIMD_T>           prim[3],
-                   uint32_t               primMask,
-                   Integer<SIMD_T> const& primID,
-                   Integer<SIMD_T> const& viewportIdx,
-                   Integer<SIMD_T> const& rtIdx)
-{
-    const API_STATE&          state     = GetApiState(pDC);
-    const SWR_FRONTEND_STATE& feState   = state.frontendState;
-    const SWR_RASTSTATE&      rastState = state.rastState;
-
-    if (!feState.vpTransformDisable)
-    {
-        // perspective divide
-        Float<SIMD_T> vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
-
-        prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
-        prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
-        prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
-
-        // viewport transform to screen coords
-        if (pa.viewportArrayActive)
-        {
-            // per-lane viewport selection
-            viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
-        }
-        else
-        {
-            viewportTransform<1>(prim, state.vpMatrices);
-        }
-    }
-
-    // shift x/y by the offset that corresponds to the configured pixel location
-    Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
-
-    prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
-    prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
-
-    BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
-        pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SIMD256 entry point for point binning. Forwards every argument
-///        unchanged to BinPointsImpl instantiated with SIMD256 and
-///        KNOB_SIMD_WIDTH.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id.
-/// @param prim - Contains point position data for SIMDs worth of points.
-/// @param primMask - bit mask of active points, one bit per SIMD lane.
-/// @param primID - Primitive ID for each point.
-/// @param viewportIdx - Viewport Array Index for each point.
-/// @param rtIdx - Render Target Array Index for each point.
-void BinPoints(DRAW_CONTEXT*      pDC,
-               PA_STATE&          pa,
-               uint32_t           workerId,
-               simdvector         prim[3],
-               uint32_t           primMask,
-               simdscalari const& primID,
-               simdscalari const& viewportIdx,
-               simdscalari const& rtIdx)
-{
-    BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
-        pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
-}
-
-#if USE_SIMD16_FRONTEND
-//////////////////////////////////////////////////////////////////////////
-/// @brief SIMD512 entry point for point binning (only compiled when
-///        USE_SIMD16_FRONTEND is set). Forwards every argument unchanged
-///        to BinPointsImpl instantiated with SIMD512 and
-///        KNOB_SIMD16_WIDTH.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id.
-/// @param prim - Contains point position data for SIMDs worth of points.
-/// @param primMask - bit mask of active points, one bit per SIMD lane.
-/// @param primID - Primitive ID for each point.
-/// @param viewportIdx - Viewport Array Index for each point.
-/// @param rtIdx - Render Target Array Index for each point.
-void SIMDCALL BinPoints_simd16(DRAW_CONTEXT*        pDC,
-                               PA_STATE&            pa,
-                               uint32_t             workerId,
-                               simd16vector         prim[3],
-                               uint32_t             primMask,
-                               simd16scalari const& primID,
-                               simd16scalari const& viewportIdx,
-                               simd16scalari const& rtIdx)
-{
-    BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(
-        pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
-}
-
-#endif
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin SIMD lines to the backend.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Each thread has a unique id.
-/// @param prim - Contains line position data for SIMDs worth of lines.
-/// @param primID - Primitive ID for each line.
-/// @param viewportIdx - Viewport Array Index for each line.
-template <typename SIMD_T, uint32_t SIMD_WIDTH>
-void BinPostSetupLinesImpl(DRAW_CONTEXT*          pDC,
-                           PA_STATE&              pa,
-                           uint32_t               workerId,
-                           Vec4<SIMD_T>           prim[],
-                           Float<SIMD_T>          recipW[],
-                           uint32_t               primMask,
-                           Integer<SIMD_T> const& primID,
-                           Integer<SIMD_T> const& viewportIdx,
-                           Integer<SIMD_T> const& rtIdx)
-{
-    const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
-
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEBinLines, pDC->drawId);
-
-    const API_STATE&     state     = GetApiState(pDC);
-    const SWR_RASTSTATE& rastState = state.rastState;
-
-    // Select attribute processor
-    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(
-        2, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
-
-    Float<SIMD_T>& vRecipW0 = recipW[0];
-    Float<SIMD_T>& vRecipW1 = recipW[1];
-
-    // convert to fixed point
-    Integer<SIMD_T> vXi[2], vYi[2];
-
-    vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
-    vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
-    vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
-    vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
-
-    // compute x-major vs y-major mask
-    Integer<SIMD_T> xLength     = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
-    Integer<SIMD_T> yLength     = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
-    Float<SIMD_T>   vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
-    uint32_t        yMajorMask  = SIMD_T::movemask_ps(vYmajorMask);
-
-    // cull zero-length lines
-    Integer<SIMD_T> vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
-    vZeroLengthMask =
-        SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
-
-    primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
-
-    uint32_t*       pPrimID        = (uint32_t*)&primID;
-    const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx;
-
-    // Calc bounding box of lines
-    SIMDBBOX_T<SIMD_T> bbox;
-    bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
-    bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
-    bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
-    bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
-
-    // bloat bbox by line width along minor axis
-    Float<SIMD_T>   vHalfWidth  = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
-    Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
-
-    SIMDBBOX_T<SIMD_T> bloatBox;
-
-    bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
-    bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
-    bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
-    bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
-
-    bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
-    bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
-    bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
-    bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
-
-    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is
-    // exclusive.
-    {
-        Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
-
-        if (pa.viewportArrayActive)
-        {
-            GatherScissors(&state.scissorsInFixedPoint[0],
-                           pViewportIndex,
-                           scisXmin,
-                           scisYmin,
-                           scisXmax,
-                           scisYmax);
-        }
-        else // broadcast fast path for non-VPAI case.
-        {
-            scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
-            scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
-            scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
-            scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
-        }
-
-        bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
-        bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
-        bbox.xmax =
-            SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
-        bbox.ymax =
-            SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
-    }
-
-    // Cull prims completely outside scissor
-    {
-        Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
-        Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
-        Integer<SIMD_T> maskOutsideScissorXY =
-            SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
-        uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
-        primMask                    = primMask & ~maskOutsideScissor;
-    }
-
-    // transpose verts needed for backend
-    /// @todo modify BE to take non-transformed verts
-    OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
-    OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
-    OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
-    OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
-
-    if (!primMask)
-    {
-        goto endBinLines;
-    }
-
-    // Convert triangle bbox to macrotile units.
-    bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
-    bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
-    bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
-    bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
-
-    OSALIGNSIMD16(uint32_t)
-    aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
-
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin);
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax);
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin);
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax);
-
-    TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
-    TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
-    TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
-    TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
-
-    // scan remaining valid prims and bin each separately
-    unsigned long primIndex;
-    while (_BitScanForward(&primIndex, primMask))
-    {
-        uint32_t linkageCount     = state.backendState.numAttributes;
-        uint32_t numScalarAttribs = linkageCount * 4;
-
-        BE_WORK work;
-        work.type = DRAW;
-
-        TRIANGLE_WORK_DESC& desc = work.desc.tri;
-
-        desc.triFlags.frontFacing            = 1;
-        desc.triFlags.yMajor                 = (yMajorMask >> primIndex) & 1;
-        desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
-        desc.triFlags.viewportIndex          = pViewportIndex[primIndex];
-
-        work.pfnWork = RasterizeLine;
-
-        auto pArena = pDC->pArena;
-        SWR_ASSERT(pArena != nullptr);
-
-        // store active attribs
-        desc.pAttribs   = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
-        desc.numAttribs = linkageCount;
-        pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
-
-        // store line vertex data
-        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
-
-        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
-
-        // store user clip distances
-        if (state.backendState.clipDistanceMask)
-        {
-            uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
-            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
-            ProcessUserClipDist<2>(
-                state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
-        }
-
-        MacroTileMgr* pTileMgr = pDC->pTileMgr;
-        for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
-        {
-            for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
-            {
-#if KNOB_ENABLE_TOSS_POINTS
-                if (!KNOB_TOSS_SETUP_TRIS)
-#endif
-                {
-                    pTileMgr->enqueue(x, y, &work);
-                }
-            }
-        }
-
-        primMask &= ~(1 << primIndex);
-    }
-
-endBinLines:
-
-    RDTSC_END(pDC->pContext->pBucketMgr, FEBinLines, 1);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin SIMD lines to the backend.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param tri - Contains line position data for SIMDs worth of points.
-/// @param primID - Primitive ID for each line.
-/// @param viewportIdx - Viewport Array Index for each line.
-template <typename SIMD_T, uint32_t SIMD_WIDTH>
-void SIMDCALL BinLinesImpl(DRAW_CONTEXT*          pDC,
-                           PA_STATE&              pa,
-                           uint32_t               workerId,
-                           Vec4<SIMD_T>           prim[3],
-                           uint32_t               primMask,
-                           Integer<SIMD_T> const& primID,
-                           Integer<SIMD_T> const& viewportIdx,
-                           Integer<SIMD_T> const& rtIdx)
-{
-    const API_STATE&          state     = GetApiState(pDC);
-    const SWR_RASTSTATE&      rastState = state.rastState;
-    const SWR_FRONTEND_STATE& feState   = state.frontendState;
-
-    Float<SIMD_T> vRecipW[2] = {SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f)};
-
-    if (!feState.vpTransformDisable)
-    {
-        // perspective divide
-        vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
-        vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
-
-        prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
-        prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
-
-        prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
-        prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
-
-        prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
-        prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
-
-        // viewport transform to screen coords
-        if (pa.viewportArrayActive)
-        {
-            viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
-        }
-        else
-        {
-            viewportTransform<2>(prim, state.vpMatrices);
-        }
-    }
-
-    // adjust for pixel center location
-    Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
-
-    prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
-    prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
-
-    prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
-    prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
-
-    BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
-        pDC, pa, workerId, prim, vRecipW, primMask, primID, viewportIdx, rtIdx);
-}
-
-void BinLines(DRAW_CONTEXT*      pDC,
-              PA_STATE&          pa,
-              uint32_t           workerId,
-              simdvector         prim[],
-              uint32_t           primMask,
-              simdscalari const& primID,
-              simdscalari const& viewportIdx,
-              simdscalari const& rtIdx)
-{
-    BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(
-        pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
-}
-
-#if USE_SIMD16_FRONTEND
-void SIMDCALL BinLines_simd16(DRAW_CONTEXT*        pDC,
-                              PA_STATE&            pa,
-                              uint32_t             workerId,
-                              simd16vector         prim[3],
-                              uint32_t             primMask,
-                              simd16scalari const& primID,
-                              simd16scalari const& viewportIdx,
-                              simd16scalari const& rtIdx)
-{
-    BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(
-        pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
-}
-
-#endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.h b/src/gallium/drivers/swr/rasterizer/core/binner.h
deleted file mode 100644
index 63be8f67cbf..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/binner.h
+++ /dev/null
@@ -1,254 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file binner.h
- *
- * @brief Declaration for the macrotile binner
- *
- ******************************************************************************/
-#include "state.h"
-#include "conservativeRast.h"
-#include "utils.h"
-//////////////////////////////////////////////////////////////////////////
-/// @brief Offsets added to post-viewport vertex positions based on
-/// raster state.
-///
-/// Can't use templated variable because we must stick with C++11 features.
-/// Template variables were introduced with C++14
-template <typename SIMD_T>
-struct SwrPixelOffsets
-{
-public:
-    INLINE static Float<SIMD_T> GetOffset(uint32_t loc)
-    {
-        SWR_ASSERT(loc <= 1);
-
-        return SIMD_T::set1_ps(loc ? 0.5f : 0.0f);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Convert the X,Y coords of a triangle to the requested Fixed
-/// Point precision from FP32.
-template <typename SIMD_T, typename PT = FixedPointTraits<Fixed_16_8>>
-INLINE Integer<SIMD_T> fpToFixedPointVertical(const Float<SIMD_T>& vIn)
-{
-    return SIMD_T::cvtps_epi32(SIMD_T::mul_ps(vIn, SIMD_T::set1_ps(PT::ScaleT::value)));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Helper function to set the X,Y coords of a triangle to the
-/// requested Fixed Point precision from FP32.
-/// @param tri: simdvector[3] of FP triangle verts
-/// @param vXi: fixed point X coords of tri verts
-/// @param vYi: fixed point Y coords of tri verts
-template <typename SIMD_T>
-INLINE static void
-FPToFixedPoint(const Vec4<SIMD_T>* const tri, Integer<SIMD_T> (&vXi)[3], Integer<SIMD_T> (&vYi)[3])
-{
-    vXi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].x);
-    vYi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].y);
-    vXi[1] = fpToFixedPointVertical<SIMD_T>(tri[1].x);
-    vYi[1] = fpToFixedPointVertical<SIMD_T>(tri[1].y);
-    vXi[2] = fpToFixedPointVertical<SIMD_T>(tri[2].x);
-    vYi[2] = fpToFixedPointVertical<SIMD_T>(tri[2].y);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Calculate bounding box for current triangle
-/// @tparam CT: ConservativeRastFETraits type
-/// @param vX: fixed point X position for triangle verts
-/// @param vY: fixed point Y position for triangle verts
-/// @param bbox: fixed point bbox
-/// *Note*: expects vX, vY to be in the correct precision for the type
-/// of rasterization. This avoids unnecessary FP->fixed conversions.
-template <typename SIMD_T, typename CT>
-INLINE void calcBoundingBoxIntVertical(const Integer<SIMD_T> (&vX)[3],
-                                       const Integer<SIMD_T> (&vY)[3],
-                                       SIMDBBOX_T<SIMD_T>& bbox)
-{
-    Integer<SIMD_T> vMinX = vX[0];
-
-    vMinX = SIMD_T::min_epi32(vMinX, vX[1]);
-    vMinX = SIMD_T::min_epi32(vMinX, vX[2]);
-
-    Integer<SIMD_T> vMaxX = vX[0];
-
-    vMaxX = SIMD_T::max_epi32(vMaxX, vX[1]);
-    vMaxX = SIMD_T::max_epi32(vMaxX, vX[2]);
-
-    Integer<SIMD_T> vMinY = vY[0];
-
-    vMinY = SIMD_T::min_epi32(vMinY, vY[1]);
-    vMinY = SIMD_T::min_epi32(vMinY, vY[2]);
-
-    Integer<SIMD_T> vMaxY = vY[0];
-
-    vMaxY = SIMD_T::max_epi32(vMaxY, vY[1]);
-    vMaxY = SIMD_T::max_epi32(vMaxY, vY[2]);
-
-    if (CT::BoundingBoxOffsetT::value != 0)
-    {
-        /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative
-        /// rasterization expand bbox by 1/256; coverage will be correctly handled in the
-        /// rasterizer.
-
-        const Integer<SIMD_T> value = SIMD_T::set1_epi32(CT::BoundingBoxOffsetT::value);
-
-        vMinX = SIMD_T::sub_epi32(vMinX, value);
-        vMaxX = SIMD_T::add_epi32(vMaxX, value);
-        vMinY = SIMD_T::sub_epi32(vMinY, value);
-        vMaxY = SIMD_T::add_epi32(vMaxY, value);
-    }
-
-    bbox.xmin = vMinX;
-    bbox.xmax = vMaxX;
-    bbox.ymin = vMinY;
-    bbox.ymax = vMaxY;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief  Gather scissor rect data based on per-prim viewport indices.
-/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
-/// @param pViewportIndex - array of per-primitive viewport indexes.
-/// @param scisXmin - output vector of per-primitive scissor rect Xmin data.
-/// @param scisYmin - output vector of per-primitive scissor rect Ymin data.
-/// @param scisXmax - output vector of per-primitive scissor rect Xmax data.
-/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
-//
-/// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint,
-                           const uint32_t* pViewportIndex,
-                           simdscalari&    scisXmin,
-                           simdscalari&    scisYmin,
-                           simdscalari&    scisXmax,
-                           simdscalari&    scisYmax)
-{
-    scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmin,
-                               pScissorsInFixedPoint[pViewportIndex[6]].xmin,
-                               pScissorsInFixedPoint[pViewportIndex[5]].xmin,
-                               pScissorsInFixedPoint[pViewportIndex[4]].xmin,
-                               pScissorsInFixedPoint[pViewportIndex[3]].xmin,
-                               pScissorsInFixedPoint[pViewportIndex[2]].xmin,
-                               pScissorsInFixedPoint[pViewportIndex[1]].xmin,
-                               pScissorsInFixedPoint[pViewportIndex[0]].xmin);
-    scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymin,
-                               pScissorsInFixedPoint[pViewportIndex[6]].ymin,
-                               pScissorsInFixedPoint[pViewportIndex[5]].ymin,
-                               pScissorsInFixedPoint[pViewportIndex[4]].ymin,
-                               pScissorsInFixedPoint[pViewportIndex[3]].ymin,
-                               pScissorsInFixedPoint[pViewportIndex[2]].ymin,
-                               pScissorsInFixedPoint[pViewportIndex[1]].ymin,
-                               pScissorsInFixedPoint[pViewportIndex[0]].ymin);
-    scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmax,
-                               pScissorsInFixedPoint[pViewportIndex[6]].xmax,
-                               pScissorsInFixedPoint[pViewportIndex[5]].xmax,
-                               pScissorsInFixedPoint[pViewportIndex[4]].xmax,
-                               pScissorsInFixedPoint[pViewportIndex[3]].xmax,
-                               pScissorsInFixedPoint[pViewportIndex[2]].xmax,
-                               pScissorsInFixedPoint[pViewportIndex[1]].xmax,
-                               pScissorsInFixedPoint[pViewportIndex[0]].xmax);
-    scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymax,
-                               pScissorsInFixedPoint[pViewportIndex[6]].ymax,
-                               pScissorsInFixedPoint[pViewportIndex[5]].ymax,
-                               pScissorsInFixedPoint[pViewportIndex[4]].ymax,
-                               pScissorsInFixedPoint[pViewportIndex[3]].ymax,
-                               pScissorsInFixedPoint[pViewportIndex[2]].ymax,
-                               pScissorsInFixedPoint[pViewportIndex[1]].ymax,
-                               pScissorsInFixedPoint[pViewportIndex[0]].ymax);
-}
-
-static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint,
-                           const uint32_t* pViewportIndex,
-                           simd16scalari&  scisXmin,
-                           simd16scalari&  scisYmin,
-                           simd16scalari&  scisXmax,
-                           simd16scalari&  scisYmax)
-{
-    scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[14]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[13]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[12]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[11]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[10]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[9]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[8]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[7]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
-                                 pScissorsInFixedPoint[pViewportIndex[0]].xmin);
-
-    scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[14]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[13]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[12]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[11]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[10]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[9]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[8]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[7]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
-                                 pScissorsInFixedPoint[pViewportIndex[0]].ymin);
-
-    scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[14]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[13]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[12]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[11]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[10]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[9]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[8]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[7]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
-                                 pScissorsInFixedPoint[pViewportIndex[0]].xmax);
-
-    scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[14]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[13]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[12]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[11]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[10]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[9]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[8]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[7]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
-                                 pScissorsInFixedPoint[pViewportIndex[0]].ymax);
-}
-\ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/core/blend.h b/src/gallium/drivers/swr/rasterizer/core/blend.h
deleted file mode 100644
index 7b2f77985f8..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/blend.h
+++ /dev/null
@@ -1,348 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file blend.cpp
- *
- * @brief Implementation for blending operations.
- *
- ******************************************************************************/
-#include "state.h"
-
-template <bool Color, bool Alpha>
-INLINE void GenerateBlendFactor(SWR_BLEND_FACTOR func,
-                                simdvector&      constantColor,
-                                simdvector&      src,
-                                simdvector&      src1,
-                                simdvector&      dst,
-                                simdvector&      out)
-{
-    simdvector result;
-
-    switch (func)
-    {
-    case BLENDFACTOR_ZERO:
-        result.x = _simd_setzero_ps();
-        result.y = _simd_setzero_ps();
-        result.z = _simd_setzero_ps();
-        result.w = _simd_setzero_ps();
-        break;
-
-    case BLENDFACTOR_ONE:
-        result.x = _simd_set1_ps(1.0);
-        result.y = _simd_set1_ps(1.0);
-        result.z = _simd_set1_ps(1.0);
-        result.w = _simd_set1_ps(1.0);
-        break;
-
-    case BLENDFACTOR_SRC_COLOR:
-        result = src;
-        break;
-
-    case BLENDFACTOR_DST_COLOR:
-        result = dst;
-        break;
-
-    case BLENDFACTOR_INV_SRC_COLOR:
-        result.x = _simd_sub_ps(_simd_set1_ps(1.0), src.x);
-        result.y = _simd_sub_ps(_simd_set1_ps(1.0), src.y);
-        result.z = _simd_sub_ps(_simd_set1_ps(1.0), src.z);
-        result.w = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
-        break;
-
-    case BLENDFACTOR_INV_DST_COLOR:
-        result.x = _simd_sub_ps(_simd_set1_ps(1.0), dst.x);
-        result.y = _simd_sub_ps(_simd_set1_ps(1.0), dst.y);
-        result.z = _simd_sub_ps(_simd_set1_ps(1.0), dst.z);
-        result.w = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
-        break;
-
-    case BLENDFACTOR_SRC_ALPHA:
-        result.x = src.w;
-        result.y = src.w;
-        result.z = src.w;
-        result.w = src.w;
-        break;
-
-    case BLENDFACTOR_INV_SRC_ALPHA:
-    {
-        simdscalar oneMinusSrcA = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
-        result.x                = oneMinusSrcA;
-        result.y                = oneMinusSrcA;
-        result.z                = oneMinusSrcA;
-        result.w                = oneMinusSrcA;
-        break;
-    }
-
-    case BLENDFACTOR_DST_ALPHA:
-        result.x = dst.w;
-        result.y = dst.w;
-        result.z = dst.w;
-        result.w = dst.w;
-        break;
-
-    case BLENDFACTOR_INV_DST_ALPHA:
-    {
-        simdscalar oneMinusDstA = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
-        result.x                = oneMinusDstA;
-        result.y                = oneMinusDstA;
-        result.z                = oneMinusDstA;
-        result.w                = oneMinusDstA;
-        break;
-    }
-
-    case BLENDFACTOR_SRC_ALPHA_SATURATE:
-    {
-        simdscalar sat = _simd_min_ps(src.w, _simd_sub_ps(_simd_set1_ps(1.0), dst.w));
-        result.x       = sat;
-        result.y       = sat;
-        result.z       = sat;
-        result.w       = _simd_set1_ps(1.0);
-        break;
-    }
-
-    case BLENDFACTOR_CONST_COLOR:
-        result.x = constantColor[0];
-        result.y = constantColor[1];
-        result.z = constantColor[2];
-        result.w = constantColor[3];
-        break;
-
-    case BLENDFACTOR_CONST_ALPHA:
-        result.x = result.y = result.z = result.w = constantColor[3];
-        break;
-
-    case BLENDFACTOR_INV_CONST_COLOR:
-    {
-        result.x = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[0]);
-        result.y = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[1]);
-        result.z = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[2]);
-        result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
-        break;
-    }
-
-    case BLENDFACTOR_INV_CONST_ALPHA:
-    {
-        result.x = result.y = result.z = result.w =
-            _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
-        break;
-    }
-
-    case BLENDFACTOR_SRC1_COLOR:
-        result.x = src1.x;
-        result.y = src1.y;
-        result.z = src1.z;
-        result.w = src1.w;
-        break;
-
-    case BLENDFACTOR_SRC1_ALPHA:
-        result.x = result.y = result.z = result.w = src1.w;
-        break;
-
-    case BLENDFACTOR_INV_SRC1_COLOR:
-        result.x = _simd_sub_ps(_simd_set1_ps(1.0f), src1.x);
-        result.y = _simd_sub_ps(_simd_set1_ps(1.0f), src1.y);
-        result.z = _simd_sub_ps(_simd_set1_ps(1.0f), src1.z);
-        result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w);
-        break;
-
-    case BLENDFACTOR_INV_SRC1_ALPHA:
-        result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w);
-        break;
-
-    default:
-        SWR_INVALID("Unimplemented blend factor: %d", func);
-    }
-
-    if (Color)
-    {
-        out.x = result.x;
-        out.y = result.y;
-        out.z = result.z;
-    }
-    if (Alpha)
-    {
-        out.w = result.w;
-    }
-}
-
-template <bool Color, bool Alpha>
-INLINE void BlendFunc(SWR_BLEND_OP blendOp,
-                      simdvector&  src,
-                      simdvector&  srcFactor,
-                      simdvector&  dst,
-                      simdvector&  dstFactor,
-                      simdvector&  out)
-{
-    simdvector result;
-
-    switch (blendOp)
-    {
-    case BLENDOP_ADD:
-        result.x = _simd_fmadd_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
-        result.y = _simd_fmadd_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
-        result.z = _simd_fmadd_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
-        result.w = _simd_fmadd_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
-        break;
-
-    case BLENDOP_SUBTRACT:
-        result.x = _simd_fmsub_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
-        result.y = _simd_fmsub_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
-        result.z = _simd_fmsub_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
-        result.w = _simd_fmsub_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
-        break;
-
-    case BLENDOP_REVSUBTRACT:
-        result.x = _simd_fmsub_ps(dstFactor.x, dst.x, _simd_mul_ps(srcFactor.x, src.x));
-        result.y = _simd_fmsub_ps(dstFactor.y, dst.y, _simd_mul_ps(srcFactor.y, src.y));
-        result.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z));
-        result.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w));
-        break;
-
-    case BLENDOP_MIN:
-        result.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
-        result.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
-        result.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
-        result.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
-        break;
-
-    case BLENDOP_MAX:
-        result.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
-        result.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
-        result.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
-        result.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
-        break;
-
-    default:
-        SWR_INVALID("Unimplemented blend function: %d", blendOp);
-    }
-
-    if (Color)
-    {
-        out.x = result.x;
-        out.y = result.y;
-        out.z = result.z;
-    }
-    if (Alpha)
-    {
-        out.w = result.w;
-    }
-}
-
-template <SWR_TYPE type>
-INLINE void Clamp(simdvector& src)
-{
-    switch (type)
-    {
-    case SWR_TYPE_FLOAT:
-        break;
-
-    case SWR_TYPE_UNORM:
-        src.x = _simd_max_ps(src.x, _simd_setzero_ps());
-        src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f));
-
-        src.y = _simd_max_ps(src.y, _simd_setzero_ps());
-        src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f));
-
-        src.z = _simd_max_ps(src.z, _simd_setzero_ps());
-        src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f));
-
-        src.w = _simd_max_ps(src.w, _simd_setzero_ps());
-        src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f));
-        break;
-
-    case SWR_TYPE_SNORM:
-        src.x = _simd_max_ps(src.x, _simd_set1_ps(-1.0f));
-        src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f));
-
-        src.y = _simd_max_ps(src.y, _simd_set1_ps(-1.0f));
-        src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f));
-
-        src.z = _simd_max_ps(src.z, _simd_set1_ps(-1.0f));
-        src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f));
-
-        src.w = _simd_max_ps(src.w, _simd_set1_ps(-1.0f));
-        src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f));
-        break;
-
-    default:
-        SWR_INVALID("Unimplemented clamp: %d", type);
-        break;
-    }
-}
-
-template <SWR_TYPE type>
-void Blend(const SWR_BLEND_STATE*               pBlendState,
-           const SWR_RENDER_TARGET_BLEND_STATE* pState,
-           simdvector&                          src,
-           simdvector&                          src1,
-           uint8_t*                             pDst,
-           simdvector&                          result)
-{
-    // load render target
-    simdvector dst;
-    LoadSOA<KNOB_COLOR_HOT_TILE_FORMAT>(pDst, dst);
-
-    simdvector constColor;
-    constColor.x = _simd_broadcast_ss(&pBlendState->constantColor[0]);
-    constColor.y = _simd_broadcast_ss(&pBlendState->constantColor[1]);
-    constColor.z = _simd_broadcast_ss(&pBlendState->constantColor[2]);
-    constColor.w = _simd_broadcast_ss(&pBlendState->constantColor[3]);
-
-    // clamp src/dst/constant
-    Clamp<type>(src);
-    Clamp<type>(src1);
-    Clamp<type>(dst);
-    Clamp<type>(constColor);
-
-    simdvector srcFactor, dstFactor;
-    if (pBlendState->independentAlphaBlendEnable)
-    {
-        GenerateBlendFactor<true, false>(
-            (SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
-        GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor,
-                                         constColor,
-                                         src,
-                                         src1,
-                                         dst,
-                                         srcFactor);
-
-        GenerateBlendFactor<true, false>(
-            (SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
-        GenerateBlendFactor<false, true>(
-            (SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor);
-
-        BlendFunc<true, false>(
-            (SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
-        BlendFunc<false, true>(
-            (SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
-    }
-    else
-    {
-        GenerateBlendFactor<true, true>(
-            (SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
-        GenerateBlendFactor<true, true>(
-            (SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
-
-        BlendFunc<true, true>(
-            (SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
-    }
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
deleted file mode 100644
index c399caf239b..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ /dev/null
@@ -1,336 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file clip.cpp
- *
- * @brief Implementation for clipping
- *
- ******************************************************************************/
-
-#include <assert.h>
-
-#include "common/os.h"
-#include "core/clip.h"
-
-float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
-{
-    return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1));
-}
-
-template <SWR_CLIPCODES ClippingPlane>
-inline void intersect(
-    int          s,          // index to first edge vertex v0 in pInPts.
-    int          p,          // index to second edge vertex v1 in pInPts.
-    const float* pInPts,     // array of all the input positions.
-    const float* pInAttribs, // array of all attributes for all vertex. All the attributes for each
-                             // vertex is contiguous.
-    int    numInAttribs,     // number of attributes per vertex.
-    int    i,                // output index.
-    float* pOutPts,     // array of output positions. We'll write our new intersection point at i*4.
-    float* pOutAttribs) // array of output attributes. We'll write our new attributes at
-                        // i*numInAttribs.
-{
-    float t;
-
-    // Find the parameter of the intersection.
-    //        t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc.
-    const float* v1 = &pInPts[s * 4];
-    const float* v2 = &pInPts[p * 4];
-
-    switch (ClippingPlane)
-    {
-    case FRUSTUM_LEFT:
-        t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]);
-        break;
-    case FRUSTUM_RIGHT:
-        t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]);
-        break;
-    case FRUSTUM_TOP:
-        t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]);
-        break;
-    case FRUSTUM_BOTTOM:
-        t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]);
-        break;
-    case FRUSTUM_NEAR:
-        t = ComputeInterpFactor(v1[2], v2[2]);
-        break;
-    case FRUSTUM_FAR:
-        t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]);
-        break;
-    default:
-        SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
-    };
-
-    const float* a1 = &pInAttribs[s * numInAttribs];
-    const float* a2 = &pInAttribs[p * numInAttribs];
-
-    float* pOutP = &pOutPts[i * 4];
-    float* pOutA = &pOutAttribs[i * numInAttribs];
-
-    // Interpolate new position.
-    for (int j = 0; j < 4; ++j)
-    {
-        pOutP[j] = v1[j] + (v2[j] - v1[j]) * t;
-    }
-
-    // Interpolate Attributes
-    for (int attr = 0; attr < numInAttribs; ++attr)
-    {
-        pOutA[attr] = a1[attr] + (a2[attr] - a1[attr]) * t;
-    }
-}
-
-// Checks whether vertex v lies inside clipping plane
-// in homogenous coords check -w < {x,y,z} < w;
-//
-template <SWR_CLIPCODES ClippingPlane>
-inline int inside(const float v[4])
-{
-    switch (ClippingPlane)
-    {
-    case FRUSTUM_LEFT:
-        return (v[0] >= -v[3]);
-    case FRUSTUM_RIGHT:
-        return (v[0] <= v[3]);
-    case FRUSTUM_TOP:
-        return (v[1] >= -v[3]);
-    case FRUSTUM_BOTTOM:
-        return (v[1] <= v[3]);
-    case FRUSTUM_NEAR:
-        return (v[2] >= 0.0f);
-    case FRUSTUM_FAR:
-        return (v[2] <= v[3]);
-    default:
-        SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
-        return 0;
-    }
-}
-
-// Clips a polygon in homogenous coordinates to a particular clipping plane.
-// Takes in vertices of the polygon (InPts) and the clipping plane
-// Puts the vertices of the clipped polygon in OutPts
-// Returns number of points in clipped polygon
-//
-template <SWR_CLIPCODES ClippingPlane>
-int ClipTriToPlane(const float* pInPts,
-                   int          numInPts,
-                   const float* pInAttribs,
-                   int          numInAttribs,
-                   float*       pOutPts,
-                   float*       pOutAttribs)
-{
-    int i = 0; // index number of OutPts, # of vertices in OutPts = i div 4;
-
-    for (int j = 0; j < numInPts; ++j)
-    {
-        int s = j;
-        int p = (j + 1) % numInPts;
-
-        int s_in = inside<ClippingPlane>(&pInPts[s * 4]);
-        int p_in = inside<ClippingPlane>(&pInPts[p * 4]);
-
-        // test if vertex is to be added to output vertices
-        if (s_in != p_in) // edge crosses clipping plane
-        {
-            // find point of intersection
-            intersect<ClippingPlane>(
-                s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs);
-            i++;
-        }
-        if (p_in) // 2nd vertex is inside clipping volume, add it to output
-        {
-            // Copy 2nd vertex position of edge over to output.
-            for (int k = 0; k < 4; ++k)
-            {
-                pOutPts[i * 4 + k] = pInPts[p * 4 + k];
-            }
-            // Copy 2nd vertex attributes of edge over to output.
-            for (int attr = 0; attr < numInAttribs; ++attr)
-            {
-                pOutAttribs[i * numInAttribs + attr] = pInAttribs[p * numInAttribs + attr];
-            }
-            i++;
-        }
-        // edge does not cross clipping plane and vertex outside clipping volume
-        //  => do not add vertex
-    }
-    return i;
-}
-
-void ClipRectangles(DRAW_CONTEXT*      pDC,
-                    PA_STATE&          pa,
-                    uint32_t           workerId,
-                    simdvector         prims[],
-                    uint32_t           primMask,
-                    simdscalari const& primId,
-                    simdscalari const& viewportIdx,
-                    simdscalari const& rtIdx)
-{
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipRectangles, pDC->drawId);
-    Clipper<SIMD256, 3> clipper(workerId, pDC);
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
-    RDTSC_END(pDC->pContext->pBucketMgr, FEClipRectangles, 1);
-}
-
-void ClipTriangles(DRAW_CONTEXT*      pDC,
-                   PA_STATE&          pa,
-                   uint32_t           workerId,
-                   simdvector         prims[],
-                   uint32_t           primMask,
-                   simdscalari const& primId,
-                   simdscalari const& viewportIdx,
-                   simdscalari const& rtIdx)
-{
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipTriangles, pDC->drawId);
-    Clipper<SIMD256, 3> clipper(workerId, pDC);
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
-    RDTSC_END(pDC->pContext->pBucketMgr, FEClipTriangles, 1);
-}
-
-void ClipLines(DRAW_CONTEXT*      pDC,
-               PA_STATE&          pa,
-               uint32_t           workerId,
-               simdvector         prims[],
-               uint32_t           primMask,
-               simdscalari const& primId,
-               simdscalari const& viewportIdx,
-               simdscalari const& rtIdx)
-{
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipLines, pDC->drawId);
-    Clipper<SIMD256, 2> clipper(workerId, pDC);
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
-    RDTSC_END(pDC->pContext->pBucketMgr, FEClipLines, 1);
-}
-
-void ClipPoints(DRAW_CONTEXT*      pDC,
-                PA_STATE&          pa,
-                uint32_t           workerId,
-                simdvector         prims[],
-                uint32_t           primMask,
-                simdscalari const& primId,
-                simdscalari const& viewportIdx,
-                simdscalari const& rtIdx)
-{
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipPoints, pDC->drawId);
-    Clipper<SIMD256, 1> clipper(workerId, pDC);
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
-    RDTSC_END(pDC->pContext->pBucketMgr, FEClipPoints, 1);
-}
-
-#if USE_SIMD16_FRONTEND
-void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT*        pDC,
-                                    PA_STATE&            pa,
-                                    uint32_t             workerId,
-                                    simd16vector         prims[],
-                                    uint32_t             primMask,
-                                    simd16scalari const& primId,
-                                    simd16scalari const& viewportIdx,
-                                    simd16scalari const& rtIdx)
-{
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipRectangles, pDC->drawId);
-
-    enum
-    {
-        VERTS_PER_PRIM = 3
-    };
-
-    Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
-
-    pa.useAlternateOffset = false;
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
-
-    RDTSC_END(pDC->pContext->pBucketMgr, FEClipRectangles, 1);
-}
-
-void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT*        pDC,
-                                   PA_STATE&            pa,
-                                   uint32_t             workerId,
-                                   simd16vector         prims[],
-                                   uint32_t             primMask,
-                                   simd16scalari const& primId,
-                                   simd16scalari const& viewportIdx,
-                                   simd16scalari const& rtIdx)
-{
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipTriangles, pDC->drawId);
-
-    enum
-    {
-        VERTS_PER_PRIM = 3
-    };
-
-    Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
-
-    pa.useAlternateOffset = false;
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
-
-    RDTSC_END(pDC->pContext->pBucketMgr, FEClipTriangles, 1);
-}
-
-void SIMDCALL ClipLines_simd16(DRAW_CONTEXT*        pDC,
-                               PA_STATE&            pa,
-                               uint32_t             workerId,
-                               simd16vector         prims[],
-                               uint32_t             primMask,
-                               simd16scalari const& primId,
-                               simd16scalari const& viewportIdx,
-                               simd16scalari const& rtIdx)
-{
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipLines, pDC->drawId);
-
-    enum
-    {
-        VERTS_PER_PRIM = 2
-    };
-
-    Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
-
-    pa.useAlternateOffset = false;
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
-
-    RDTSC_END(pDC->pContext->pBucketMgr, FEClipLines, 1);
-}
-
-void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT*        pDC,
-                                PA_STATE&            pa,
-                                uint32_t             workerId,
-                                simd16vector         prims[],
-                                uint32_t             primMask,
-                                simd16scalari const& primId,
-                                simd16scalari const& viewportIdx,
-                                simd16scalari const& rtIdx)
-{
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipPoints, pDC->drawId);
-
-    enum
-    {
-        VERTS_PER_PRIM = 1
-    };
-
-    Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
-
-    pa.useAlternateOffset = false;
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
-
-    RDTSC_END(pDC->pContext->pBucketMgr, FEClipPoints, 1);
-}
-
-#endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
deleted file mode 100644
index d7186ca10b1..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ /dev/null
@@ -1,1361 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file clip.h
- *
- * @brief Definitions for clipping
- *
- ******************************************************************************/
-#pragma once
-
-#include "common/simdintrin.h"
-#include "core/context.h"
-#include "core/pa.h"
-#include "rdtsc_core.h"
-
-enum SWR_CLIPCODES
-{
-// Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
-// Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union,
-// rather than intersection, of clipcodes.
-#define CLIPCODE_SHIFT 23
-    FRUSTUM_LEFT   = (0x01 << CLIPCODE_SHIFT),
-    FRUSTUM_TOP    = (0x02 << CLIPCODE_SHIFT),
-    FRUSTUM_RIGHT  = (0x04 << CLIPCODE_SHIFT),
-    FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT),
-
-    FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT),
-    FRUSTUM_FAR  = (0x20 << CLIPCODE_SHIFT),
-
-    NEGW = (0x40 << CLIPCODE_SHIFT),
-
-    GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
-    GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
-    GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
-    GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
-};
-
-#define GUARDBAND_CLIP_MASK                                                          \
-    (FRUSTUM_NEAR | FRUSTUM_FAR | GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | \
-     GUARDBAND_BOTTOM | NEGW)
-#define FRUSTUM_CLIP_MASK \
-    (FRUSTUM_NEAR | FRUSTUM_FAR | FRUSTUM_LEFT | FRUSTUM_RIGHT | FRUSTUM_TOP | FRUSTUM_BOTTOM)
-
-template <typename SIMD_T>
-void ComputeClipCodes(const API_STATE&       state,
-                      const Vec4<SIMD_T>&    vertex,
-                      Float<SIMD_T>&         clipCodes,
-                      Integer<SIMD_T> const& viewportIndexes)
-{
-    clipCodes = SIMD_T::setzero_ps();
-
-    // -w
-    Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w, SIMD_T::set1_ps(-1.0f));
-
-    // FRUSTUM_LEFT
-    Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
-    clipCodes          = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
-
-    // FRUSTUM_TOP
-    vRes      = SIMD_T::cmplt_ps(vertex.y, vNegW);
-    clipCodes = SIMD_T::or_ps(
-        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
-
-    // FRUSTUM_RIGHT
-    vRes      = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
-    clipCodes = SIMD_T::or_ps(
-        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
-
-    // FRUSTUM_BOTTOM
-    vRes      = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
-    clipCodes = SIMD_T::or_ps(
-        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
-
-    if (state.rastState.depthClipEnable)
-    {
-        // FRUSTUM_NEAR
-        // DX clips depth [0..w], GL clips [-w..w]
-        if (state.rastState.clipHalfZ)
-        {
-            vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
-        }
-        else
-        {
-            vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
-        }
-        clipCodes = SIMD_T::or_ps(
-            clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
-
-        // FRUSTUM_FAR
-        vRes      = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
-        clipCodes = SIMD_T::or_ps(
-            clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
-    }
-
-    // NEGW
-    vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
-    clipCodes =
-        SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
-
-    // GUARDBAND_LEFT
-    Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW,
-                                          SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
-                                              &state.gbState.left[0], viewportIndexes));
-    vRes                 = SIMD_T::cmplt_ps(vertex.x, gbMult);
-    clipCodes            = SIMD_T::or_ps(
-        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
-
-    // GUARDBAND_TOP
-    gbMult    = SIMD_T::mul_ps(vNegW,
-                            SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
-                                &state.gbState.top[0], viewportIndexes));
-    vRes      = SIMD_T::cmplt_ps(vertex.y, gbMult);
-    clipCodes = SIMD_T::or_ps(
-        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
-
-    // GUARDBAND_RIGHT
-    gbMult    = SIMD_T::mul_ps(vertex.w,
-                            SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
-                                &state.gbState.right[0], viewportIndexes));
-    vRes      = SIMD_T::cmpgt_ps(vertex.x, gbMult);
-    clipCodes = SIMD_T::or_ps(
-        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
-
-    // GUARDBAND_BOTTOM
-    gbMult    = SIMD_T::mul_ps(vertex.w,
-                            SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
-                                &state.gbState.bottom[0], viewportIndexes));
-    vRes      = SIMD_T::cmpgt_ps(vertex.y, gbMult);
-    clipCodes = SIMD_T::or_ps(
-        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
-}
-
-template <typename SIMD_T>
-struct BinnerChooser
-{
-};
-
-template <>
-struct BinnerChooser<SIMD256>
-{
-    PFN_PROCESS_PRIMS pfnBinFunc;
-
-    BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
-        :
-        pfnBinFunc(nullptr)
-    {
-        if (numVertsPerPrim == 3)
-        {
-            pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
-
-        }
-        else if (numVertsPerPrim == 2)
-        {
-            pfnBinFunc = BinLines;
-        }
-        else
-        {
-            SWR_ASSERT(0 && "Unexpected points in clipper.");
-        }
-    }
-
-    BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
-        :
-        pfnBinFunc(nullptr)
-    {
-        switch (topology)
-        {
-        case TOP_POINT_LIST:
-            pfnBinFunc = BinPoints;
-            break;
-        case TOP_LINE_LIST:
-        case TOP_LINE_STRIP:
-        case TOP_LINE_LOOP:
-        case TOP_LINE_LIST_ADJ:
-        case TOP_LISTSTRIP_ADJ:
-            pfnBinFunc = BinLines;
-            break;
-        default:
-            pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
-            break;
-        };
-    }
-
-    void BinFunc(DRAW_CONTEXT*           pDC,
-                 PA_STATE&               pa,
-                 uint32_t                workerId,
-                 SIMD256::Vec4           prims[],
-                 uint32_t                primMask,
-                 SIMD256::Integer const& primID,
-                 SIMD256::Integer&       viewportIdx,
-                 SIMD256::Integer&       rtIdx)
-    {
-        SWR_ASSERT(pfnBinFunc != nullptr);
-
-        pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
-    }
-};
-
-#if USE_SIMD16_FRONTEND
-template <>
-struct BinnerChooser<SIMD512>
-{
-    PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
-
-    BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
-        :
-        pfnBinFunc(nullptr)
-    {
-        if (numVertsPerPrim == 3)
-        {
-            pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
-
-        }
-        else if (numVertsPerPrim == 2)
-        {
-            pfnBinFunc = BinLines_simd16;
-        }
-        else
-        {
-            SWR_ASSERT(0 && "Unexpected points in clipper.");
-        }
-    }
-
-    BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
-        :
-        pfnBinFunc(nullptr)
-    {
-        switch (topology)
-        {
-        case TOP_POINT_LIST:
-            pfnBinFunc = BinPoints_simd16;
-            break;
-        case TOP_LINE_LIST:
-        case TOP_LINE_STRIP:
-        case TOP_LINE_LOOP:
-        case TOP_LINE_LIST_ADJ:
-        case TOP_LISTSTRIP_ADJ:
-            pfnBinFunc = BinLines_simd16;
-            break;
-        default:
-            pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
-            break;
-        };
-    }
-
-    void BinFunc(DRAW_CONTEXT*           pDC,
-                 PA_STATE&               pa,
-                 uint32_t                workerId,
-                 SIMD512::Vec4           prims[],
-                 uint32_t                primMask,
-                 SIMD512::Integer const& primID,
-                 SIMD512::Integer&       viewportIdx,
-                 SIMD512::Integer&       rtIdx)
-    {
-        SWR_ASSERT(pfnBinFunc != nullptr);
-
-        pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
-    }
-};
-
-#endif
-template <typename SIMD_T>
-struct SimdHelper
-{
-};
-
-template <>
-struct SimdHelper<SIMD256>
-{
-    static SIMD256::Float insert_lo_ps(SIMD256::Float a) { return a; }
-
-    static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
-    {
-        return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
-    }
-};
-
-#if USE_SIMD16_FRONTEND
-template <>
-struct SimdHelper<SIMD512>
-{
-    static SIMD512::Float insert_lo_ps(SIMD256::Float a)
-    {
-        return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
-    }
-
-    static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
-    {
-        return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
-    }
-};
-#endif
-
-template <typename SIMD_T, uint32_t NumVertsPerPrimT>
-class Clipper
-{
-public:
-    INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
-        workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
-    {
-        static_assert(NumVertsPerPrimT >= 1 && NumVertsPerPrimT <= 3, "Invalid NumVertsPerPrim");
-        THREAD_DATA &thread_data = in_pDC->pContext->threadPool.pThreadData[workerId];
-
-        if (thread_data.clipperData == nullptr)
-        {
-            // 7 vertex temp data
-            // 7 post-clipped vertices
-            // 2 transposed verts for binning
-            size_t alloc_size = sizeof(SIMDVERTEX_T<SIMD_T>) * (7 + 7 + 2);
-            thread_data.clipperData = AlignedMalloc(alloc_size, KNOB_SIMD16_BYTES);
-        }
-        SWR_ASSERT(thread_data.clipperData);
-
-        this->clippedVerts = (SIMDVERTEX_T<SIMD_T>*)thread_data.clipperData;
-        this->tmpVerts = this->clippedVerts + 7;
-        this->transposedVerts = this->tmpVerts + 7;
-    }
-
-    void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T>& viewportIndexes)
-    {
-        for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
-        {
-            ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
-        }
-    }
-
-    Float<SIMD_T> ComputeClipCodeIntersection()
-    {
-        Float<SIMD_T> result = clipCodes[0];
-
-        for (uint32_t i = 1; i < NumVertsPerPrimT; ++i)
-        {
-            result = SIMD_T::and_ps(result, clipCodes[i]);
-        }
-
-        return result;
-    }
-
-    Float<SIMD_T> ComputeClipCodeUnion()
-    {
-        Float<SIMD_T> result = clipCodes[0];
-
-        for (uint32_t i = 1; i < NumVertsPerPrimT; ++i)
-        {
-            result = SIMD_T::or_ps(result, clipCodes[i]);
-        }
-
-        return result;
-    }
-
-    int ComputeClipMask()
-    {
-        Float<SIMD_T> clipUnion = ComputeClipCodeUnion();
-
-        clipUnion =
-            SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
-
-        return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
-    }
-
-    // clipper is responsible for culling any prims with NAN coordinates
-    int ComputeNaNMask(Vec4<SIMD_T> prim[])
-    {
-        Float<SIMD_T> vNanMask = SIMD_T::setzero_ps();
-
-        for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
-        {
-            Float<SIMD_T> vNan01 =
-                SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
-            vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
-
-            Float<SIMD_T> vNan23 =
-                SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
-            vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
-        }
-
-        return SIMD_T::movemask_ps(vNanMask);
-    }
-
-    // Returns a bitmask (one bit per SIMD lane) of primitives rejected by the
-    // user cull/clip distances enabled in the backend state:
-    //   - cull distance: rejected when the distance is negative or NaN on every vertex
-    //   - clip distance: rejected when any vertex has a NaN distance, or when the
-    //     distance is negative (or NaN) on every vertex
-    // The distances are assembled from 'pa' at vertexClipCullOffset (first four) and
-    // vertexClipCullOffset + 1 (last four). 'prim' is not read.
-    int ComputeUserClipCullMask(PA_STATE& pa, Vec4<SIMD_T> prim[])
-    {
-        const uint32_t clipCullSlot = state.backendState.vertexClipCullOffset;
-        uint8_t        pendingCull  = state.backendState.cullDistanceMask;
-
-        Float<SIMD_T> vRejectMask = SIMD_T::setzero_ps();
-
-        // distances 0-3 and 4-7 for each vertex of the primitive
-        Vec4<SIMD_T> vDistLo[3];
-        Vec4<SIMD_T> vDistHi[3];
-
-        pa.Assemble(clipCullSlot, vDistLo);
-        pa.Assemble(clipCullSlot + 1, vDistHi);
-
-        unsigned long bit;
-
-        // ---- cull distances ----
-        while (_BitScanForward(&bit, pendingCull))
-        {
-            pendingCull &= ~(1 << bit);
-            const bool     inLoSlot = (bit >> 2) == 0;
-            const uint32_t comp     = bit & 0x3;
-
-            // NOTE(review): -1.0f is not an all-ones pattern; this presumably relies on
-            // movemask_ps only looking at the sign bit of each lane - confirm.
-            Float<SIMD_T> vCulledOnAllVerts = SIMD_T::set1_ps(-1.0f);
-            for (uint32_t vert = 0; vert < NumVertsPerPrimT; ++vert)
-            {
-                Float<SIMD_T> vDist = inLoSlot ? vDistLo[vert][comp] : vDistHi[vert][comp];
-
-                // cull if cull distance < 0 || NAN
-                vCulledOnAllVerts = SIMD_T::and_ps(
-                    vCulledOnAllVerts,
-                    SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(),
-                                                                         vDist));
-            }
-            vRejectMask = SIMD_T::or_ps(vRejectMask, vCulledOnAllVerts);
-        }
-
-        // ---- clip distances ----
-        // clipper should also discard any primitive with NAN clip distance
-        uint8_t pendingClip = state.backendState.clipDistanceMask;
-        while (_BitScanForward(&bit, pendingClip))
-        {
-            pendingClip &= ~(1 << bit);
-            const bool     inLoSlot = (bit >> 2) == 0;
-            const uint32_t comp     = bit & 0x3;
-
-            Float<SIMD_T> vOutsideOnAllVerts = SIMD_T::set1_ps(-1.0f);
-            for (uint32_t vert = 0; vert < NumVertsPerPrimT; ++vert)
-            {
-                Float<SIMD_T> vDist = inLoSlot ? vDistLo[vert][comp] : vDistHi[vert][comp];
-
-                // a NaN distance on any single vertex rejects the primitive outright
-                Float<SIMD_T> vIsNan =
-                    SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vDist, vDist);
-                Float<SIMD_T> vIsOutside =
-                    SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(),
-                                                                         vDist);
-
-                vOutsideOnAllVerts = SIMD_T::and_ps(vOutsideOnAllVerts, vIsOutside);
-                vRejectMask        = SIMD_T::or_ps(vRejectMask, vIsNan);
-            }
-            vRejectMask = SIMD_T::or_ps(vRejectMask, vOutsideOnAllVerts);
-        }
-
-        return SIMD_T::movemask_ps(vRejectMask);
-    }
-
-    // Clips a SIMD batch of primitives and sends the clipped results to the binner.
-    //
-    //   prim         - positions, one Vec4 per primitive vertex
-    //   vPrimMask    - forwarded to ClipPrims (ExecuteStage passes the mask of valid prims)
-    //   vClipMask    - forwarded to ClipPrims (ExecuteStage passes the mask of prims to clip)
-    //   pa           - primitive assembler the attributes / clip distances are read from
-    //   vPrimId, vViewportIdx, vRtIdx
-    //                - per-lane values; the value of each input lane is broadcast and handed
-    //                  to the binner together with that lane's clipped primitives
-    //
-    // Steps:
-    //   1. copy position, attributes and enabled clip distances into this->clippedVerts
-    //   2. run ClipPrims over that vertex store, yielding a per-lane clipped vertex count
-    //   3. for every input lane that still has enough vertices, gather its vertices into
-    //      this->transposedVerts, wrap them in a temporary PA_STATE_OPT and bin each
-    //      primitive that PA assembles
-    //   4. add the number of emitted primitives to the CPrimitives stat
-    void ClipSimd(const Vec4<SIMD_T>     prim[],
-                  const Float<SIMD_T>&   vPrimMask,
-                  const Float<SIMD_T>&   vClipMask,
-                  PA_STATE&              pa,
-                  const Integer<SIMD_T>& vPrimId,
-                  const Integer<SIMD_T>& vViewportIdx,
-                  const Integer<SIMD_T>& vRtIdx)
-    {
-        // input/output vertex store for clipper
-        SIMDVERTEX_T<SIMD_T>* vertices = this->clippedVerts;
-
-        uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
-        // provoking vertex stays 0 for every topology other than triangle fan
-        uint32_t provokingVertex    = 0;
-        if (pa.binTopology == TOP_TRIANGLE_FAN)
-        {
-            provokingVertex = state.frontendState.provokingVertex.triFan;
-        }
-        ///@todo: line topology for wireframe?
-
-        // assemble pos
-        Vec4<SIMD_T> tmpVector[NumVertsPerPrimT];
-        for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
-        {
-            vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
-        }
-
-        // assemble attribs
-        const SWR_BACKEND_STATE& backendState = state.backendState;
-
-        // highest source attribute slot referenced by any enabled attribute (-1 when none)
-        int32_t maxSlot = -1;
-        for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
-        {
-            // Compute absolute attrib slot in vertex array
-            uint32_t mapSlot =
-                backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
-            maxSlot            = std::max<int32_t>(maxSlot, mapSlot);
-            uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
-
-            pa.Assemble(inputSlot, tmpVector);
-
-            // if constant interpolation enabled for this attribute, assign the provoking
-            // vertex values to all edges
-            if (CheckBit(constantInterpMask, slot))
-            {
-                for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
-                {
-                    vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
-                }
-            }
-            else
-            {
-                for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
-                {
-                    vertices[i].attrib[inputSlot] = tmpVector[i];
-                }
-            }
-        }
-
-        // assemble user clip distances if enabled
-        // (low nibble of clipDistanceMask -> first slot, high nibble -> second slot)
-        uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
-        if (state.backendState.clipDistanceMask & 0xf)
-        {
-            pa.Assemble(vertexClipCullSlot, tmpVector);
-            for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
-            {
-                vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
-            }
-        }
-
-        if (state.backendState.clipDistanceMask & 0xf0)
-        {
-            pa.Assemble(vertexClipCullSlot + 1, tmpVector);
-            for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
-            {
-                vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
-            }
-        }
-
-        // number of consecutive attribute slots (starting at vertexAttribOffset) to clip
-        uint32_t numAttribs = maxSlot + 1;
-
-        // per-lane vertex count after clipping
-        Integer<SIMD_T> vNumClippedVerts =
-            ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
-
-        BinnerChooser<SIMD_T> binner(NumVertsPerPrimT,
-                                     pa.pDC->pState->state.rastState.conservativeRast);
-
-        // set up new PA for binning clipped primitives
-        PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
-        if (NumVertsPerPrimT == 3)
-        {
-            clipTopology = TOP_TRIANGLE_FAN;
-
-            // so that the binner knows to bloat wide points later
-            if (pa.binTopology == TOP_POINT_LIST)
-            {
-                clipTopology = TOP_POINT_LIST;
-            }
-            else if (pa.binTopology == TOP_RECT_LIST)
-            {
-                clipTopology = TOP_RECT_LIST;
-            }
-        }
-        else if (NumVertsPerPrimT == 2)
-        {
-            clipTopology = TOP_LINE_LIST;
-        }
-        else
-        {
-            SWR_ASSERT(0 && "Unexpected points in clipper.");
-        }
-
-        // scalar views of the per-lane SIMD values, indexed by input lane below
-        const uint32_t* pVertexCount = reinterpret_cast<const uint32_t*>(&vNumClippedVerts);
-        const uint32_t* pPrimitiveId = reinterpret_cast<const uint32_t*>(&vPrimId);
-        const uint32_t* pViewportIdx = reinterpret_cast<const uint32_t*>(&vViewportIdx);
-        const uint32_t* pRtIdx       = reinterpret_cast<const uint32_t*>(&vRtIdx);
-
-        // byte offsets of vertex 0..6 in the vertex store; gather lane k reads vertex k
-        const SIMD256::Integer vOffsets =
-            SIMD256::set_epi32(0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
-                               6 * sizeof(SIMDVERTEX_T<SIMD_T>),
-                               5 * sizeof(SIMDVERTEX_T<SIMD_T>),
-                               4 * sizeof(SIMDVERTEX_T<SIMD_T>),
-                               3 * sizeof(SIMDVERTEX_T<SIMD_T>),
-                               2 * sizeof(SIMDVERTEX_T<SIMD_T>),
-                               1 * sizeof(SIMDVERTEX_T<SIMD_T>),
-                               0 * sizeof(SIMDVERTEX_T<SIMD_T>));
-
-        // only need to gather 7 verts
-        // @todo dynamic mask based on actual # of verts generated per lane
-        const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
-
-        uint32_t numClippedPrims = 0;
-
-        // transpose clipper output so that each lane's vertices are in SIMD order
-        // set aside space for 2 vertices, as the PA will try to read up to 16 verts
-        // for triangle fan
-        SIMDVERTEX_T<SIMD_T>*  transposedPrims = this->transposedVerts;
-
-        uint32_t              numInputPrims = pa.NumPrims();
-        for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
-        {
-            // lanes left with fewer vertices than one primitive needs produce nothing
-            uint32_t numEmittedVerts = pVertexCount[inputPrim];
-            if (numEmittedVerts < NumVertsPerPrimT)
-            {
-                continue;
-            }
-            SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
-
-            uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
-            SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
-
-            numClippedPrims += numEmittedPrims;
-
-            // Gather this input lane's clipped vertices out of the vertex store: each
-            // gather below reads one component of vertices 0..6 for lane 'inputPrim',
-            // so that vertex k ends up in SIMD lane k of transposedPrims[0].
-
-            // transpose pos
-            float const* pBase =
-                reinterpret_cast<float const*>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) +
-                inputPrim;
-
-            for (uint32_t c = 0; c < 4; ++c)
-            {
-                SIMD256::Float temp =
-                    SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
-                transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] =
-                    SimdHelper<SIMD_T>::insert_lo_ps(temp);
-                // step to the next component register
-                pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
-            }
-
-            // transpose attribs
-            // (pBase keeps advancing one component at a time across consecutive slots)
-            pBase = reinterpret_cast<float const*>(
-                        &vertices[0].attrib[backendState.vertexAttribOffset]) +
-                    inputPrim;
-
-            for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
-            {
-                uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
-
-                for (uint32_t c = 0; c < 4; ++c)
-                {
-                    SIMD256::Float temp =
-                        SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
-                    transposedPrims[0].attrib[attribSlot][c] =
-                        SimdHelper<SIMD_T>::insert_lo_ps(temp);
-                    pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
-                }
-            }
-
-            // transpose user clip distances if enabled
-            // (this local shadows the function-scope vertexClipCullSlot; both hold the same offset)
-            uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
-            if (state.backendState.clipDistanceMask & 0x0f)
-            {
-                pBase = reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot]) +
-                        inputPrim;
-
-                for (uint32_t c = 0; c < 4; ++c)
-                {
-                    SIMD256::Float temp =
-                        SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
-                    transposedPrims[0].attrib[vertexClipCullSlot][c] =
-                        SimdHelper<SIMD_T>::insert_lo_ps(temp);
-                    pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
-                }
-            }
-
-            if (state.backendState.clipDistanceMask & 0xf0)
-            {
-                pBase =
-                    reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot + 1]) +
-                    inputPrim;
-
-                for (uint32_t c = 0; c < 4; ++c)
-                {
-                    SIMD256::Float temp =
-                        SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
-                    transposedPrims[0].attrib[vertexClipCullSlot + 1][c] =
-                        SimdHelper<SIMD_T>::insert_lo_ps(temp);
-                    pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
-                }
-            }
-
-            // temporary PA over the transposed vertices of this one input primitive
-            PA_STATE_OPT clipPA(pDC,
-                                numEmittedPrims,
-                                reinterpret_cast<uint8_t*>(&transposedPrims[0]),
-                                numEmittedVerts,
-                                SWR_VTX_NUM_SLOTS,
-                                true,
-                                NumVertsPerPrimT,
-                                clipTopology);
-            clipPA.viewportArrayActive = pa.viewportArrayActive;
-            clipPA.rtArrayActive       = pa.rtArrayActive;
-
-            // entry n has the low n bits set: one bit per emitted primitive
-            static const uint32_t primMaskMap[] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f};
-
-            const uint32_t primMask = primMaskMap[numEmittedPrims];
-
-            // all primitives clipped from this input lane share its id / viewport / RT index
-            const Integer<SIMD_T> primID      = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
-            const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
-            const Integer<SIMD_T> rtIdx       = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
-
-            while (clipPA.GetNextStreamOutput())
-            {
-                do
-                {
-                    Vec4<SIMD_T> attrib[NumVertsPerPrimT];
-
-                    bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
-
-                    if (assemble)
-                    {
-                        binner.pfnBinFunc(
-                            pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
-                    }
-
-                } while (clipPA.NextPrim());
-            }
-        }
-
-        // update global pipeline stat
-        UPDATE_STAT_FE(CPrimitives, numClippedPrims);
-    }
-
-    // Clipper stage entry point for one SIMD batch of primitives.
-    //
-    // Updates the CInvocations stat, computes clip codes, removes primitives with NaN
-    // coordinates, primitives rejected by user clip/cull distances and primitives
-    // whose frustum clip-code intersection is non-zero. Surviving primitives that
-    // need clipping are handed to ClipSimd (which bins them itself); otherwise the
-    // survivors are passed straight to the binner.
-    //
-    //   pa                          - primitive assembler state (pa.pDC must not be null)
-    //   prim                        - positions, one Vec4 per primitive vertex
-    //   primMask                    - bit per lane, set for lanes holding a primitive
-    //   primId, viewportIdx, rtIdx  - per-lane values forwarded to the clipper / binner
-    void ExecuteStage(PA_STATE&              pa,
-                      Vec4<SIMD_T>           prim[],
-                      uint32_t               primMask,
-                      Integer<SIMD_T> const& primId,
-                      Integer<SIMD_T> const& viewportIdx,
-                      Integer<SIMD_T> const& rtIdx)
-    {
-        SWR_ASSERT(pa.pDC != nullptr);
-
-        BinnerChooser<SIMD_T> binner(pa.binTopology,
-                                     pa.pDC->pState->state.rastState.conservativeRast);
-
-        // every incoming primitive counts as one clipper invocation
-        const uint32_t invocationCount = _mm_popcnt_u32(primMask);
-        UPDATE_STAT_FE(CInvocations, invocationCount);
-
-        ComputeClipCodes(prim, viewportIdx);
-
-        // drop prims with NAN coords
-        const int nanMask = ComputeNaNMask(prim);
-        primMask &= ~nanMask;
-
-        // drop prims rejected by user clip / cull distances, when any are enabled
-        const bool userDistancesEnabled =
-            (state.backendState.cullDistanceMask | state.backendState.clipDistanceMask) != 0;
-        if (userDistancesEnabled)
-        {
-            const int userRejectMask = ComputeUserClipCullMask(pa, prim);
-            primMask &= ~userRejectMask;
-        }
-
-        // keep only the frustum bits of the clip-code intersection
-        Float<SIMD_T>       vCodeIntersection = ComputeClipCodeIntersection();
-        const Float<SIMD_T> vFrustumBits = SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK));
-        vCodeIntersection                = SIMD_T::and_ps(vCodeIntersection, vFrustumBits);
-
-        // cull prims outside view frustum
-        const int insideFrustumMask =
-            SimdHelper<SIMD_T>::cmpeq_ps_mask(vCodeIntersection, SIMD_T::setzero_ps());
-        int validMask = primMask & insideFrustumMask;
-
-        // points are never clipped
-        uint32_t clipMask = 0;
-        if (NumVertsPerPrimT != 1)
-        {
-            clipMask = validMask & ComputeClipMask();
-        }
-
-        AR_EVENT(ClipInfoEvent(invocationCount, validMask, clipMask));
-
-        if (clipMask)
-        {
-            RDTSC_BEGIN(pa.pDC->pContext->pBucketMgr, FEGuardbandClip, pa.pDC->drawId);
-            // at least one prim needs clipping: run the clipper, which also
-            // calls the binner
-            const Float<SIMD_T> vValidLanes = SIMD_T::vmask_ps(validMask);
-            const Float<SIMD_T> vClipLanes  = SIMD_T::vmask_ps(clipMask);
-            ClipSimd(prim, vValidLanes, vClipLanes, pa, primId, viewportIdx, rtIdx);
-            RDTSC_END(pa.pDC->pContext->pBucketMgr, FEGuardbandClip, 1);
-            return;
-        }
-
-        if (!validMask)
-        {
-            return;
-        }
-
-        // update CPrimitives pipeline stat
-        UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
-
-        // nothing to clip: forward valid prims directly to binner
-        binner.pfnBinFunc(
-            this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
-    }
-
-private:
-    // Per lane, returns boundaryCoord0 / (boundaryCoord0 - boundaryCoord1).
-    // No guard is applied for lanes where both inputs are equal.
-    Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const& boundaryCoord0,
-                                      Float<SIMD_T> const& boundaryCoord1)
-    {
-        const Float<SIMD_T> vDelta = SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1);
-        return SIMD_T::div_ps(boundaryCoord0, vDelta);
-    }
-
-    // Builds, for every SIMD lane, the byte offset of one float inside an array of
-    // SIMDVERTEX_T<SIMD_T>:
-    //     vIndices[lane] * sizeof(SIMDVERTEX_T) + attrib * sizeof(Vec4)
-    //         + component * sizeof(Float) + lane * sizeof(float)
-    //
-    //   attrib    - attribute slot within the vertex
-    //   vIndices  - per-lane vertex index
-    //   component - component (0-3) within the attribute
-    Integer<SIMD_T>
-    ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const& vIndices, uint32_t component)
-    {
-        // byte offset of each lane inside a single component register
-        static const OSALIGNSIMD16(uint32_t) laneByteOffset[16] = {
-            0 * sizeof(float),  1 * sizeof(float),  2 * sizeof(float),  3 * sizeof(float),
-            4 * sizeof(float),  5 * sizeof(float),  6 * sizeof(float),  7 * sizeof(float),
-            8 * sizeof(float),  9 * sizeof(float),  10 * sizeof(float), 11 * sizeof(float),
-            12 * sizeof(float), 13 * sizeof(float), 14 * sizeof(float), 15 * sizeof(float),
-        };
-
-        static_assert(sizeof(Integer<SIMD_T>) <= sizeof(laneByteOffset),
-                      "Clipper::ComputeOffsets, Increase number of element offsets.");
-
-        const uint32_t vertexBytes    = sizeof(SIMDVERTEX_T<SIMD_T>);
-        const uint32_t attribBytes    = sizeof(Vec4<SIMD_T>);
-        const uint32_t componentBytes = sizeof(Float<SIMD_T>);
-
-        const Integer<SIMD_T> vLaneBytes =
-            SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T>*>(laneByteOffset));
-
-        // start of the indexed simdvertex
-        const Integer<SIMD_T> vVertexBytes =
-            SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(vertexBytes));
-
-        // attribute + component position inside the vertex (same for every lane)
-        const Integer<SIMD_T> vSlotBytes =
-            SIMD_T::set1_epi32(attribBytes * attrib + componentBytes * component);
-
-        // vertex + attribute/component + lane
-        return SIMD_T::add_epi32(SIMD_T::add_epi32(vVertexBytes, vSlotBytes), vLaneBytes);
-    }
-
-    // Masked gather of one attribute component for the vertices selected by vIndices.
-    // Offsets come from ComputeOffsets; zero is passed as the source operand of the
-    // masked gather.
-    Float<SIMD_T> GatherComponent(const float*           pBuffer,
-                                  uint32_t               attrib,
-                                  Float<SIMD_T> const&   vMask,
-                                  Integer<SIMD_T> const& vIndices,
-                                  uint32_t               component)
-    {
-        const Integer<SIMD_T> vByteOffsets = ComputeOffsets(attrib, vIndices, component);
-
-        return SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(), pBuffer, vByteOffsets, vMask);
-    }
-
-    // Masked scatter of one attribute component: for every lane whose bit is set in
-    // movemask_ps(vMask), writes that lane of vSrc to pBuffer at the byte offset
-    // ComputeOffsets yields for (attrib, vIndices[lane], component).
-    // Note that pBuffer is declared const float* but is the destination of the write.
-    void ScatterComponent(const float*           pBuffer,
-                          uint32_t               attrib,
-                          Float<SIMD_T> const&   vMask,
-                          Integer<SIMD_T> const& vIndices,
-                          uint32_t               component,
-                          Float<SIMD_T> const&   vSrc)
-    {
-        const Integer<SIMD_T> vByteOffsets = ComputeOffsets(attrib, vIndices, component);
-
-        // view the SIMD values as plain per-lane arrays
-        const uint32_t* pLaneOffset = reinterpret_cast<const uint32_t*>(&vByteOffsets);
-        const float*    pLaneValue  = reinterpret_cast<const float*>(&vSrc);
-        const uint8_t*  pBytes      = reinterpret_cast<const uint8_t*>(pBuffer);
-
-        // visit active lanes from lowest to highest, clearing each bit once handled
-        unsigned long lane;
-        for (uint32_t pending = SIMD_T::movemask_ps(vMask); _BitScanForward(&lane, pending);
-             pending &= ~(1 << lane))
-        {
-            float* pDst = (float*)(pBytes + pLaneOffset[lane]);
-            *pDst       = pLaneValue[lane];
-        }
-    }
-
-    // Computes the vertex where the edge (s -> p) crosses ClippingPlane and writes it,
-    // for the lanes in vActiveMask, to vertex 'outIndex' of pOutVerts.
-    //
-    // The interpolation factor is t = d1 / (d1 - d2), where d1/d2 are the plane
-    // expressions of v1/v2 (e.g. w + x for FRUSTUM_LEFT, w - x for FRUSTUM_RIGHT).
-    // Position, the numInAttribs attributes starting at vertexAttribOffset, and any
-    // enabled clip-distance slots are each produced as fmadd(b - a, t, a).
-    //
-    // outIndex is only read here; the caller advances it after the call.
-    // For an unrecognized plane SWR_INVALID is raised and t is left at zero.
-    template <SWR_CLIPCODES ClippingPlane>
-    void intersect(const Float<SIMD_T>&   vActiveMask,  // lanes that need an intersection vertex
-                   const Integer<SIMD_T>& s,            // per-lane index of the edge's first vertex in pInVerts
-                   const Integer<SIMD_T>& p,            // per-lane index of the edge's second vertex in pInVerts
-                   const Vec4<SIMD_T>&    v1,           // position of the edge's first vertex
-                   const Vec4<SIMD_T>&    v2,           // position of the edge's second vertex
-                   Integer<SIMD_T>&       outIndex,     // per-lane index of the vertex to write in pOutVerts
-                   const float*           pInVerts,     // input vertex store (attributes are gathered from it)
-                   uint32_t               numInAttribs, // number of attributes per vertex.
-                   float* pOutVerts)                    // output vertex store the new vertex is scattered into
-    {
-        uint32_t vertexAttribOffset   = this->state.backendState.vertexAttribOffset;
-        uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
-
-        // compute interpolation factor
-        // (zero-initialized so the invalid-plane path never reads an indeterminate value)
-        Float<SIMD_T> t = SIMD_T::setzero_ps();
-        switch (ClippingPlane)
-        {
-        case FRUSTUM_LEFT:
-            t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0]));
-            break;
-        case FRUSTUM_RIGHT:
-            t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0]));
-            break;
-        case FRUSTUM_TOP:
-            t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1]));
-            break;
-        case FRUSTUM_BOTTOM:
-            t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1]));
-            break;
-        case FRUSTUM_NEAR:
-            // DX Znear plane is 0, GL is -w
-            if (this->state.rastState.clipHalfZ)
-            {
-                t = ComputeInterpFactor(v1[2], v2[2]);
-            }
-            else
-            {
-                t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
-            }
-            break;
-        case FRUSTUM_FAR:
-            t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2]));
-            break;
-        default:
-            SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
-        }
-
-        // interpolate position and store
-        for (uint32_t c = 0; c < 4; ++c)
-        {
-            Float<SIMD_T> vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
-            ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
-        }
-
-        // interpolates all four components of one vertex slot between the two edge
-        // vertices (gathered from pInVerts) and scatters the result to pOutVerts
-        auto interpolateSlot = [&](uint32_t attribSlot) {
-            for (uint32_t c = 0; c < 4; ++c)
-            {
-                Float<SIMD_T> vAttrib0 =
-                    this->GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
-                Float<SIMD_T> vAttrib1 =
-                    this->GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
-                Float<SIMD_T> vOutAttrib =
-                    SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
-                this->ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
-            }
-        };
-
-        // interpolate attributes and store
-        for (uint32_t a = 0; a < numInAttribs; ++a)
-        {
-            interpolateSlot(vertexAttribOffset + a);
-        }
-
-        // interpolate clip distance if enabled
-        // (low nibble of clipDistanceMask -> first slot, high nibble -> second slot)
-        if (this->state.backendState.clipDistanceMask & 0xf)
-        {
-            interpolateSlot(vertexClipCullOffset);
-        }
-
-        if (this->state.backendState.clipDistanceMask & 0xf0)
-        {
-            interpolateSlot(vertexClipCullOffset + 1);
-        }
-    }
-
-    // Per-lane test of whether position v lies on the inner side of ClippingPlane.
-    //   LEFT / TOP    : x >= -w / y >= -w
-    //   RIGHT / BOTTOM: x <=  w / y <=  w
-    //   NEAR          : z >= 0 when rastState.clipHalfZ is set, otherwise z >= -w
-    //   FAR           : z <=  w
-    // An unrecognized plane raises SWR_INVALID and yields an all-zero result.
-    template <SWR_CLIPCODES ClippingPlane>
-    Float<SIMD_T> inside(const Vec4<SIMD_T>& v)
-    {
-        const Float<SIMD_T>& x = v[0];
-        const Float<SIMD_T>& y = v[1];
-        const Float<SIMD_T>& z = v[2];
-        const Float<SIMD_T>& w = v[3];
-
-        switch (ClippingPlane)
-        {
-        case FRUSTUM_LEFT:
-            return SIMD_T::cmpge_ps(x, SIMD_T::mul_ps(w, SIMD_T::set1_ps(-1.0f)));
-        case FRUSTUM_RIGHT:
-            return SIMD_T::cmple_ps(x, w);
-        case FRUSTUM_TOP:
-            return SIMD_T::cmpge_ps(y, SIMD_T::mul_ps(w, SIMD_T::set1_ps(-1.0f)));
-        case FRUSTUM_BOTTOM:
-            return SIMD_T::cmple_ps(y, w);
-        case FRUSTUM_NEAR:
-            if (this->state.rastState.clipHalfZ)
-            {
-                return SIMD_T::cmpge_ps(z, SIMD_T::setzero_ps());
-            }
-            return SIMD_T::cmpge_ps(z, SIMD_T::mul_ps(w, SIMD_T::set1_ps(-1.0f)));
-        case FRUSTUM_FAR:
-            return SIMD_T::cmple_ps(z, w);
-        default:
-            SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
-            return SIMD_T::setzero_ps();
-        }
-    }
-
-    // Clips each lane's polygon against ClippingPlane.
-    //
-    // For every lane the input vertices 0 .. vNumInPts-1 are walked as edges
-    // (s, p), with p being the next vertex and wrapping to 0 after the last one.
-    // Per edge: s is copied to the output if it is inside the plane, and an
-    // intersection vertex is appended if s and p lie on different sides.
-    //
-    //   pInVerts     - input vertex store
-    //   vNumInPts    - per-lane number of input vertices
-    //   numInAttribs - number of attribute slots (starting at vertexAttribOffset) to carry over
-    //   pOutVerts    - output vertex store
-    // Returns the per-lane number of vertices written to pOutVerts.
-    template <SWR_CLIPCODES ClippingPlane>
-    Integer<SIMD_T> ClipTriToPlane(const float*           pInVerts,
-                                   const Integer<SIMD_T>& vNumInPts,
-                                   uint32_t               numInAttribs,
-                                   float*                 pOutVerts)
-    {
-        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
-
-        Integer<SIMD_T> vCurIndex   = SIMD_T::setzero_si();
-        Integer<SIMD_T> vOutIndex   = SIMD_T::setzero_si();
-        // a lane is active while it still has input vertices left to visit
-        Float<SIMD_T>   vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
-
-        while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
-        {
-            Integer<SIMD_T> s             = vCurIndex;
-            Integer<SIMD_T> p             = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
-            // set where s + 1 is still a valid index; elsewhere p wraps around to vertex 0
-            Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
-            p                             = SIMD_T::castps_si(SIMD_T::blendv_ps(
-                SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
-
-            // gather position
-            Vec4<SIMD_T> vInPos0, vInPos1;
-            for (uint32_t c = 0; c < 4; ++c)
-            {
-                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
-                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
-            }
-
-            // compute inside mask
-            Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
-            Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
-
-            // compute intersection mask (s_in != p_in)
-            Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
-            intersectMask               = SIMD_T::and_ps(intersectMask, vActiveMask);
-
-            // store s if inside
-            s_in = SIMD_T::and_ps(s_in, vActiveMask);
-            if (!SIMD_T::testz_ps(s_in, s_in))
-            {
-                // store position
-                for (uint32_t c = 0; c < 4; ++c)
-                {
-                    ScatterComponent(
-                        pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
-                }
-
-                // store attribs
-                for (uint32_t a = 0; a < numInAttribs; ++a)
-                {
-                    uint32_t attribSlot = vertexAttribOffset + a;
-                    for (uint32_t c = 0; c < 4; ++c)
-                    {
-                        Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
-                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
-                    }
-                }
-
-                // store clip distance if enabled
-                // (low nibble of clipDistanceMask -> first slot, high nibble -> second slot)
-                uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
-                if (this->state.backendState.clipDistanceMask & 0xf)
-                {
-                    uint32_t attribSlot = vertexClipCullSlot;
-                    for (uint32_t c = 0; c < 4; ++c)
-                    {
-                        Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
-                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
-                    }
-                }
-
-                if (this->state.backendState.clipDistanceMask & 0xf0)
-                {
-                    uint32_t attribSlot = vertexClipCullSlot + 1;
-                    for (uint32_t c = 0; c < 4; ++c)
-                    {
-                        Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
-                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
-                    }
-                }
-
-                // increment outIndex
-                // (only for lanes that just stored s)
-                vOutIndex = SIMD_T::blendv_epi32(
-                    vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
-            }
-
-            // compute and store intersection
-            // (written at the already advanced vOutIndex, i.e. after s when s was stored)
-            if (!SIMD_T::testz_ps(intersectMask, intersectMask))
-            {
-                intersect<ClippingPlane>(intersectMask,
-                                         s,
-                                         p,
-                                         vInPos0,
-                                         vInPos1,
-                                         vOutIndex,
-                                         pInVerts,
-                                         numInAttribs,
-                                         pOutVerts);
-
-                // increment outIndex for active lanes
-                vOutIndex = SIMD_T::blendv_epi32(
-                    vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
-            }
-
-            // increment loop index and update active mask
-            vCurIndex   = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
-            vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
-        }
-
-        return vOutIndex;
-    }
-
-    template <SWR_CLIPCODES ClippingPlane>
-    Integer<SIMD_T> ClipLineToPlane(const float*           pInVerts,
-                                    const Integer<SIMD_T>& vNumInPts,
-                                    uint32_t               numInAttribs,
-                                    float*                 pOutVerts)
-    {
-        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
-
-        Integer<SIMD_T> vCurIndex   = SIMD_T::setzero_si();
-        Integer<SIMD_T> vOutIndex   = SIMD_T::setzero_si();
-        Float<SIMD_T>   vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
-
-        if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
-        {
-            Integer<SIMD_T> s = vCurIndex;
-            Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
-
-            // gather position
-            Vec4<SIMD_T> vInPos0, vInPos1;
-            for (uint32_t c = 0; c < 4; ++c)
-            {
-                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
-                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
-            }
-
-            // compute inside mask
-            Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
-            Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
-
-            // compute intersection mask (s_in != p_in)
-            Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
-            intersectMask               = SIMD_T::and_ps(intersectMask, vActiveMask);
-
-            // store s if inside
-            s_in = SIMD_T::and_ps(s_in, vActiveMask);
-            if (!SIMD_T::testz_ps(s_in, s_in))
-            {
-                for (uint32_t c = 0; c < 4; ++c)
-                {
-                    ScatterComponent(
-                        pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
-                }
-
-                // interpolate attributes and store
-                for (uint32_t a = 0; a < numInAttribs; ++a)
-                {
-                    uint32_t attribSlot = vertexAttribOffset + a;
-                    for (uint32_t c = 0; c < 4; ++c)
-                    {
-                        Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
-                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
-                    }
-                }
-
-                // increment outIndex
-                vOutIndex = SIMD_T::blendv_epi32(
-                    vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
-            }
-
-            // compute and store intersection
-            if (!SIMD_T::testz_ps(intersectMask, intersectMask))
-            {
-                intersect<ClippingPlane>(intersectMask,
-                                         s,
-                                         p,
-                                         vInPos0,
-                                         vInPos1,
-                                         vOutIndex,
-                                         pInVerts,
-                                         numInAttribs,
-                                         pOutVerts);
-
-                // increment outIndex for active lanes
-                vOutIndex = SIMD_T::blendv_epi32(
-                    vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
-            }
-
-            // store p if inside
-            p_in = SIMD_T::and_ps(p_in, vActiveMask);
-            if (!SIMD_T::testz_ps(p_in, p_in))
-            {
-                for (uint32_t c = 0; c < 4; ++c)
-                {
-                    ScatterComponent(
-                        pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
-                }
-
-                // interpolate attributes and store
-                for (uint32_t a = 0; a < numInAttribs; ++a)
-                {
-                    uint32_t attribSlot = vertexAttribOffset + a;
-                    for (uint32_t c = 0; c < 4; ++c)
-                    {
-                        Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
-                        ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
-                    }
-                }
-
-                // increment outIndex
-                vOutIndex = SIMD_T::blendv_epi32(
-                    vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
-            }
-        }
-
-        return vOutIndex;
-    }
-
-    Integer<SIMD_T> ClipPrims(float*               pVertices,
-                              const Float<SIMD_T>& vPrimMask,
-                              const Float<SIMD_T>& vClipMask,
-                              int                  numAttribs)
-    {
-        // temp storage
-        float* pTempVerts = reinterpret_cast<float*>(this->tmpVerts);
-
-        // zero out num input verts for non-active lanes
-        Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrimT);
-        vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
-
-        // clip prims to frustum
-        Integer<SIMD_T> vNumOutPts;
-        if (NumVertsPerPrimT == 3)
-        {
-            vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
-            vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
-            vNumOutPts =
-                ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
-            vNumOutPts =
-                ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
-            vNumOutPts =
-                ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
-            vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
-        }
-        else
-        {
-            SWR_ASSERT(NumVertsPerPrimT == 2);
-            vNumOutPts =
-                ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
-            vNumOutPts =
-                ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
-            vNumOutPts =
-                ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
-            vNumOutPts =
-                ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
-            vNumOutPts =
-                ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
-            vNumOutPts =
-                ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
-        }
-
-        // restore num verts for non-clipped, active lanes
-        Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
-        vNumOutPts =
-            SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrimT), vNonClippedMask);
-
-        return vNumOutPts;
-    }
-
-    const uint32_t   workerId{0};
-    DRAW_CONTEXT*    pDC{nullptr};
-    const API_STATE& state;
-    Float<SIMD_T>    clipCodes[NumVertsPerPrimT];
-    SIMDVERTEX_T<SIMD_T>* clippedVerts;
-    SIMDVERTEX_T<SIMD_T>* tmpVerts;
-    SIMDVERTEX_T<SIMD_T>* transposedVerts;
-};
-
-// pipeline stage functions
-void ClipRectangles(DRAW_CONTEXT*      pDC,
-                    PA_STATE&          pa,
-                    uint32_t           workerId,
-                    simdvector         prims[],
-                    uint32_t           primMask,
-                    simdscalari const& primId,
-                    simdscalari const& viewportIdx,
-                    simdscalari const& rtIdx);
-void ClipTriangles(DRAW_CONTEXT*      pDC,
-                   PA_STATE&          pa,
-                   uint32_t           workerId,
-                   simdvector         prims[],
-                   uint32_t           primMask,
-                   simdscalari const& primId,
-                   simdscalari const& viewportIdx,
-                   simdscalari const& rtIdx);
-void ClipLines(DRAW_CONTEXT*      pDC,
-               PA_STATE&          pa,
-               uint32_t           workerId,
-               simdvector         prims[],
-               uint32_t           primMask,
-               simdscalari const& primId,
-               simdscalari const& viewportIdx,
-               simdscalari const& rtIdx);
-void ClipPoints(DRAW_CONTEXT*      pDC,
-                PA_STATE&          pa,
-                uint32_t           workerId,
-                simdvector         prims[],
-                uint32_t           primMask,
-                simdscalari const& primId,
-                simdscalari const& viewportIdx,
-                simdscalari const& rtIdx);
-#if USE_SIMD16_FRONTEND
-void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT*        pDC,
-                                    PA_STATE&            pa,
-                                    uint32_t             workerId,
-                                    simd16vector         prims[],
-                                    uint32_t             primMask,
-                                    simd16scalari const& primId,
-                                    simd16scalari const& viewportIdx,
-                                    simd16scalari const& rtIdx);
-void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT*        pDC,
-                                   PA_STATE&            pa,
-                                   uint32_t             workerId,
-                                   simd16vector         prims[],
-                                   uint32_t             primMask,
-                                   simd16scalari const& primId,
-                                   simd16scalari const& viewportIdx,
-                                   simd16scalari const& rtIdx);
-void SIMDCALL ClipLines_simd16(DRAW_CONTEXT*        pDC,
-                               PA_STATE&            pa,
-                               uint32_t             workerId,
-                               simd16vector         prims[],
-                               uint32_t             primMask,
-                               simd16scalari const& primId,
-                               simd16scalari const& viewportIdx,
-                               simd16scalari const& rtIdx);
-void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT*        pDC,
-                                PA_STATE&            pa,
-                                uint32_t             workerId,
-                                simd16vector         prims[],
-                                uint32_t             primMask,
-                                simd16scalari const& primId,
-                                simd16scalari const& viewportIdx,
-                                simd16scalari const& rtIdx);
-#endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h b/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h
deleted file mode 100644
index 9e7f96cdeac..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h
+++ /dev/null
@@ -1,229 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file conservativerast.h
- *
- ******************************************************************************/
-#pragma once
-#include <type_traits>
-#include "common/simdintrin.h"
-
-enum FixedPointFmt
-{
-    FP_UNINIT,
-    _16_8,
-    _16_9,
-    _X_16,
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief convenience typedefs for supported Fixed Point precisions
-typedef std::integral_constant<uint32_t, FP_UNINIT> Fixed_Uninit;
-typedef std::integral_constant<uint32_t, _16_8>     Fixed_16_8;
-typedef std::integral_constant<uint32_t, _16_9>     Fixed_16_9;
-typedef std::integral_constant<uint32_t, _X_16>     Fixed_X_16;
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct FixedPointTraits
-/// @brief holds constants relating to converting between FP and Fixed point
-/// @tparam FT: fixed precision type
-template <typename FT>
-struct FixedPointTraits
-{
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Fixed_16_8 specialization of FixedPointTraits
-template <>
-struct FixedPointTraits<Fixed_16_8>
-{
-    /// multiplier to go from FP32 to Fixed Point 16.8
-    typedef std::integral_constant<uint32_t, 256> ScaleT;
-    /// number of bits to shift to go from 16.8 fixed => int32
-    typedef std::integral_constant<uint32_t, 8> BitsT;
-    typedef Fixed_16_8                          TypeT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Fixed_16_9 specialization of FixedPointTraits
-template <>
-struct FixedPointTraits<Fixed_16_9>
-{
-    /// multiplier to go from FP32 to Fixed Point 16.9
-    typedef std::integral_constant<uint32_t, 512> ScaleT;
-    /// number of bits to shift to go from 16.9 fixed => int32
-    typedef std::integral_constant<uint32_t, 9> BitsT;
-    typedef Fixed_16_9                          TypeT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Fixed_16_9 specialization of FixedPointTraits
-template <>
-struct FixedPointTraits<Fixed_X_16>
-{
-    /// multiplier to go from FP32 to Fixed Point X.16
-    typedef std::integral_constant<uint32_t, 65536> ScaleT;
-    /// number of bits to shift to go from X.16 fixed => int32
-    typedef std::integral_constant<uint32_t, 16> BitsT;
-    typedef Fixed_X_16                           TypeT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief convenience typedefs for conservative rasterization modes
-typedef std::false_type StandardRastT;
-typedef std::true_type  ConservativeRastT;
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief convenience typedefs for Input Coverage rasterization modes
-typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NONE>   NoInputCoverageT;
-typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NORMAL> OuterConservativeCoverageT;
-typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
-    InnerConservativeCoverageT;
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct ConservativeRastTraits
-/// @brief primary ConservativeRastTraits template. Shouldn't be instantiated
-/// @tparam ConservativeT: type of conservative rasterization
-template <typename ConservativeT>
-struct ConservativeRastFETraits
-{
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief StandardRast specialization of ConservativeRastTraits
-template <>
-struct ConservativeRastFETraits<StandardRastT>
-{
-    typedef std::false_type                     IsConservativeT;
-    typedef std::integral_constant<uint32_t, 0> BoundingBoxOffsetT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief ConservativeRastT specialization of ConservativeRastTraits
-template <>
-struct ConservativeRastFETraits<ConservativeRastT>
-{
-    typedef std::true_type                      IsConservativeT;
-    typedef std::integral_constant<uint32_t, 1> BoundingBoxOffsetT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief convenience typedefs for ConservativeRastFETraits
-typedef ConservativeRastFETraits<StandardRastT>     FEStandardRastT;
-typedef ConservativeRastFETraits<ConservativeRastT> FEConservativeRastT;
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct ConservativeRastBETraits
-/// @brief primary ConservativeRastBETraits template. Shouldn't be instantiated;
-/// default to standard rasterization behavior
-/// @tparam ConservativeT: type of conservative rasterization
-/// @tparam InputCoverageT: type of input coverage requested, if any
-template <typename ConservativeT, typename _InputCoverageT>
-struct ConservativeRastBETraits
-{
-    typedef std::false_type                    IsConservativeT;
-    typedef _InputCoverageT                    InputCoverageT;
-    typedef FixedPointTraits<Fixed_16_8>       ConservativePrecisionT;
-    typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT;
-    typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief StandardRastT specialization of ConservativeRastBETraits
-template <typename _InputCoverageT>
-struct ConservativeRastBETraits<StandardRastT, _InputCoverageT>
-{
-    typedef std::false_type                    IsConservativeT;
-    typedef _InputCoverageT                    InputCoverageT;
-    typedef FixedPointTraits<Fixed_16_8>       ConservativePrecisionT;
-    typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT;
-    typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief ConservativeRastT specialization of ConservativeRastBETraits
-/// with no input coverage
-template <>
-struct ConservativeRastBETraits<ConservativeRastT, NoInputCoverageT>
-{
-    typedef std::true_type   IsConservativeT;
-    typedef NoInputCoverageT InputCoverageT;
-
-    typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
-
-    /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
-    /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
-    /// of of having to compare individual edges to pixel corners to check if any part of the
-    /// triangle intersects a pixel
-    typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
-                                               ConservativeEdgeOffsetT;
-    typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief ConservativeRastT specialization of ConservativeRastBETraits
-/// with OuterConservativeCoverage
-template <>
-struct ConservativeRastBETraits<ConservativeRastT, OuterConservativeCoverageT>
-{
-    typedef std::true_type             IsConservativeT;
-    typedef OuterConservativeCoverageT InputCoverageT;
-
-    typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
-
-    /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
-    /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
-    /// of of having to compare individual edges to pixel corners to check if any part of the
-    /// triangle intersects a pixel
-    typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
-                                               ConservativeEdgeOffsetT;
-    typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief ConservativeRastT specialization of ConservativeRastBETraits
-/// with InnerConservativeCoverage
-template <>
-struct ConservativeRastBETraits<ConservativeRastT, InnerConservativeCoverageT>
-{
-    typedef std::true_type             IsConservativeT;
-    typedef InnerConservativeCoverageT InputCoverageT;
-
-    typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
-
-    /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
-    /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
-    /// of of having to compare individual edges to pixel corners to check if any part of the
-    /// triangle intersects a pixel
-    typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
-        ConservativeEdgeOffsetT;
-
-    /// undo the outer conservative offset and offset edge towards from pixel center by 1/2 pixel +
-    /// 1/512, in Fixed 16.9 precision this allows the rasterizer to do the 3 edge coverage tests
-    /// against a single point, instead of of having to compare individual edges to pixel corners to
-    /// check if a pixel is fully covered by a triangle
-    typedef std::integral_constant<int32_t,
-                                   static_cast<int32_t>(
-                                       -((ConservativePrecisionT::ScaleT::value / 2) + 1) -
-                                       ConservativeEdgeOffsetT::value)>
-        InnerConservativeEdgeOffsetT;
-};
-\ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
deleted file mode 100644
index b874520b9d8..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ /dev/null
@@ -1,608 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file context.h
- *
- * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
- *        The SWR_CONTEXT is our global context and contains the DC ring,
- *        thread state, etc.
- *
- *        The DRAW_CONTEXT contains all state associated with a draw operation.
- *
- ******************************************************************************/
-#pragma once
-
-#include <condition_variable>
-#include <algorithm>
-
-#include "core/api.h"
-#include "core/utils.h"
-#include "core/arena.h"
-#include "core/fifo.hpp"
-#include "core/knobs.h"
-#include "common/intrin.h"
-#include "common/rdtsc_buckets.h"
-#include "core/threads.h"
-#include "ringbuffer.h"
-#include "archrast/archrast.h"
-
-// x.8 fixed point precision values
-#define FIXED_POINT_SHIFT 8
-#define FIXED_POINT_SCALE 256
-
-// x.16 fixed point precision values
-#define FIXED_POINT16_SHIFT 16
-#define FIXED_POINT16_SCALE 65536
-
-struct SWR_CONTEXT;
-struct DRAW_CONTEXT;
-
-struct TRI_FLAGS
-{
-    uint32_t frontFacing : 1;
-    uint32_t yMajor : 1;
-    uint32_t coverageMask : (SIMD_TILE_X_DIM* SIMD_TILE_Y_DIM);
-    uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
-    float    pointSize;
-    uint32_t renderTargetArrayIndex;
-    uint32_t viewportIndex;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_TRIANGLE_DESC
-/////////////////////////////////////////////////////////////////////////
-struct SWR_TRIANGLE_DESC
-{
-    float I[3];
-    float J[3];
-    float Z[3];
-    float OneOverW[3];
-    float recipDet;
-
-    float* pRecipW;
-    float* pAttribs;
-    float* pPerspAttribs;
-    float* pSamplePos;
-    float* pUserClipBuffer;
-
-    uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
-    uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if
-                                // entire pixel is covered
-    uint64_t anyCoveredSamples;
-
-    TRI_FLAGS triFlags;
-};
-
-struct TRIANGLE_WORK_DESC
-{
-    float* pTriBuffer;
-    float* pAttribs;
-    float* pUserClipBuffer;
-    uint32_t  numAttribs;
-    TRI_FLAGS triFlags;
-};
-
-struct CLEAR_DESC
-{
-    SWR_RECT rect;
-    uint32_t attachmentMask;
-    uint32_t renderTargetArrayIndex;
-    float    clearRTColor[4]; // RGBA_32F
-    float    clearDepth;      // [0..1]
-    uint8_t  clearStencil;
-};
-
-struct DISCARD_INVALIDATE_TILES_DESC
-{
-    uint32_t       attachmentMask;
-    SWR_RECT       rect;
-    SWR_TILE_STATE newTileState;
-    bool           createNewTiles;
-    bool           fullTilesOnly;
-};
-
-struct SYNC_DESC
-{
-    PFN_CALLBACK_FUNC pfnCallbackFunc;
-    uint64_t          userData;
-    uint64_t          userData2;
-    uint64_t          userData3;
-};
-
-struct STORE_TILES_DESC
-{
-    uint32_t       attachmentMask;
-    SWR_TILE_STATE postStoreTileState;
-    SWR_RECT       rect;
-};
-
-struct COMPUTE_DESC
-{
-    uint32_t threadGroupCountX;
-    uint32_t threadGroupCountY;
-    uint32_t threadGroupCountZ;
-    bool     enableThreadDispatch;
-};
-
-typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC,
-                              uint32_t      workerId,
-                              uint32_t      macroTile,
-                              void*         pDesc);
-
-enum WORK_TYPE
-{
-    SYNC,
-    DRAW,
-    CLEAR,
-    DISCARDINVALIDATETILES,
-    STORETILES,
-    SHUTDOWN,
-};
-
-OSALIGNSIMD(struct) BE_WORK
-{
-    WORK_TYPE     type;
-    PFN_WORK_FUNC pfnWork;
-    union
-    {
-        SYNC_DESC                     sync;
-        TRIANGLE_WORK_DESC            tri;
-        CLEAR_DESC                    clear;
-        DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
-        STORE_TILES_DESC              storeTiles;
-    } desc;
-};
-
-struct DRAW_WORK
-{
-    DRAW_CONTEXT* pDC;
-    union
-    {
-        uint32_t numIndices; // DrawIndexed: Number of indices for draw.
-        uint32_t numVerts;   // Draw: Number of verts (triangles, lines, etc)
-    };
-    union
-    {
-        gfxptr_t xpIB;        // DrawIndexed: App supplied int32 indices
-        uint32_t startVertex; // Draw: Starting vertex in VB to render from.
-    };
-    int32_t  baseVertex;
-    uint32_t numInstances;  // Number of instances
-    uint32_t startInstance; // Instance offset
-    uint32_t startPrimID;   // starting primitiveID for this draw batch
-    uint32_t
-               startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws)
-    SWR_FORMAT type;          // index buffer type
-};
-
-typedef void (*PFN_FE_WORK_FUNC)(SWR_CONTEXT*  pContext,
-                                 DRAW_CONTEXT* pDC,
-                                 uint32_t      workerId,
-                                 void*         pDesc);
-struct FE_WORK
-{
-    WORK_TYPE        type;
-    PFN_FE_WORK_FUNC pfnWork;
-    union
-    {
-        SYNC_DESC                     sync;
-        DRAW_WORK                     draw;
-        CLEAR_DESC                    clear;
-        DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
-        STORE_TILES_DESC              storeTiles;
-    } desc;
-};
-
-struct GUARDBANDS
-{
-    float left[KNOB_NUM_VIEWPORTS_SCISSORS];
-    float right[KNOB_NUM_VIEWPORTS_SCISSORS];
-    float top[KNOB_NUM_VIEWPORTS_SCISSORS];
-    float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
-};
-
-struct PA_STATE;
-
-// function signature for pipeline stages that execute after primitive assembly
-typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT*      pDC,
-                                  PA_STATE&          pa,
-                                  uint32_t           workerId,
-                                  simdvector         prims[],
-                                  uint32_t           primMask,
-                                  simdscalari const& primID,
-                                  simdscalari const& viewportIdx,
-                                  simdscalari const& rtIdx);
-
-// function signature for pipeline stages that execute after primitive assembly
-typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT*        pDC,
-                                                 PA_STATE&            pa,
-                                                 uint32_t             workerId,
-                                                 simd16vector         prims[],
-                                                 uint32_t             primMask,
-                                                 simd16scalari const& primID,
-                                                 simd16scalari const& viewportIdx,
-                                                 simd16scalari const& rtIdx);
-
-OSALIGNLINE(struct) API_STATE
-{
-    // Vertex Buffers
-    SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
-
-    // GS - Geometry Shader State
-    SWR_GS_STATE gsState;
-    PFN_GS_FUNC  pfnGsFunc;
-
-    // FS - Fetch Shader State
-    PFN_FETCH_FUNC pfnFetchFunc;
-
-    // VS - Vertex Shader State
-    PFN_VERTEX_FUNC pfnVertexFunc;
-
-    // Index Buffer
-    SWR_INDEX_BUFFER_STATE indexBuffer;
-
-    // CS - Compute Shader
-    PFN_CS_FUNC pfnCsFunc;
-    uint32_t    totalThreadsInGroup;
-    uint32_t    totalSpillFillSize;
-    uint32_t    scratchSpaceSizePerWarp;
-    uint32_t    scratchSpaceNumWarps;
-
-    // FE - Frontend State
-    SWR_FRONTEND_STATE frontendState;
-
-    // SOS - Streamout Shader State
-    PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];
-
-    // Streamout state
-    SWR_STREAMOUT_STATE          soState;
-    mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
-    mutable SWR_STREAMOUT_BUFFER soPausedBuffer[MAX_SO_STREAMS];
-
-    // Tessellation State
-    PFN_HS_FUNC  pfnHsFunc;
-    PFN_DS_FUNC  pfnDsFunc;
-    SWR_TS_STATE tsState;
-
-    // Number of attributes used by the frontend (vs, so, gs)
-    uint32_t feNumAttributes;
-
-    // RS - Rasterizer State
-    SWR_RASTSTATE rastState;
-    // floating point multisample offsets
-    float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
-
-    GUARDBANDS gbState;
-
-    SWR_VIEWPORT          vp[KNOB_NUM_VIEWPORTS_SCISSORS];
-    SWR_VIEWPORT_MATRICES vpMatrices;
-
-    SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
-    SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
-    bool     scissorsTileAligned;
-
-    bool               forceFront;
-    PRIMITIVE_TOPOLOGY topology;
-
-
-    // Backend state
-    OSALIGNLINE(SWR_BACKEND_STATE) backendState;
-
-    SWR_DEPTH_BOUNDS_STATE depthBoundsState;
-
-    // PS - Pixel shader state
-    SWR_PS_STATE psState;
-
-    SWR_DEPTH_STENCIL_STATE depthStencilState;
-
-    // OM - Output Merger State
-    SWR_BLEND_STATE    blendState;
-    PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
-
-    struct
-    {
-        uint32_t enableStatsFE : 1;        // Enable frontend pipeline stats
-        uint32_t enableStatsBE : 1;        // Enable backend pipeline stats
-        uint32_t colorHottileEnable : 8;   // Bitmask of enabled color hottiles
-        uint32_t depthHottileEnable : 1;   // Enable depth buffer hottile
-        uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile
-    };
-
-    PFN_QUANTIZE_DEPTH pfnQuantizeDepth;
-};
-
-class MacroTileMgr;
-class DispatchQueue;
-class HOTTILE;
-
-struct RenderOutputBuffers
-{
-    uint8_t* pColor[SWR_NUM_RENDERTARGETS];
-    uint8_t* pDepth;
-    uint8_t* pStencil;
-
-    HOTTILE* pColorHotTile[SWR_NUM_RENDERTARGETS];
-    HOTTILE* pDepthHotTile;
-    HOTTILE* pStencilHotTile;
-};
-
-// Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
-struct BarycentricCoeffs
-{
-    simdscalar vIa;
-    simdscalar vIb;
-    simdscalar vIc;
-
-    simdscalar vJa;
-    simdscalar vJb;
-    simdscalar vJc;
-
-    simdscalar vZa;
-    simdscalar vZb;
-    simdscalar vZc;
-
-    simdscalar vRecipDet;
-
-    simdscalar vAOneOverW;
-    simdscalar vBOneOverW;
-    simdscalar vCOneOverW;
-};
-
-// pipeline function pointer types
-typedef void (*PFN_BACKEND_FUNC)(
-    DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
-typedef void (*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT&,
-                                  uint8_t* (&)[SWR_NUM_RENDERTARGETS],
-                                  uint32_t,
-                                  const SWR_BLEND_STATE*,
-                                  const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS],
-                                  simdscalar&,
-                                  simdscalar const&);
-typedef void (*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
-typedef void (*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
-typedef void (*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&,
-                                               SWR_PS_CONTEXT&,
-                                               const uint64_t* const,
-                                               const uint32_t,
-                                               simdscalar const&,
-                                               simdscalar const&);
-
-struct BACKEND_FUNCS
-{
-    PFN_BACKEND_FUNC pfnBackend;
-};
-
-// Draw State
-struct DRAW_STATE
-{
-    API_STATE state;
-
-    void* pPrivateState; // Its required the driver sets this up for each draw.
-
-    // pipeline function pointers, filled in by API thread when setting up the draw
-    BACKEND_FUNCS     backendFuncs;
-    PFN_PROCESS_PRIMS pfnProcessPrims;
-#if USE_SIMD16_FRONTEND
-    PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16;
-#endif
-
-    CachingArena* pArena; // This should only be used by API thread.
-};
-
-struct DRAW_DYNAMIC_STATE
-{
-    void Reset(uint32_t numThreads)
-    {
-        SWR_STATS* pSavePtr = pStats;
-        memset(this, 0, sizeof(*this));
-        pStats = pSavePtr;
-        memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
-    }
-    ///@todo Currently assumes only a single FE can do stream output for a draw.
-    uint32_t SoWriteOffset[4];
-    bool     SoWriteOffsetDirty[4];
-
-    SWR_STATS_FE statsFE; // Only one FE thread per DC.
-    SWR_STATS*   pStats;
-    uint64_t     soPrims; // number of primitives written to StreamOut buffer
-};
-
-// Draw Context
-//    The api thread sets up a draw context that exists for the life of the draw.
-//    This draw context maintains all of the state needed for the draw operation.
-struct DRAW_CONTEXT
-{
-    SWR_CONTEXT* pContext;
-    union
-    {
-        MacroTileMgr*  pTileMgr;
-        DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
-    };
-    DRAW_STATE*   pState; // Read-only state. Core should not update this outside of API thread.
-    CachingArena* pArena;
-
-    uint32_t drawId;
-    bool     dependentFE;  // Frontend work is dependent on all previous FE
-    bool     dependent;    // Backend work is dependent on all previous BE
-    bool     isCompute;    // Is this DC a compute context?
-    bool     cleanupState; // True if this is the last draw using an entry in the state ring.
-
-    FE_WORK FeWork;
-
-    SYNC_DESC retireCallback; // Call this func when this DC is retired.
-
-    DRAW_DYNAMIC_STATE dynState;
-
-    volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
-    volatile OSALIGNLINE(uint32_t) FeLock;
-    volatile OSALIGNLINE(uint32_t) threadsDone;
-};
-
-static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
-
-INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
-{
-    SWR_ASSERT(pDC != nullptr);
-    SWR_ASSERT(pDC->pState != nullptr);
-
-    return pDC->pState->state;
-}
-
-INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
-{
-    SWR_ASSERT(pDC != nullptr);
-    SWR_ASSERT(pDC->pState != nullptr);
-
-    return pDC->pState->pPrivateState;
-}
-
-class HotTileMgr;
-
-struct SWR_CONTEXT
-{
-    // Draw Context Ring
-    //  Each draw needs its own state in order to support multiple draws in flight across multiple
-    //  threads. We maintain N draw contexts configured as a ring. The size of the ring limits the
-    //  maximum number of draws that can be in flight at any given time.
-    //
-    //  Description:
-    //  1. State - When an application first sets state we'll request a new draw context to use.
-    //     a. If there are no available draw contexts then we'll have to wait until one becomes
-    //     free. b. If one is available then set pCurDrawContext to point to it and mark it in use.
-    //     c. All state calls set state on pCurDrawContext.
-    //  2. Draw - Creates submits a work item that is associated with current draw context.
-    //     a. Set pPrevDrawContext = pCurDrawContext
-    //     b. Set pCurDrawContext to NULL.
-    //  3. State - When an applications sets state after draw
-    //     a. Same as step 1.
-    //     b. State is copied from prev draw context to current.
-    RingBuffer<DRAW_CONTEXT> dcRing;
-
-    DRAW_CONTEXT* pCurDrawContext;  // This points to DC entry in ring for an unsubmitted draw.
-    DRAW_CONTEXT* pPrevDrawContext; // This points to DC entry for the previous context submitted
-                                    // that we can copy state from.
-
-    MacroTileMgr*  pMacroTileManagerArray;
-    DispatchQueue* pDispatchQueueArray;
-
-    // Draw State Ring
-    //  When draw are very large (lots of primitives) then the API thread will break these up.
-    //  These split draws all have identical state. So instead of storing the state directly
-    //  in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
-    //  to reference a single entry in the DS ring.
-    RingBuffer<DRAW_STATE> dsRing;
-
-    uint32_t curStateId; // Current index to the next available entry in the DS ring.
-
-    uint32_t NumWorkerThreads;
-    uint32_t NumFEThreads;
-    uint32_t NumBEThreads;
-
-    THREAD_POOL              threadPool; // Thread pool associated with this context
-    SWR_THREADING_INFO       threadInfo;
-    SWR_API_THREADING_INFO   apiThreadInfo;
-    SWR_WORKER_PRIVATE_STATE workerPrivateState;
-
-    uint32_t MAX_DRAWS_IN_FLIGHT;
-
-    std::condition_variable FifosNotEmpty;
-    std::mutex              WaitLock;
-
-    uint32_t privateStateSize;
-
-    HotTileMgr* pHotTileMgr;
-
-    // Callback functions, passed in at create context time
-    PFN_LOAD_TILE                  pfnLoadTile;
-    PFN_STORE_TILE                 pfnStoreTile;
-    PFN_TRANSLATE_GFXPTR_FOR_READ  pfnTranslateGfxptrForRead;
-    PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
-    PFN_MAKE_GFXPTR                pfnMakeGfxPtr;
-    PFN_CREATE_MEMORY_CONTEXT      pfnCreateMemoryContext;
-    PFN_DESTROY_MEMORY_CONTEXT     pfnDestroyMemoryContext;
-    PFN_UPDATE_SO_WRITE_OFFSET     pfnUpdateSoWriteOffset;
-    PFN_UPDATE_STATS               pfnUpdateStats;
-    PFN_UPDATE_STATS_FE            pfnUpdateStatsFE;
-    PFN_UPDATE_STREAMOUT           pfnUpdateStreamOut;
-
-
-    // Global Stats
-    SWR_STATS* pStats;
-
-    // Scratch space for workers.
-    uint8_t** ppScratch;
-
-    volatile OSALIGNLINE(uint32_t) drawsOutstandingFE;
-
-    OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
-    uint32_t frameCount;
-
-    uint32_t lastFrameChecked;
-    uint64_t lastDrawChecked;
-    TileSet* pSingleThreadLockedTiles;
-
-    // ArchRast thread contexts.
-    HANDLE* pArContext;
-
-    // handle to external memory for worker data to create memory contexts
-    HANDLE hExternalMemory;
-
-    BucketManager *pBucketMgr;
-};
-
-#define UPDATE_STAT_BE(name, count)                   \
-    if (GetApiState(pDC).enableStatsBE)               \
-    {                                                 \
-        pDC->dynState.pStats[workerId].name += count; \
-    }
-#define UPDATE_STAT_FE(name, count)          \
-    if (GetApiState(pDC).enableStatsFE)      \
-    {                                        \
-        pDC->dynState.statsFE.name += count; \
-    }
-
-// ArchRast instrumentation framework
-#define AR_WORKER_CTX pDC->pContext->pArContext[workerId]
-#define AR_API_CTX pDC->pContext->pArContext[pContext->NumWorkerThreads]
-
-#ifdef KNOB_ENABLE_RDTSC
-#define RDTSC_BEGIN(pBucketMgr, type, drawid) RDTSC_START(pBucketMgr, type)
-#define RDTSC_END(pBucketMgr, type, count) RDTSC_STOP(pBucketMgr, type, count, 0)
-#else
-#define RDTSC_BEGIN(pBucketMgr, type, drawid)
-#define RDTSC_END(pBucketMgr, type, count)
-#endif
-
-#ifdef KNOB_ENABLE_AR
-#define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event)
-#define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id)
-#else
-#define _AR_EVENT(ctx, event)
-#define _AR_FLUSH(ctx, id)
-#endif
-
-// Use these macros for api thread.
-#define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)
-
-// Use these macros for worker threads.
-#define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
-#define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id)
diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
deleted file mode 100644
index 54a3489205a..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
+++ /dev/null
@@ -1,335 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file depthstencil.h
- *
- * @brief Implements depth/stencil functionality
- *
- ******************************************************************************/
-#pragma once
-#include "common/os.h"
-#include "format_conversion.h"
-
-INLINE
-void StencilOp(SWR_STENCILOP     op,
-               simdscalar const& mask,
-               simdscalar const& stencilRefps,
-               simdscalar&       stencilps)
-{
-    simdscalari stencil = _simd_castps_si(stencilps);
-
-    switch (op)
-    {
-    case STENCILOP_KEEP:
-        break;
-    case STENCILOP_ZERO:
-        stencilps = _simd_blendv_ps(stencilps, _simd_setzero_ps(), mask);
-        break;
-    case STENCILOP_REPLACE:
-        stencilps = _simd_blendv_ps(stencilps, stencilRefps, mask);
-        break;
-    case STENCILOP_INCRSAT:
-    {
-        simdscalari stencilincr = _simd_adds_epu8(stencil, _simd_set1_epi32(1));
-        stencilps               = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
-        break;
-    }
-    case STENCILOP_DECRSAT:
-    {
-        simdscalari stencildecr = _simd_subs_epu8(stencil, _simd_set1_epi32(1));
-        stencilps               = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
-        break;
-    }
-    case STENCILOP_INCR:
-    {
-        simdscalari stencilincr = _simd_add_epi8(stencil, _simd_set1_epi32(1));
-        stencilps               = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
-        break;
-    }
-    case STENCILOP_DECR:
-    {
-        simdscalari stencildecr = _simd_add_epi8(stencil, _simd_set1_epi32((-1) & 0xff));
-        stencilps               = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
-        break;
-    }
-    case STENCILOP_INVERT:
-    {
-        simdscalar stencilinvert =
-            _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps()));
-        stencilps = _simd_blendv_ps(stencilps, stencilinvert, mask);
-        break;
-    }
-    default:
-        break;
-    }
-}
-
-template <SWR_FORMAT depthFormatT>
-simdscalar QuantizeDepth(simdscalar const& depth)
-{
-    SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0);
-    uint32_t depthBpc  = FormatTraits<depthFormatT>::GetBPC(0);
-
-    if (depthType == SWR_TYPE_FLOAT)
-    {
-        // assume only 32bit float depth supported
-        SWR_ASSERT(depthBpc == 32);
-
-        // matches shader precision, no quantizing needed
-        return depth;
-    }
-
-    // should be unorm depth if not float
-    SWR_ASSERT(depthType == SWR_TYPE_UNORM);
-
-    float      quantize = (float)((1 << depthBpc) - 1);
-    simdscalar result   = _simd_mul_ps(depth, _simd_set1_ps(quantize));
-    result              = _simd_add_ps(result, _simd_set1_ps(0.5f));
-    result              = _simd_round_ps(result, _MM_FROUND_TO_ZERO);
-
-    if (depthBpc > 16)
-    {
-        result = _simd_div_ps(result, _simd_set1_ps(quantize));
-    }
-    else
-    {
-        result = _simd_mul_ps(result, _simd_set1_ps(1.0f / quantize));
-    }
-
-    return result;
-}
-
-INLINE
-simdscalar DepthStencilTest(const API_STATE*  pState,
-                            bool              frontFacing,
-                            uint32_t          viewportIndex,
-                            simdscalar const& iZ,
-                            uint8_t*          pDepthBase,
-                            simdscalar const& coverageMask,
-                            uint8_t*          pStencilBase,
-                            simdscalar*       pStencilMask)
-{
-    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-    static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");
-
-    const SWR_DEPTH_STENCIL_STATE* pDSState  = &pState->depthStencilState;
-    const SWR_VIEWPORT*            pViewport = &pState->vp[viewportIndex];
-
-    simdscalar depthResult = _simd_set1_ps(-1.0f);
-    simdscalar zbuf;
-
-    // clamp Z to viewport [minZ..maxZ]
-    simdscalar vMinZ   = _simd_broadcast_ss(&pViewport->minZ);
-    simdscalar vMaxZ   = _simd_broadcast_ss(&pViewport->maxZ);
-    simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ));
-
-    if (pDSState->depthTestEnable)
-    {
-        switch (pDSState->depthTestFunc)
-        {
-        case ZFUNC_NEVER:
-            depthResult = _simd_setzero_ps();
-            break;
-        case ZFUNC_ALWAYS:
-            break;
-        default:
-            zbuf = _simd_load_ps((const float*)pDepthBase);
-        }
-
-        switch (pDSState->depthTestFunc)
-        {
-        case ZFUNC_LE:
-            depthResult = _simd_cmple_ps(interpZ, zbuf);
-            break;
-        case ZFUNC_LT:
-            depthResult = _simd_cmplt_ps(interpZ, zbuf);
-            break;
-        case ZFUNC_GT:
-            depthResult = _simd_cmpgt_ps(interpZ, zbuf);
-            break;
-        case ZFUNC_GE:
-            depthResult = _simd_cmpge_ps(interpZ, zbuf);
-            break;
-        case ZFUNC_EQ:
-            depthResult = _simd_cmpeq_ps(interpZ, zbuf);
-            break;
-        case ZFUNC_NE:
-            depthResult = _simd_cmpneq_ps(interpZ, zbuf);
-            break;
-        }
-    }
-
-    simdscalar stencilMask = _simd_set1_ps(-1.0f);
-
-    if (pDSState->stencilTestEnable)
-    {
-        uint8_t  stencilRefValue;
-        uint32_t stencilTestFunc;
-        uint8_t  stencilTestMask;
-        if (frontFacing || !pDSState->doubleSidedStencilTestEnable)
-        {
-            stencilRefValue = pDSState->stencilRefValue;
-            stencilTestFunc = pDSState->stencilTestFunc;
-            stencilTestMask = pDSState->stencilTestMask;
-        }
-        else
-        {
-            stencilRefValue = pDSState->backfaceStencilRefValue;
-            stencilTestFunc = pDSState->backfaceStencilTestFunc;
-            stencilTestMask = pDSState->backfaceStencilTestMask;
-        }
-
-        simdvector sbuf;
-        simdscalar stencilWithMask;
-        simdscalar stencilRef;
-        switch (stencilTestFunc)
-        {
-        case ZFUNC_NEVER:
-            stencilMask = _simd_setzero_ps();
-            break;
-        case ZFUNC_ALWAYS:
-            break;
-        default:
-            LoadSOA<R8_UINT>(pStencilBase, sbuf);
-
-            // apply stencil read mask
-            stencilWithMask = _simd_castsi_ps(
-                _simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask)));
-
-            // do stencil compare in float to avoid simd integer emulation in AVX1
-            stencilWithMask = _simd_cvtepi32_ps(_simd_castps_si(stencilWithMask));
-
-            stencilRef = _simd_set1_ps((float)(stencilRefValue & stencilTestMask));
-            break;
-        }
-
-        switch (stencilTestFunc)
-        {
-        case ZFUNC_LE:
-            stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask);
-            break;
-        case ZFUNC_LT:
-            stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask);
-            break;
-        case ZFUNC_GT:
-            stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask);
-            break;
-        case ZFUNC_GE:
-            stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask);
-            break;
-        case ZFUNC_EQ:
-            stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask);
-            break;
-        case ZFUNC_NE:
-            stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask);
-            break;
-        }
-    }
-
-    simdscalar depthWriteMask = _simd_and_ps(depthResult, stencilMask);
-    depthWriteMask            = _simd_and_ps(depthWriteMask, coverageMask);
-
-    *pStencilMask = stencilMask;
-    return depthWriteMask;
-}
-
-INLINE
-void DepthStencilWrite(const SWR_VIEWPORT*            pViewport,
-                       const SWR_DEPTH_STENCIL_STATE* pDSState,
-                       bool                           frontFacing,
-                       simdscalar const&              iZ,
-                       uint8_t*                       pDepthBase,
-                       const simdscalar&              depthMask,
-                       const simdscalar&              coverageMask,
-                       uint8_t*                       pStencilBase,
-                       const simdscalar&              stencilMask)
-{
-    if (pDSState->depthWriteEnable)
-    {
-        // clamp Z to viewport [minZ..maxZ]
-        simdscalar vMinZ   = _simd_broadcast_ss(&pViewport->minZ);
-        simdscalar vMaxZ   = _simd_broadcast_ss(&pViewport->maxZ);
-        simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ));
-
-        simdscalar vMask = _simd_and_ps(depthMask, coverageMask);
-        _simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(vMask), interpZ);
-    }
-
-    if (pDSState->stencilWriteEnable)
-    {
-        simdvector sbuf;
-        LoadSOA<R8_UINT>(pStencilBase, sbuf);
-        simdscalar stencilbuf = sbuf.v[0];
-
-        uint8_t  stencilRefValue;
-        uint32_t stencilFailOp;
-        uint32_t stencilPassDepthPassOp;
-        uint32_t stencilPassDepthFailOp;
-        uint8_t  stencilWriteMask;
-        if (frontFacing || !pDSState->doubleSidedStencilTestEnable)
-        {
-            stencilRefValue        = pDSState->stencilRefValue;
-            stencilFailOp          = pDSState->stencilFailOp;
-            stencilPassDepthPassOp = pDSState->stencilPassDepthPassOp;
-            stencilPassDepthFailOp = pDSState->stencilPassDepthFailOp;
-            stencilWriteMask       = pDSState->stencilWriteMask;
-        }
-        else
-        {
-            stencilRefValue        = pDSState->backfaceStencilRefValue;
-            stencilFailOp          = pDSState->backfaceStencilFailOp;
-            stencilPassDepthPassOp = pDSState->backfaceStencilPassDepthPassOp;
-            stencilPassDepthFailOp = pDSState->backfaceStencilPassDepthFailOp;
-            stencilWriteMask       = pDSState->backfaceStencilWriteMask;
-        }
-
-        simdscalar stencilps    = stencilbuf;
-        simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(stencilRefValue));
-
-        simdscalar stencilFailMask          = _simd_andnot_ps(stencilMask, coverageMask);
-        simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthMask);
-        simdscalar stencilPassDepthFailMask =
-            _simd_and_ps(stencilMask, _simd_andnot_ps(depthMask, _simd_set1_ps(-1)));
-
-        simdscalar origStencil = stencilps;
-
-        StencilOp((SWR_STENCILOP)stencilFailOp, stencilFailMask, stencilRefps, stencilps);
-        StencilOp((SWR_STENCILOP)stencilPassDepthFailOp,
-                  stencilPassDepthFailMask,
-                  stencilRefps,
-                  stencilps);
-        StencilOp((SWR_STENCILOP)stencilPassDepthPassOp,
-                  stencilPassDepthPassMask,
-                  stencilRefps,
-                  stencilps);
-
-        // apply stencil write mask
-        simdscalari vWriteMask = _simd_set1_epi32(stencilWriteMask);
-        stencilps              = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask));
-        stencilps =
-            _simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps);
-
-        simdvector stencilResult;
-        stencilResult.v[0] = _simd_blendv_ps(origStencil, stencilps, coverageMask);
-        StoreSOA<R8_UINT>(stencilResult, pStencilBase);
-    }
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
deleted file mode 100644
index 9a9cc2635df..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file fifo.hpp
- *
- * @brief Definitions for our fifos used for thread communication.
- *
- ******************************************************************************/
-#pragma once
-
-#include "common/os.h"
-#include "arena.h"
-
-#include <vector>
-#include <cassert>
-
-template <class T>
-struct QUEUE
-{
-    OSALIGNLINE(volatile uint32_t) mLock{0};
-    OSALIGNLINE(volatile uint32_t) mNumEntries{0};
-    std::vector<T*> mBlocks;
-    T*              mCurBlock{nullptr};
-    uint32_t        mHead{0};
-    uint32_t        mTail{0};
-    uint32_t        mCurBlockIdx{0};
-
-    // power of 2
-    static const uint32_t mBlockSizeShift = 6;
-    static const uint32_t mBlockSize      = 1 << mBlockSizeShift;
-
-    template <typename ArenaT>
-    void clear(ArenaT& arena)
-    {
-        mHead = 0;
-        mTail = 0;
-        mBlocks.clear();
-        T* pNewBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4);
-        mBlocks.push_back(pNewBlock);
-        mCurBlock    = pNewBlock;
-        mCurBlockIdx = 0;
-        mNumEntries  = 0;
-        mLock        = 0;
-    }
-
-    uint32_t getNumQueued() { return mNumEntries; }
-
-    bool tryLock()
-    {
-        if (mLock)
-        {
-            return false;
-        }
-
-        // try to lock the FIFO
-        long initial = InterlockedCompareExchange(&mLock, 1, 0);
-        return (initial == 0);
-    }
-
-    void unlock() { mLock = 0; }
-
-    T* peek()
-    {
-        if (mNumEntries == 0)
-        {
-            return nullptr;
-        }
-        uint32_t block = mHead >> mBlockSizeShift;
-        return &mBlocks[block][mHead & (mBlockSize - 1)];
-    }
-
-    void dequeue_noinc()
-    {
-        mHead++;
-        mNumEntries--;
-    }
-
-    template <typename ArenaT>
-    bool enqueue_try_nosync(ArenaT& arena, const T* entry)
-    {
-        const float* pSrc = (const float*)entry;
-        float*       pDst = (float*)&mCurBlock[mTail];
-
-        auto lambda = [&](int32_t i) {
-            __m256 vSrc = _mm256_load_ps(pSrc + i * KNOB_SIMD_WIDTH);
-            _mm256_stream_ps(pDst + i * KNOB_SIMD_WIDTH, vSrc);
-        };
-
-        const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH * 4);
-        static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
-                      "FIFO element size should be multiple of SIMD width.");
-
-        UnrollerL<0, numSimdLines, 1>::step(lambda);
-
-        mTail++;
-        if (mTail == mBlockSize)
-        {
-            if (++mCurBlockIdx < mBlocks.size())
-            {
-                mCurBlock = mBlocks[mCurBlockIdx];
-            }
-            else
-            {
-                T* newBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4);
-                SWR_ASSERT(newBlock);
-
-                mBlocks.push_back(newBlock);
-                mCurBlock = newBlock;
-            }
-
-            mTail = 0;
-        }
-
-        mNumEntries++;
-        return true;
-    }
-
-    void destroy() {}
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
deleted file mode 100644
index f1ea06c4978..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
+++ /dev/null
@@ -1,262 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file format_conversion.h
- *
- * @brief API implementation
- *
- ******************************************************************************/
-#include "format_types.h"
-#include "format_traits.h"
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Load SIMD packed pixels in SOA format and converts to
-///        SOA RGBA32_FLOAT format.
-/// @param pSrc - source data in SOA form
-/// @param dst - output data in SOA form
-template <typename SIMD_T, SWR_FORMAT SrcFormat>
-INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, Vec4<SIMD_T>& dst)
-{
-    // fast path for float32
-    if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
-        (FormatTraits<SrcFormat>::GetBPC(0) == 32))
-    {
-        auto lambda = [&](int comp)
-        {
-            Float<SIMD_T> vComp =
-                SIMD_T::load_ps(reinterpret_cast<const float*>(pSrc + comp * sizeof(Float<SIMD_T>)));
-
-            dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
-        };
-
-        UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
-        return;
-    }
-
-    auto lambda = [&](int comp)
-    {
-        // load SIMD components
-        Float<SIMD_T> vComp;
-        FormatTraits<SrcFormat>::loadSOA(comp, pSrc, vComp);
-
-        // unpack
-        vComp = FormatTraits<SrcFormat>::unpack(comp, vComp);
-
-        // convert
-        if (FormatTraits<SrcFormat>::isNormalized(comp))
-        {
-            vComp = SIMD_T::cvtepi32_ps(SIMD_T::castps_si(vComp));
-            vComp = SIMD_T::mul_ps(vComp, SIMD_T::set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
-        }
-
-        dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
-
-        // is there a better way to get this from the SIMD traits?
-        const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float);
-
-        pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * SIMD_WIDTH) / 8;
-    };
-
-    UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
-}
-
-template <SWR_FORMAT SrcFormat>
-INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simdvector& dst)
-{
-    LoadSOA<SIMD256, SrcFormat>(pSrc, dst);
-}
-
-template <SWR_FORMAT SrcFormat>
-INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst)
-{
-    LoadSOA<SIMD512, SrcFormat>(pSrc, dst);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Clamps the given component based on the requirements on the
-///        Format template arg
-/// @param vComp - SIMD vector of floats
-/// @param Component - component
-template <typename SIMD_T, SWR_FORMAT Format>
-INLINE Float<SIMD_T> SIMDCALL Clamp(Float<SIMD_T> const& v, uint32_t Component)
-{
-    Float<SIMD_T> vComp = v;
-    if (Component >= 4 || Component < 0)
-    {
-	// Component shouldn't out of <0;3> range
-	assert(false);
-	return vComp;
-    }
-    if (FormatTraits<Format>::isNormalized(Component))
-    {
-        if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
-        {
-            vComp = SIMD_T::max_ps(vComp, SIMD_T::setzero_ps());
-        }
-
-        if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM)
-        {
-            vComp = SIMD_T::max_ps(vComp, SIMD_T::set1_ps(-1.0f));
-        }
-        vComp = SIMD_T::min_ps(vComp, SIMD_T::set1_ps(1.0f));
-    }
-    else if (FormatTraits<Format>::GetBPC(Component) < 32)
-    {
-        if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
-        {
-            int           iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
-            int           iMin = 0;
-            Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp);
-            vCompi = SIMD_T::max_epu32(vCompi, SIMD_T::set1_epi32(iMin));
-            vCompi = SIMD_T::min_epu32(vCompi, SIMD_T::set1_epi32(iMax));
-            vComp = SIMD_T::castsi_ps(vCompi);
-        }
-        else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
-        {
-            int           iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
-            int           iMin = -1 - iMax;
-            Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp);
-            vCompi = SIMD_T::max_epi32(vCompi, SIMD_T::set1_epi32(iMin));
-            vCompi = SIMD_T::min_epi32(vCompi, SIMD_T::set1_epi32(iMax));
-            vComp = SIMD_T::castsi_ps(vCompi);
-        }
-    }
-
-    return vComp;
-}
-
-template <SWR_FORMAT Format>
-INLINE simdscalar SIMDCALL Clamp(simdscalar const& v, uint32_t Component)
-{
-    return Clamp<SIMD256, Format>(v, Component);
-}
-
-template <SWR_FORMAT Format>
-INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component)
-{
-    return Clamp<SIMD512, Format>(v, Component);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Normalize the given component based on the requirements on the
-///        Format template arg
-/// @param vComp - SIMD vector of floats
-/// @param Component - component
-template <typename SIMD_T, SWR_FORMAT Format>
-INLINE Float<SIMD_T> SIMDCALL Normalize(Float<SIMD_T> const& vComp, uint32_t Component)
-{
-    Float<SIMD_T> r = vComp;
-    if (FormatTraits<Format>::isNormalized(Component))
-    {
-        r = SIMD_T::mul_ps(r, SIMD_T::set1_ps(FormatTraits<Format>::fromFloat(Component)));
-        r = SIMD_T::castsi_ps(SIMD_T::cvtps_epi32(r));
-    }
-    return r;
-}
-
-template <SWR_FORMAT Format>
-INLINE simdscalar SIMDCALL Normalize(simdscalar const& vComp, uint32_t Component)
-{
-    return Normalize<SIMD256, Format>(vComp, Component);
-}
-
-template <SWR_FORMAT Format>
-INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Component)
-{
-    return Normalize<SIMD512, Format>(vComp, Component);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Convert and store simdvector of pixels in SOA
-///        RGBA32_FLOAT to SOA format
-/// @param src - source data in SOA form
-/// @param dst - output data in SOA form
-template <typename SIMD_T, SWR_FORMAT DstFormat>
-INLINE void SIMDCALL StoreSOA(const Vec4<SIMD_T>& src, uint8_t* pDst)
-{
-    // fast path for float32
-    if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
-        (FormatTraits<DstFormat>::GetBPC(0) == 32))
-    {
-        for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
-        {
-            Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
-
-            // Gamma-correct
-            if (FormatTraits<DstFormat>::isSRGB)
-            {
-                if (comp < 3) // Input format is always RGBA32_FLOAT.
-                {
-                    vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
-                }
-            }
-
-            SIMD_T::store_ps(reinterpret_cast<float*>(pDst + comp * sizeof(simd16scalar)), vComp);
-        }
-        return;
-    }
-
-    auto lambda = [&](int comp) {
-        Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
-
-        // Gamma-correct
-        if (FormatTraits<DstFormat>::isSRGB)
-        {
-            if (comp < 3) // Input format is always RGBA32_FLOAT.
-            {
-                vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
-            }
-        }
-
-        // clamp
-        vComp = Clamp<SIMD_T, DstFormat>(vComp, comp);
-
-        // normalize
-        vComp = Normalize<SIMD_T, DstFormat>(vComp, comp);
-
-        // pack
-        vComp = FormatTraits<DstFormat>::pack(comp, vComp);
-
-        // store
-        FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp);
-
-        // is there a better way to get this from the SIMD traits?
-        const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float);
-
-        pDst += (FormatTraits<DstFormat>::GetBPC(comp) * SIMD_WIDTH) / 8;
-    };
-
-    UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda);
-}
-
-template <SWR_FORMAT DstFormat>
-INLINE void SIMDCALL StoreSOA(const simdvector& src, uint8_t* pDst)
-{
-    StoreSOA<SIMD256, DstFormat>(src, pDst);
-}
-
-template <SWR_FORMAT DstFormat>
-INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
-{
-    StoreSOA<SIMD512, DstFormat>(src, pDst);
-}
-
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h b/src/gallium/drivers/swr/rasterizer/core/format_traits.h
deleted file mode 100644
index 97e7d56e48e..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/format_traits.h
+++ /dev/null
@@ -1,4046 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file format_traits.h
- *
- * @brief Format Traits.  auto-generated file
- *
- * DO NOT EDIT
- *
- ******************************************************************************/
-#pragma once
-
-#include "format_types.h"
-#include "format_utils.h"
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatSwizzle - Component swizzle selects
-//////////////////////////////////////////////////////////////////////////
-template <uint32_t comp0 = 0, uint32_t comp1 = 0, uint32_t comp2 = 0, uint32_t comp3 = 0>
-struct FormatSwizzle
-{
-    // Return swizzle select for component.
-    INLINE static uint32_t swizzle(uint32_t c)
-    {
-        static const uint32_t s[4] = {comp0, comp1, comp2, comp3};
-        return s[c];
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits - Format traits
-//////////////////////////////////////////////////////////////////////////
-template <SWR_FORMAT format>
-struct FormatTraits : ComponentTraits<SWR_TYPE_UNKNOWN, 0>, FormatSwizzle<0>, Defaults<0, 0, 0, 0>
-{
-    static const uint32_t bpp{0};
-    static const uint32_t numComps{0};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32B32A32_FLOAT> - Format traits specialization for R32G32B32A32_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32B32A32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT,
-                                                          32,
-                                                          SWR_TYPE_FLOAT,
-                                                          32,
-                                                          SWR_TYPE_FLOAT,
-                                                          32,
-                                                          SWR_TYPE_FLOAT,
-                                                          32>,
-                                          FormatSwizzle<0, 1, 2, 3>,
-                                          Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32_32_32    TransposeT;
-    typedef Format4<32, 32, 32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32B32A32_SINT> - Format traits specialization for R32G32B32A32_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32B32A32_SINT>
-    : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32_32_32    TransposeT;
-    typedef Format4<32, 32, 32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32B32A32_UINT> - Format traits specialization for R32G32B32A32_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32B32A32_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32_32_32    TransposeT;
-    typedef Format4<32, 32, 32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R64G64_FLOAT> - Format traits specialization for R64G64_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R64G64_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>,
-                                    FormatSwizzle<0, 1>,
-                                    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose64_64  TransposeT;
-    typedef Format2<64, 64> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32B32X32_FLOAT> - Format traits specialization for R32G32B32X32_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32B32X32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT,
-                                                          32,
-                                                          SWR_TYPE_FLOAT,
-                                                          32,
-                                                          SWR_TYPE_FLOAT,
-                                                          32,
-                                                          SWR_TYPE_UNUSED,
-                                                          32>,
-                                          FormatSwizzle<0, 1, 2, 3>,
-                                          Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32_32_32    TransposeT;
-    typedef Format4<32, 32, 32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32B32A32_SSCALED> - Format traits specialization for R32G32B32A32_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32B32A32_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED,
-                                                            32,
-                                                            SWR_TYPE_SSCALED,
-                                                            32,
-                                                            SWR_TYPE_SSCALED,
-                                                            32,
-                                                            SWR_TYPE_SSCALED,
-                                                            32>,
-                                            FormatSwizzle<0, 1, 2, 3>,
-                                            Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32_32_32    TransposeT;
-    typedef Format4<32, 32, 32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32B32A32_USCALED> - Format traits specialization for R32G32B32A32_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32B32A32_USCALED> : ComponentTraits<SWR_TYPE_USCALED,
-                                                            32,
-                                                            SWR_TYPE_USCALED,
-                                                            32,
-                                                            SWR_TYPE_USCALED,
-                                                            32,
-                                                            SWR_TYPE_USCALED,
-                                                            32>,
-                                            FormatSwizzle<0, 1, 2, 3>,
-                                            Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32_32_32    TransposeT;
-    typedef Format4<32, 32, 32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32B32A32_SFIXED> - Format traits specialization for R32G32B32A32_SFIXED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32B32A32_SFIXED> : ComponentTraits<SWR_TYPE_SFIXED,
-                                                           32,
-                                                           SWR_TYPE_SFIXED,
-                                                           32,
-                                                           SWR_TYPE_SFIXED,
-                                                           32,
-                                                           SWR_TYPE_SFIXED,
-                                                           32>,
-                                           FormatSwizzle<0, 1, 2, 3>,
-                                           Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32_32_32    TransposeT;
-    typedef Format4<32, 32, 32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32B32_FLOAT> - Format traits specialization for R32G32B32_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32B32_FLOAT>
-    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{96};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32_32   TransposeT;
-    typedef Format3<32, 32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32B32_SINT> - Format traits specialization for R32G32B32_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32B32_SINT>
-    : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{96};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32_32   TransposeT;
-    typedef Format3<32, 32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32B32_UINT> - Format traits specialization for R32G32B32_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32B32_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{96};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32_32   TransposeT;
-    typedef Format3<32, 32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32B32_SSCALED> - Format traits specialization for R32G32B32_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32B32_SSCALED>
-    : ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{96};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32_32   TransposeT;
-    typedef Format3<32, 32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32B32_USCALED> - Format traits specialization for R32G32B32_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32B32_USCALED>
-    : ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{96};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32_32   TransposeT;
-    typedef Format3<32, 32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32B32_SFIXED> - Format traits specialization for R32G32B32_SFIXED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32B32_SFIXED>
-    : ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{96};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32_32   TransposeT;
-    typedef Format3<32, 32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16A16_UNORM> - Format traits specialization for R16G16B16A16_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16A16_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
-                                                          16,
-                                                          SWR_TYPE_UNORM,
-                                                          16,
-                                                          SWR_TYPE_UNORM,
-                                                          16,
-                                                          SWR_TYPE_UNORM,
-                                                          16>,
-                                          FormatSwizzle<0, 1, 2, 3>,
-                                          Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16_16    TransposeT;
-    typedef Format4<16, 16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16A16_SNORM> - Format traits specialization for R16G16B16A16_SNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16A16_SNORM> : ComponentTraits<SWR_TYPE_SNORM,
-                                                          16,
-                                                          SWR_TYPE_SNORM,
-                                                          16,
-                                                          SWR_TYPE_SNORM,
-                                                          16,
-                                                          SWR_TYPE_SNORM,
-                                                          16>,
-                                          FormatSwizzle<0, 1, 2, 3>,
-                                          Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16_16    TransposeT;
-    typedef Format4<16, 16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16A16_SINT> - Format traits specialization for R16G16B16A16_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16A16_SINT>
-    : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16_16    TransposeT;
-    typedef Format4<16, 16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16A16_UINT> - Format traits specialization for R16G16B16A16_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16A16_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16_16    TransposeT;
-    typedef Format4<16, 16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16A16_FLOAT> - Format traits specialization for R16G16B16A16_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16A16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT,
-                                                          16,
-                                                          SWR_TYPE_FLOAT,
-                                                          16,
-                                                          SWR_TYPE_FLOAT,
-                                                          16,
-                                                          SWR_TYPE_FLOAT,
-                                                          16>,
-                                          FormatSwizzle<0, 1, 2, 3>,
-                                          Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16_16    TransposeT;
-    typedef Format4<16, 16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32_FLOAT> - Format traits specialization for R32G32_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-                                    FormatSwizzle<0, 1>,
-                                    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32  TransposeT;
-    typedef Format2<32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32_SINT> - Format traits specialization for R32G32_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32_SINT> : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
-                                   FormatSwizzle<0, 1>,
-                                   Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32  TransposeT;
-    typedef Format2<32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32_UINT> - Format traits specialization for R32G32_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32_UINT> : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
-                                   FormatSwizzle<0, 1>,
-                                   Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32  TransposeT;
-    typedef Format2<32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32_FLOAT_X8X24_TYPELESS> - Format traits specialization for
-/// R32_FLOAT_X8X24_TYPELESS
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32_FLOAT_X8X24_TYPELESS>
-    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>,
-      FormatSwizzle<0, 1>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32  TransposeT;
-    typedef Format2<32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<X32_TYPELESS_G8X24_UINT> - Format traits specialization for X32_TYPELESS_G8X24_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<X32_TYPELESS_G8X24_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UNUSED, 32>,
-      FormatSwizzle<0, 1>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32  TransposeT;
-    typedef Format2<32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L32A32_FLOAT> - Format traits specialization for L32A32_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L32A32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-                                    FormatSwizzle<0, 3>,
-                                    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{1};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32  TransposeT;
-    typedef Format2<32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R64_FLOAT> - Format traits specialization for R64_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R64_FLOAT>
-    : ComponentTraits<SWR_TYPE_FLOAT, 64>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<64> TransposeT;
-    typedef Format1<64>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16X16_UNORM> - Format traits specialization for R16G16B16X16_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16X16_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
-                                                          16,
-                                                          SWR_TYPE_UNORM,
-                                                          16,
-                                                          SWR_TYPE_UNORM,
-                                                          16,
-                                                          SWR_TYPE_UNUSED,
-                                                          16>,
-                                          FormatSwizzle<0, 1, 2, 3>,
-                                          Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16_16    TransposeT;
-    typedef Format4<16, 16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16X16_FLOAT> - Format traits specialization for R16G16B16X16_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16X16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT,
-                                                          16,
-                                                          SWR_TYPE_FLOAT,
-                                                          16,
-                                                          SWR_TYPE_FLOAT,
-                                                          16,
-                                                          SWR_TYPE_UNUSED,
-                                                          16>,
-                                          FormatSwizzle<0, 1, 2, 3>,
-                                          Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16_16    TransposeT;
-    typedef Format4<16, 16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L32X32_FLOAT> - Format traits specialization for L32X32_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L32X32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-                                    FormatSwizzle<0, 3>,
-                                    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32  TransposeT;
-    typedef Format2<32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<I32X32_FLOAT> - Format traits specialization for I32X32_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<I32X32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-                                    FormatSwizzle<0, 3>,
-                                    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32  TransposeT;
-    typedef Format2<32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16A16_SSCALED> - Format traits specialization for R16G16B16A16_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16A16_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED,
-                                                            16,
-                                                            SWR_TYPE_SSCALED,
-                                                            16,
-                                                            SWR_TYPE_SSCALED,
-                                                            16,
-                                                            SWR_TYPE_SSCALED,
-                                                            16>,
-                                            FormatSwizzle<0, 1, 2, 3>,
-                                            Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16_16    TransposeT;
-    typedef Format4<16, 16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16A16_USCALED> - Format traits specialization for R16G16B16A16_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16A16_USCALED> : ComponentTraits<SWR_TYPE_USCALED,
-                                                            16,
-                                                            SWR_TYPE_USCALED,
-                                                            16,
-                                                            SWR_TYPE_USCALED,
-                                                            16,
-                                                            SWR_TYPE_USCALED,
-                                                            16>,
-                                            FormatSwizzle<0, 1, 2, 3>,
-                                            Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16_16    TransposeT;
-    typedef Format4<16, 16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32_SSCALED> - Format traits specialization for R32G32_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
-                                      FormatSwizzle<0, 1>,
-                                      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32  TransposeT;
-    typedef Format2<32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32_USCALED> - Format traits specialization for R32G32_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32_USCALED> : ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
-                                      FormatSwizzle<0, 1>,
-                                      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32  TransposeT;
-    typedef Format2<32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32G32_SFIXED> - Format traits specialization for R32G32_SFIXED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32G32_SFIXED> : ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>,
-                                     FormatSwizzle<0, 1>,
-                                     Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose32_32  TransposeT;
-    typedef Format2<32, 32> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B8G8R8A8_UNORM> - Format traits specialization for B8G8R8A8_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B8G8R8A8_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-      FormatSwizzle<2, 1, 0, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B8G8R8A8_UNORM_SRGB> - Format traits specialization for B8G8R8A8_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B8G8R8A8_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-      FormatSwizzle<2, 1, 0, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{true};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R10G10B10A2_UNORM> - Format traits specialization for R10G10B10A2_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R10G10B10A2_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
-                                                         10,
-                                                         SWR_TYPE_UNORM,
-                                                         10,
-                                                         SWR_TYPE_UNORM,
-                                                         10,
-                                                         SWR_TYPE_UNORM,
-                                                         2>,
-                                         FormatSwizzle<0, 1, 2, 3>,
-                                         Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R10G10B10A2_UNORM_SRGB> - Format traits specialization for R10G10B10A2_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R10G10B10A2_UNORM_SRGB> : ComponentTraits<SWR_TYPE_UNORM,
-                                                              10,
-                                                              SWR_TYPE_UNORM,
-                                                              10,
-                                                              SWR_TYPE_UNORM,
-                                                              10,
-                                                              SWR_TYPE_UNORM,
-                                                              2>,
-                                              FormatSwizzle<0, 1, 2, 3>,
-                                              Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{true};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R10G10B10A2_UINT> - Format traits specialization for R10G10B10A2_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R10G10B10A2_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8A8_UNORM> - Format traits specialization for R8G8B8A8_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8A8_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8A8_UNORM_SRGB> - Format traits specialization for R8G8B8A8_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8A8_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{true};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8A8_SNORM> - Format traits specialization for R8G8B8A8_SNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8A8_SNORM>
-    : ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8A8_SINT> - Format traits specialization for R8G8B8A8_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8A8_SINT>
-    : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8A8_UINT> - Format traits specialization for R8G8B8A8_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8A8_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16_UNORM> - Format traits specialization for R16G16_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
-                                    FormatSwizzle<0, 1>,
-                                    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16  TransposeT;
-    typedef Format2<16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16_SNORM> - Format traits specialization for R16G16_SNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16_SNORM> : ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
-                                    FormatSwizzle<0, 1>,
-                                    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16  TransposeT;
-    typedef Format2<16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16_SINT> - Format traits specialization for R16G16_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16_SINT> : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
-                                   FormatSwizzle<0, 1>,
-                                   Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16  TransposeT;
-    typedef Format2<16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16_UINT> - Format traits specialization for R16G16_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16_UINT> : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
-                                   FormatSwizzle<0, 1>,
-                                   Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16  TransposeT;
-    typedef Format2<16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16_FLOAT> - Format traits specialization for R16G16_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
-                                    FormatSwizzle<0, 1>,
-                                    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16  TransposeT;
-    typedef Format2<16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B10G10R10A2_UNORM> - Format traits specialization for B10G10R10A2_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B10G10R10A2_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
-                                                         10,
-                                                         SWR_TYPE_UNORM,
-                                                         10,
-                                                         SWR_TYPE_UNORM,
-                                                         10,
-                                                         SWR_TYPE_UNORM,
-                                                         2>,
-                                         FormatSwizzle<2, 1, 0, 3>,
-                                         Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B10G10R10A2_UNORM_SRGB> - Format traits specialization for B10G10R10A2_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B10G10R10A2_UNORM_SRGB> : ComponentTraits<SWR_TYPE_UNORM,
-                                                              10,
-                                                              SWR_TYPE_UNORM,
-                                                              10,
-                                                              SWR_TYPE_UNORM,
-                                                              10,
-                                                              SWR_TYPE_UNORM,
-                                                              2>,
-                                              FormatSwizzle<2, 1, 0, 3>,
-                                              Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{true};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R11G11B10_FLOAT> - Format traits specialization for R11G11B10_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R11G11B10_FLOAT>
-    : ComponentTraits<SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 10>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose11_11_10   TransposeT;
-    typedef Format3<11, 11, 10> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R10G10B10_FLOAT_A2_UNORM> - Format traits specialization for
-/// R10G10B10_FLOAT_A2_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R10G10B10_FLOAT_A2_UNORM> : ComponentTraits<SWR_TYPE_FLOAT,
-                                                                10,
-                                                                SWR_TYPE_FLOAT,
-                                                                10,
-                                                                SWR_TYPE_FLOAT,
-                                                                10,
-                                                                SWR_TYPE_UNORM,
-                                                                2>,
-                                                FormatSwizzle<0, 1, 2, 3>,
-                                                Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32_SINT> - Format traits specialization for R32_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32_SINT>
-    : ComponentTraits<SWR_TYPE_SINT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<32> TransposeT;
-    typedef Format1<32>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32_UINT> - Format traits specialization for R32_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<32> TransposeT;
-    typedef Format1<32>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32_FLOAT> - Format traits specialization for R32_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32_FLOAT>
-    : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<32> TransposeT;
-    typedef Format1<32>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R24_UNORM_X8_TYPELESS> - Format traits specialization for R24_UNORM_X8_TYPELESS
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R24_UNORM_X8_TYPELESS>
-    : ComponentTraits<SWR_TYPE_UNORM, 24>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<32> TransposeT;
-    typedef Format1<24>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<X24_TYPELESS_G8_UINT> - Format traits specialization for X24_TYPELESS_G8_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<X24_TYPELESS_G8_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 32>, FormatSwizzle<1>, Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<32> TransposeT;
-    typedef Format1<32>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L32_UNORM> - Format traits specialization for L32_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L32_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<32> TransposeT;
-    typedef Format1<32>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L16A16_UNORM> - Format traits specialization for L16A16_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L16A16_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
-                                    FormatSwizzle<0, 3>,
-                                    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{1};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16  TransposeT;
-    typedef Format2<16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<I24X8_UNORM> - Format traits specialization for I24X8_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<I24X8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>,
-                                   FormatSwizzle<0, 3>,
-                                   Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose24_8  TransposeT;
-    typedef Format2<24, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L24X8_UNORM> - Format traits specialization for L24X8_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L24X8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>,
-                                   FormatSwizzle<0, 3>,
-                                   Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose24_8  TransposeT;
-    typedef Format2<24, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<I32_FLOAT> - Format traits specialization for I32_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<I32_FLOAT>
-    : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<32> TransposeT;
-    typedef Format1<32>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L32_FLOAT> - Format traits specialization for L32_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L32_FLOAT>
-    : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<32> TransposeT;
-    typedef Format1<32>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<A32_FLOAT> - Format traits specialization for A32_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<A32_FLOAT>
-    : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<32> TransposeT;
-    typedef Format1<32>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B8G8R8X8_UNORM> - Format traits specialization for B8G8R8X8_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B8G8R8X8_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>,
-      FormatSwizzle<2, 1, 0, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B8G8R8X8_UNORM_SRGB> - Format traits specialization for B8G8R8X8_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B8G8R8X8_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>,
-      FormatSwizzle<2, 1, 0, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{true};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8X8_UNORM> - Format traits specialization for R8G8B8X8_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8X8_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8X8_UNORM_SRGB> - Format traits specialization for R8G8B8X8_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8X8_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{true};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R9G9B9E5_SHAREDEXP> - Format traits specialization for R9G9B9E5_SHAREDEXP
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R9G9B9E5_SHAREDEXP>
-    : ComponentTraits<SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 5>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose9_9_9_5    TransposeT;
-    typedef Format4<9, 9, 9, 5> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B10G10R10X2_UNORM> - Format traits specialization for B10G10R10X2_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B10G10R10X2_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
-                                                         10,
-                                                         SWR_TYPE_UNORM,
-                                                         10,
-                                                         SWR_TYPE_UNORM,
-                                                         10,
-                                                         SWR_TYPE_UNUSED,
-                                                         2>,
-                                         FormatSwizzle<2, 1, 0, 3>,
-                                         Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L16A16_FLOAT> - Format traits specialization for L16A16_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L16A16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
-                                    FormatSwizzle<0, 3>,
-                                    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{1};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16  TransposeT;
-    typedef Format2<16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R10G10B10X2_USCALED> - Format traits specialization for R10G10B10X2_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R10G10B10X2_USCALED> : ComponentTraits<SWR_TYPE_USCALED,
-                                                           10,
-                                                           SWR_TYPE_USCALED,
-                                                           10,
-                                                           SWR_TYPE_USCALED,
-                                                           10,
-                                                           SWR_TYPE_UNUSED,
-                                                           2>,
-                                           FormatSwizzle<0, 1, 2, 3>,
-                                           Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8A8_SSCALED> - Format traits specialization for R8G8B8A8_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8A8_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED,
-                                                        8,
-                                                        SWR_TYPE_SSCALED,
-                                                        8,
-                                                        SWR_TYPE_SSCALED,
-                                                        8,
-                                                        SWR_TYPE_SSCALED,
-                                                        8>,
-                                        FormatSwizzle<0, 1, 2, 3>,
-                                        Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8A8_USCALED> - Format traits specialization for R8G8B8A8_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8A8_USCALED> : ComponentTraits<SWR_TYPE_USCALED,
-                                                        8,
-                                                        SWR_TYPE_USCALED,
-                                                        8,
-                                                        SWR_TYPE_USCALED,
-                                                        8,
-                                                        SWR_TYPE_USCALED,
-                                                        8>,
-                                        FormatSwizzle<0, 1, 2, 3>,
-                                        Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16_SSCALED> - Format traits specialization for R16G16_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>,
-                                      FormatSwizzle<0, 1>,
-                                      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16  TransposeT;
-    typedef Format2<16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16_USCALED> - Format traits specialization for R16G16_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16_USCALED> : ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>,
-                                      FormatSwizzle<0, 1>,
-                                      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16  TransposeT;
-    typedef Format2<16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32_SSCALED> - Format traits specialization for R32_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32_SSCALED>
-    : ComponentTraits<SWR_TYPE_SSCALED, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<32> TransposeT;
-    typedef Format1<32>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32_USCALED> - Format traits specialization for R32_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32_USCALED>
-    : ComponentTraits<SWR_TYPE_USCALED, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<32> TransposeT;
-    typedef Format1<32>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B5G6R5_UNORM> - Format traits specialization for B5G6R5_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B5G6R5_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>,
-      FormatSwizzle<2, 1, 0>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose5_6_5   TransposeT;
-    typedef Format3<5, 6, 5> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B5G6R5_UNORM_SRGB> - Format traits specialization for B5G6R5_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B5G6R5_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>,
-      FormatSwizzle<2, 1, 0>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{true};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose5_6_5   TransposeT;
-    typedef Format3<5, 6, 5> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B5G5R5A1_UNORM> - Format traits specialization for B5G5R5A1_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B5G5R5A1_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>,
-      FormatSwizzle<2, 1, 0, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose5_5_5_1    TransposeT;
-    typedef Format4<5, 5, 5, 1> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B5G5R5A1_UNORM_SRGB> - Format traits specialization for B5G5R5A1_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B5G5R5A1_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>,
-      FormatSwizzle<2, 1, 0, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{true};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose5_5_5_1    TransposeT;
-    typedef Format4<5, 5, 5, 1> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B4G4R4A4_UNORM> - Format traits specialization for B4G4R4A4_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B4G4R4A4_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>,
-      FormatSwizzle<2, 1, 0, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose4_4_4_4    TransposeT;
-    typedef Format4<4, 4, 4, 4> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B4G4R4A4_UNORM_SRGB> - Format traits specialization for B4G4R4A4_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B4G4R4A4_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>,
-      FormatSwizzle<2, 1, 0, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{true};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose4_4_4_4    TransposeT;
-    typedef Format4<4, 4, 4, 4> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8_UNORM> - Format traits specialization for R8G8_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-                                  FormatSwizzle<0, 1>,
-                                  Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8  TransposeT;
-    typedef Format2<8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8_SNORM> - Format traits specialization for R8G8_SNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8_SNORM> : ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
-                                  FormatSwizzle<0, 1>,
-                                  Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8  TransposeT;
-    typedef Format2<8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8_SINT> - Format traits specialization for R8G8_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8_SINT> : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
-                                 FormatSwizzle<0, 1>,
-                                 Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8  TransposeT;
-    typedef Format2<8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8_UINT> - Format traits specialization for R8G8_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8_UINT> : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
-                                 FormatSwizzle<0, 1>,
-                                 Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8  TransposeT;
-    typedef Format2<8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16_UNORM> - Format traits specialization for R16_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<16> TransposeT;
-    typedef Format1<16>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16_SNORM> - Format traits specialization for R16_SNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16_SNORM>
-    : ComponentTraits<SWR_TYPE_SNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<16> TransposeT;
-    typedef Format1<16>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16_SINT> - Format traits specialization for R16_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16_SINT>
-    : ComponentTraits<SWR_TYPE_SINT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<16> TransposeT;
-    typedef Format1<16>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16_UINT> - Format traits specialization for R16_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<16> TransposeT;
-    typedef Format1<16>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16_FLOAT> - Format traits specialization for R16_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16_FLOAT>
-    : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<16> TransposeT;
-    typedef Format1<16>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<I16_UNORM> - Format traits specialization for I16_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<I16_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<16> TransposeT;
-    typedef Format1<16>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L16_UNORM> - Format traits specialization for L16_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L16_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<16> TransposeT;
-    typedef Format1<16>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<A16_UNORM> - Format traits specialization for A16_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<A16_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<16> TransposeT;
-    typedef Format1<16>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L8A8_UNORM> - Format traits specialization for L8A8_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L8A8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-                                  FormatSwizzle<0, 3>,
-                                  Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{1};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8  TransposeT;
-    typedef Format2<8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<I16_FLOAT> - Format traits specialization for I16_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<I16_FLOAT>
-    : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<16> TransposeT;
-    typedef Format1<16>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L16_FLOAT> - Format traits specialization for L16_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L16_FLOAT>
-    : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<16> TransposeT;
-    typedef Format1<16>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<A16_FLOAT> - Format traits specialization for A16_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<A16_FLOAT>
-    : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<16> TransposeT;
-    typedef Format1<16>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L8A8_UNORM_SRGB> - Format traits specialization for L8A8_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L8A8_UNORM_SRGB> : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-                                       FormatSwizzle<0, 3>,
-                                       Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{1};
-    static const bool     isSRGB{true};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8  TransposeT;
-    typedef Format2<8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B5G5R5X1_UNORM> - Format traits specialization for B5G5R5X1_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B5G5R5X1_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>,
-      FormatSwizzle<2, 1, 0, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose5_5_5_1    TransposeT;
-    typedef Format4<5, 5, 5, 1> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B5G5R5X1_UNORM_SRGB> - Format traits specialization for B5G5R5X1_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B5G5R5X1_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>,
-      FormatSwizzle<2, 1, 0, 3>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{true};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose5_5_5_1    TransposeT;
-    typedef Format4<5, 5, 5, 1> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8_SSCALED> - Format traits specialization for R8G8_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>,
-                                    FormatSwizzle<0, 1>,
-                                    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8  TransposeT;
-    typedef Format2<8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8_USCALED> - Format traits specialization for R8G8_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8_USCALED> : ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>,
-                                    FormatSwizzle<0, 1>,
-                                    Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8  TransposeT;
-    typedef Format2<8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16_SSCALED> - Format traits specialization for R16_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16_SSCALED>
-    : ComponentTraits<SWR_TYPE_SSCALED, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<16> TransposeT;
-    typedef Format1<16>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16_USCALED> - Format traits specialization for R16_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16_USCALED>
-    : ComponentTraits<SWR_TYPE_USCALED, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<16> TransposeT;
-    typedef Format1<16>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<A1B5G5R5_UNORM> - Format traits specialization for A1B5G5R5_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<A1B5G5R5_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 1, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5>,
-      FormatSwizzle<3, 2, 1, 0>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose1_5_5_5    TransposeT;
-    typedef Format4<1, 5, 5, 5> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<A4B4G4R4_UNORM> - Format traits specialization for A4B4G4R4_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<A4B4G4R4_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>,
-      FormatSwizzle<3, 2, 1, 0>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose4_4_4_4    TransposeT;
-    typedef Format4<4, 4, 4, 4> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L8A8_UINT> - Format traits specialization for L8A8_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L8A8_UINT> : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
-                                 FormatSwizzle<0, 3>,
-                                 Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{1};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8  TransposeT;
-    typedef Format2<8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L8A8_SINT> - Format traits specialization for L8A8_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L8A8_SINT> : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
-                                 FormatSwizzle<0, 3>,
-                                 Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{16};
-    static const uint32_t numComps{2};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{1};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8  TransposeT;
-    typedef Format2<8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8_UNORM> - Format traits specialization for R8_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8_SNORM> - Format traits specialization for R8_SNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8_SNORM>
-    : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8_SINT> - Format traits specialization for R8_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8_SINT>
-    : ComponentTraits<SWR_TYPE_SINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8_UINT> - Format traits specialization for R8_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<A8_UNORM> - Format traits specialization for A8_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<A8_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<I8_UNORM> - Format traits specialization for I8_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<I8_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L8_UNORM> - Format traits specialization for L8_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L8_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8_SSCALED> - Format traits specialization for R8_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8_SSCALED>
-    : ComponentTraits<SWR_TYPE_SSCALED, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8_USCALED> - Format traits specialization for R8_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8_USCALED>
-    : ComponentTraits<SWR_TYPE_USCALED, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L8_UNORM_SRGB> - Format traits specialization for L8_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L8_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{true};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L8_UINT> - Format traits specialization for L8_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L8_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<L8_SINT> - Format traits specialization for L8_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<L8_SINT>
-    : ComponentTraits<SWR_TYPE_SINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<I8_UINT> - Format traits specialization for I8_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<I8_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<I8_SINT> - Format traits specialization for I8_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<I8_SINT>
-    : ComponentTraits<SWR_TYPE_SINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<DXT1_RGB_SRGB> - Format traits specialization for DXT1_RGB_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<DXT1_RGB_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<YCRCB_SWAPUVY> - Format traits specialization for YCRCB_SWAPUVY
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<YCRCB_SWAPUVY>
-    : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{true};
-    static const uint32_t bcWidth{2};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC1_UNORM> - Format traits specialization for BC1_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC1_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC2_UNORM> - Format traits specialization for BC2_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC2_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC3_UNORM> - Format traits specialization for BC3_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC3_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC4_UNORM> - Format traits specialization for BC4_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC4_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC5_UNORM> - Format traits specialization for BC5_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC5_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC1_UNORM_SRGB> - Format traits specialization for BC1_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC1_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{true};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC2_UNORM_SRGB> - Format traits specialization for BC2_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC2_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{true};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC3_UNORM_SRGB> - Format traits specialization for BC3_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC3_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{true};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<YCRCB_SWAPUV> - Format traits specialization for YCRCB_SWAPUV
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<YCRCB_SWAPUV>
-    : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{true};
-    static const uint32_t bcWidth{2};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8_8    TransposeT;
-    typedef Format4<8, 8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<DXT1_RGB> - Format traits specialization for DXT1_RGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<DXT1_RGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8_UNORM> - Format traits specialization for R8G8B8_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{24};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8   TransposeT;
-    typedef Format3<8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8_SNORM> - Format traits specialization for R8G8B8_SNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8_SNORM>
-    : ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{24};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8   TransposeT;
-    typedef Format3<8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8_SSCALED> - Format traits specialization for R8G8B8_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8_SSCALED>
-    : ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{24};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8   TransposeT;
-    typedef Format3<8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8_USCALED> - Format traits specialization for R8G8B8_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8_USCALED>
-    : ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{24};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8   TransposeT;
-    typedef Format3<8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R64G64B64A64_FLOAT> - Format traits specialization for R64G64B64A64_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R64G64B64A64_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT,
-                                                          64,
-                                                          SWR_TYPE_FLOAT,
-                                                          64,
-                                                          SWR_TYPE_FLOAT,
-                                                          64,
-                                                          SWR_TYPE_FLOAT,
-                                                          64>,
-                                          FormatSwizzle<0, 1, 2, 3>,
-                                          Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{256};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose64_64_64_64    TransposeT;
-    typedef Format4<64, 64, 64, 64> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R64G64B64_FLOAT> - Format traits specialization for R64G64B64_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R64G64B64_FLOAT>
-    : ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{192};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose64_64_64   TransposeT;
-    typedef Format3<64, 64, 64> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC4_SNORM> - Format traits specialization for BC4_SNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC4_SNORM>
-    : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{64};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC5_SNORM> - Format traits specialization for BC5_SNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC5_SNORM>
-    : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16_FLOAT> - Format traits specialization for R16G16B16_FLOAT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16_FLOAT>
-    : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{48};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16   TransposeT;
-    typedef Format3<16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16_UNORM> - Format traits specialization for R16G16B16_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{48};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16   TransposeT;
-    typedef Format3<16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16_SNORM> - Format traits specialization for R16G16B16_SNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16_SNORM>
-    : ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{48};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16   TransposeT;
-    typedef Format3<16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16_SSCALED> - Format traits specialization for R16G16B16_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16_SSCALED>
-    : ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{48};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16   TransposeT;
-    typedef Format3<16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16_USCALED> - Format traits specialization for R16G16B16_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16_USCALED>
-    : ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{48};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16   TransposeT;
-    typedef Format3<16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC6H_SF16> - Format traits specialization for BC6H_SF16
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC6H_SF16>
-    : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC7_UNORM> - Format traits specialization for BC7_UNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC7_UNORM>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC7_UNORM_SRGB> - Format traits specialization for BC7_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC7_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{true};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<BC6H_UF16> - Format traits specialization for BC6H_UF16
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<BC6H_UF16>
-    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{128};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{true};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{4};
-    static const uint32_t bcHeight{4};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8_UNORM_SRGB> - Format traits specialization for R8G8B8_UNORM_SRGB
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8_UNORM_SRGB>
-    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{24};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{true};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8   TransposeT;
-    typedef Format3<8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16_UINT> - Format traits specialization for R16G16B16_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{48};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16   TransposeT;
-    typedef Format3<16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R16G16B16_SINT> - Format traits specialization for R16G16B16_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R16G16B16_SINT>
-    : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{48};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose16_16_16   TransposeT;
-    typedef Format3<16, 16, 16> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32_SFIXED> - Format traits specialization for R32_SFIXED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R32_SFIXED>
-    : ComponentTraits<SWR_TYPE_SFIXED, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<32> TransposeT;
-    typedef Format1<32>                  FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R10G10B10A2_SNORM> - Format traits specialization for R10G10B10A2_SNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R10G10B10A2_SNORM> : ComponentTraits<SWR_TYPE_SNORM,
-                                                         10,
-                                                         SWR_TYPE_SNORM,
-                                                         10,
-                                                         SWR_TYPE_SNORM,
-                                                         10,
-                                                         SWR_TYPE_SNORM,
-                                                         2>,
-                                         FormatSwizzle<0, 1, 2, 3>,
-                                         Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R10G10B10A2_USCALED> - Format traits specialization for R10G10B10A2_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R10G10B10A2_USCALED> : ComponentTraits<SWR_TYPE_USCALED,
-                                                           10,
-                                                           SWR_TYPE_USCALED,
-                                                           10,
-                                                           SWR_TYPE_USCALED,
-                                                           10,
-                                                           SWR_TYPE_USCALED,
-                                                           2>,
-                                           FormatSwizzle<0, 1, 2, 3>,
-                                           Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R10G10B10A2_SSCALED> - Format traits specialization for R10G10B10A2_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R10G10B10A2_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED,
-                                                           10,
-                                                           SWR_TYPE_SSCALED,
-                                                           10,
-                                                           SWR_TYPE_SSCALED,
-                                                           10,
-                                                           SWR_TYPE_SSCALED,
-                                                           2>,
-                                           FormatSwizzle<0, 1, 2, 3>,
-                                           Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R10G10B10A2_SINT> - Format traits specialization for R10G10B10A2_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R10G10B10A2_SINT>
-    : ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>,
-      FormatSwizzle<0, 1, 2, 3>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B10G10R10A2_SNORM> - Format traits specialization for B10G10R10A2_SNORM
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B10G10R10A2_SNORM> : ComponentTraits<SWR_TYPE_SNORM,
-                                                         10,
-                                                         SWR_TYPE_SNORM,
-                                                         10,
-                                                         SWR_TYPE_SNORM,
-                                                         10,
-                                                         SWR_TYPE_SNORM,
-                                                         2>,
-                                         FormatSwizzle<2, 1, 0, 3>,
-                                         Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B10G10R10A2_USCALED> - Format traits specialization for B10G10R10A2_USCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B10G10R10A2_USCALED> : ComponentTraits<SWR_TYPE_USCALED,
-                                                           10,
-                                                           SWR_TYPE_USCALED,
-                                                           10,
-                                                           SWR_TYPE_USCALED,
-                                                           10,
-                                                           SWR_TYPE_USCALED,
-                                                           2>,
-                                           FormatSwizzle<2, 1, 0, 3>,
-                                           Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B10G10R10A2_SSCALED> - Format traits specialization for B10G10R10A2_SSCALED
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B10G10R10A2_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED,
-                                                           10,
-                                                           SWR_TYPE_SSCALED,
-                                                           10,
-                                                           SWR_TYPE_SSCALED,
-                                                           10,
-                                                           SWR_TYPE_SSCALED,
-                                                           2>,
-                                           FormatSwizzle<2, 1, 0, 3>,
-                                           Defaults<0, 0, 0, 0x3f800000>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B10G10R10A2_UINT> - Format traits specialization for B10G10R10A2_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B10G10R10A2_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>,
-      FormatSwizzle<2, 1, 0, 3>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<B10G10R10A2_SINT> - Format traits specialization for B10G10R10A2_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<B10G10R10A2_SINT>
-    : ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>,
-      FormatSwizzle<2, 1, 0, 3>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{32};
-    static const uint32_t numComps{4};
-    static const bool     hasAlpha{true};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose10_10_10_2    TransposeT;
-    typedef Format4<10, 10, 10, 2> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8_UINT> - Format traits specialization for R8G8B8_UINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8_UINT>
-    : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{24};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8   TransposeT;
-    typedef Format3<8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R8G8B8_SINT> - Format traits specialization for R8G8B8_SINT
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<R8G8B8_SINT>
-    : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
-      FormatSwizzle<0, 1, 2>,
-      Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{24};
-    static const uint32_t numComps{3};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{0};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef Transpose8_8_8   TransposeT;
-    typedef Format3<8, 8, 8> FormatT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatTraits<RAW> - Format traits specialization for RAW
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct FormatTraits<RAW>
-    : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
-{
-    static const uint32_t bpp{8};
-    static const uint32_t numComps{1};
-    static const bool     hasAlpha{false};
-    static const uint32_t alphaComp{3};
-    static const bool     isSRGB{false};
-    static const bool     isBC{false};
-    static const bool     isSubsampled{false};
-    static const uint32_t bcWidth{1};
-    static const uint32_t bcHeight{1};
-
-    typedef TransposeSingleComponent<8> TransposeT;
-    typedef Format1<8>                  FormatT;
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
deleted file mode 100644
index 7d7dd843349..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/format_types.h
+++ /dev/null
@@ -1,1629 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file formats.h
- *
- * @brief Definitions for SWR_FORMAT functions.
- *
- ******************************************************************************/
-#pragma once
-
-#include "utils.h"
-#include "common/simdintrin.h"
-
-//////////////////////////////////////////////////////////////////////////
-/// PackTraits - Helpers for packing / unpacking same pixel sizes
-//////////////////////////////////////////////////////////////////////////
-template <uint32_t NumBits, bool Signed = false>
-struct PackTraits
-{
-    static const uint32_t MyNumBits = NumBits;
-
-    static simdscalar     loadSOA(const uint8_t* pSrc)                   = delete;
-    static void           storeSOA(uint8_t* pDst, simdscalar const& src) = delete;
-    static simdscalar     unpack(simdscalar& in)                         = delete;
-    static simdscalar     pack(simdscalar& in)                           = delete;
-
-    static simd16scalar  loadSOA_16(const uint8_t* pSrc)                  = delete;
-    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) = delete;
-    static simd16scalar  unpack(simd16scalar& in)                         = delete;
-    static simd16scalar  pack(simd16scalar& in)                           = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// PackTraits - Helpers for packing / unpacking unused channels
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct PackTraits<0, false>
-{
-    static const uint32_t MyNumBits = 0;
-
-    static simdscalar loadSOA(const uint8_t* pSrc) { return _simd_setzero_ps(); }
-    static void       storeSOA(uint8_t* pDst, simdscalar const& src) { return; }
-    static simdscalar unpack(simdscalar& in) { return _simd_setzero_ps(); }
-    static simdscalar pack(simdscalar& in) { return _simd_setzero_ps(); }
-
-    static simd16scalar  loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); }
-    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { return; }
-    static simd16scalar  unpack(simd16scalar& in) { return _simd16_setzero_ps(); }
-    static simd16scalar  pack(simd16scalar& in) { return _simd16_setzero_ps(); }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct PackTraits<8, false>
-{
-    static const uint32_t MyNumBits = 8;
-
-    static simdscalar loadSOA(const uint8_t* pSrc)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        __m256 result = _mm256_setzero_ps();
-        __m128 vLo    = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
-        return _mm256_insertf128_ps(result, vLo, 0);
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static void storeSOA(uint8_t* pDst, simdscalar const& src)
-    {
-        // store simd bytes
-#if KNOB_SIMD_WIDTH == 8
-        _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static simdscalar unpack(simdscalar& in)
-    {
-#if KNOB_SIMD_WIDTH == 8
-#if KNOB_ARCH <= KNOB_ARCH_AVX
-        __m128i src   = _mm_castps_si128(_mm256_castps256_ps128(in));
-        __m128i resLo = _mm_cvtepu8_epi32(src);
-        __m128i resHi =
-            _mm_shuffle_epi8(src, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
-
-        __m256i result = _mm256_castsi128_si256(resLo);
-        result         = _mm256_insertf128_si256(result, resHi, 1);
-        return simdscalar{_mm256_castsi256_ps(result)};
-#else
-        return _mm256_castsi256_ps(
-            _mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
-#endif
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static simdscalar pack(simdscalar& in)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalari src = _simd_castps_si(in);
-        __m128i     res16 =
-            _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
-        __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
-        return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static simd16scalar loadSOA_16(const uint8_t* pSrc)
-    {
-        simd16scalar result   = _simd16_setzero_ps();
-        simdscalar   resultlo = _simd_setzero_ps();
-
-        const __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc));
-
-        resultlo = _mm256_insertf128_ps(resultlo, src, 0);
-        result   = _simd16_insert_ps(result, resultlo, 0);
-
-        return result;
-    }
-
-    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
-    {
-        // store simd16 bytes
-        _mm_store_ps(reinterpret_cast<float*>(pDst),
-                     _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
-    }
-
-    static simd16scalar unpack(simd16scalar& in)
-    {
-        simd4scalari  tmp    = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
-        simd16scalari result = _simd16_cvtepu8_epi32(tmp);
-
-        return _simd16_castsi_ps(result);
-    }
-
-    static simd16scalar pack(simd16scalar& in)
-    {
-        // clang-format off
-
-        simd16scalari result = _simd16_setzero_si();
-
-        simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));  // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
-        simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));  // r8 r9 rA rB rC rD rE rF
-
-        simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);   // r0 r1 r2 r3 r8 r9 rA rB (32b)
-        simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);   // r4 r5 r6 r7 rC rD rE rF (32b)
-
-        simdscalari pack = _simd_packus_epi32(permlo, permhi);          // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
-
-        const simdscalari zero = _simd_setzero_si();
-
-        permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0)     // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
-        permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1)     // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
-
-        pack = _simd_packus_epi16(permlo, permhi);                      // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
-
-        result = _simd16_insert_si(result, pack, 0);
-
-        return _simd16_castsi_ps(result);
-
-        // clang-format on
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// PackTraits - Helpers for packing / unpacking 8 bit signed channels
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct PackTraits<8, true>
-{
-    static const uint32_t MyNumBits = 8;
-
-    static simdscalar loadSOA(const uint8_t* pSrc)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        __m256 result = _mm256_setzero_ps();
-        __m128 vLo    = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
-        return _mm256_insertf128_ps(result, vLo, 0);
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static void storeSOA(uint8_t* pDst, simdscalar const& src)
-    {
-        // store simd bytes
-#if KNOB_SIMD_WIDTH == 8
-        _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static simdscalar unpack(simdscalar& in)
-    {
-#if KNOB_SIMD_WIDTH == 8
-#if KNOB_ARCH <= KNOB_ARCH_AVX
-        SWR_INVALID("I think this may be incorrect.");
-        __m128i src   = _mm_castps_si128(_mm256_castps256_ps128(in));
-        __m128i resLo = _mm_cvtepi8_epi32(src);
-        __m128i resHi =
-            _mm_shuffle_epi8(src, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
-
-        __m256i result = _mm256_castsi128_si256(resLo);
-        result         = _mm256_insertf128_si256(result, resHi, 1);
-        return _mm256_castsi256_ps(result);
-#else
-        return _mm256_castsi256_ps(
-            _mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
-#endif
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static simdscalar pack(simdscalar& in)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalari src = _simd_castps_si(in);
-        __m128i     res16 =
-            _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
-        __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
-        return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static simd16scalar loadSOA_16(const uint8_t* pSrc)
-    {
-        simd16scalar result   = _simd16_setzero_ps();
-        simdscalar   resultlo = _simd_setzero_ps();
-
-        const __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc));
-
-        resultlo = _mm256_insertf128_ps(resultlo, src, 0);
-        result   = _simd16_insert_ps(result, resultlo, 0);
-
-        return result;
-    }
-
-    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
-    {
-        // store simd16 bytes
-        _mm_store_ps(reinterpret_cast<float*>(pDst),
-                     _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
-    }
-
-    static simd16scalar unpack(simd16scalar& in)
-    {
-        simd4scalari  tmp    = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
-        simd16scalari result = _simd16_cvtepu8_epi32(tmp);
-
-        return _simd16_castsi_ps(result);
-    }
-
-    static simd16scalar pack(simd16scalar& in)
-    {
-        // clang-format off
-
-        simd16scalari result = _simd16_setzero_si();
-
-        simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));  // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
-        simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));  // r8 r9 rA rB rC rD rE rF
-
-        simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);   // r0 r1 r2 r3 r8 r9 rA rB (32b)
-        simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);   // r4 r5 r6 r7 rC rD rE rF (32b)
-
-        simdscalari pack = _simd_packs_epi32(permlo, permhi);           // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
-
-        const simdscalari zero = _simd_setzero_si();
-
-        permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0)     // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
-        permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1)     // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
-
-        pack = _simd_packs_epi16(permlo, permhi);                       // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
-
-        result = _simd16_insert_si(result, pack, 0);
-
-        return _simd16_castsi_ps(result);
-
-        // clang-format on
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct PackTraits<16, false>
-{
-    static const uint32_t MyNumBits = 16;
-
-    static simdscalar loadSOA(const uint8_t* pSrc)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        __m256 result = _mm256_setzero_ps();
-        __m128 vLo    = _mm_load_ps((const float*)pSrc);
-        return _mm256_insertf128_ps(result, vLo, 0);
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static void storeSOA(uint8_t* pDst, simdscalar const& src)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        // store 16B (2B * 8)
-        _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static simdscalar unpack(simdscalar& in)
-    {
-#if KNOB_SIMD_WIDTH == 8
-#if KNOB_ARCH <= KNOB_ARCH_AVX
-        __m128i src   = _mm_castps_si128(_mm256_castps256_ps128(in));
-        __m128i resLo = _mm_cvtepu16_epi32(src);
-        __m128i resHi =
-            _mm_shuffle_epi8(src, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
-
-        __m256i result = _mm256_castsi128_si256(resLo);
-        result         = _mm256_insertf128_si256(result, resHi, 1);
-        return _mm256_castsi256_ps(result);
-#else
-        return _mm256_castsi256_ps(
-            _mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
-#endif
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static simdscalar pack(simdscalar& in)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalari src = _simd_castps_si(in);
-        __m256i     res = _mm256_castsi128_si256(
-            _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
-        return _mm256_castsi256_ps(res);
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static simd16scalar loadSOA_16(const uint8_t* pSrc)
-    {
-        simd16scalar result = _simd16_setzero_ps();
-
-        simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
-
-        result = _simd16_insert_ps(result, resultlo, 0);
-
-        return result;
-    }
-
-    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
-    {
-        _simd_store_ps(reinterpret_cast<float*>(pDst), _simd16_extract_ps(src, 0));
-    }
-
-    static simd16scalar unpack(simd16scalar& in)
-    {
-        simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
-
-        return _simd16_castsi_ps(result);
-    }
-
-    static simd16scalar pack(simd16scalar& in)
-    {
-        // clang-format off
-
-        const simd16scalari zero = _simd16_setzero_si();
-
-        simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
-        simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
-
-        simd16scalari result = _simd16_packus_epi32(permlo, permhi);                        // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
-
-        return _simd16_castsi_ps(result);
-
-        // clang-format on
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// PackTraits - Helpers for packing / unpacking 16 bit signed channels
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct PackTraits<16, true>
-{
-    static const uint32_t MyNumBits = 16;
-
-    static simdscalar loadSOA(const uint8_t* pSrc)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        __m256 result = _mm256_setzero_ps();
-        __m128 vLo    = _mm_load_ps((const float*)pSrc);
-        return _mm256_insertf128_ps(result, vLo, 0);
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static void storeSOA(uint8_t* pDst, simdscalar const& src)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        // store 16B (2B * 8)
-        _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static simdscalar unpack(simdscalar& in)
-    {
-#if KNOB_SIMD_WIDTH == 8
-#if KNOB_ARCH <= KNOB_ARCH_AVX
-        SWR_INVALID("I think this may be incorrect.");
-        __m128i src   = _mm_castps_si128(_mm256_castps256_ps128(in));
-        __m128i resLo = _mm_cvtepi16_epi32(src);
-        __m128i resHi =
-            _mm_shuffle_epi8(src, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
-
-        __m256i result = _mm256_castsi128_si256(resLo);
-        result         = _mm256_insertf128_si256(result, resHi, 1);
-        return _mm256_castsi256_ps(result);
-#else
-        return _mm256_castsi256_ps(
-            _mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
-#endif
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static simdscalar pack(simdscalar& in)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalari src = _simd_castps_si(in);
-        __m256i     res = _mm256_castsi128_si256(
-            _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
-        return _mm256_castsi256_ps(res);
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    static simd16scalar loadSOA_16(const uint8_t* pSrc)
-    {
-        simd16scalar result = _simd16_setzero_ps();
-
-        simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
-
-        result = _simd16_insert_ps(result, resultlo, 0);
-
-        return result;
-    }
-
-    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
-    {
-        _simd_store_ps(reinterpret_cast<float*>(pDst), _simd16_extract_ps(src, 0));
-    }
-
-    static simd16scalar unpack(simd16scalar& in)
-    {
-        simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
-
-        return _simd16_castsi_ps(result);
-    }
-
-    static simd16scalar pack(simd16scalar& in)
-    {
-        // clang-format off
-
-        const simd16scalari zero = _simd16_setzero_si();
-
-        simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
-        simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
-
-        simd16scalari result = _simd16_packs_epi32(permlo, permhi);                         // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
-
-        return _simd16_castsi_ps(result);
-
-        // clang-format on
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// PackTraits - Helpers for packing / unpacking 32 bit channels
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct PackTraits<32, false>
-{
-    static const uint32_t MyNumBits = 32;
-
-    static simdscalar loadSOA(const uint8_t* pSrc) { return _simd_load_ps((const float*)pSrc); }
-    static void       storeSOA(uint8_t* pDst, simdscalar const& src)
-    {
-        _simd_store_ps((float*)pDst, src);
-    }
-    static simdscalar unpack(simdscalar& in) { return in; }
-    static simdscalar pack(simdscalar& in) { return in; }
-
-    static simd16scalar loadSOA_16(const uint8_t* pSrc)
-    {
-        return _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
-    }
-
-    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
-    {
-        _simd16_store_ps(reinterpret_cast<float*>(pDst), src);
-    }
-
-    static simd16scalar unpack(simd16scalar& in) { return in; }
-
-    static simd16scalar pack(simd16scalar& in) { return in; }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits.
-//////////////////////////////////////////////////////////////////////////
-template <SWR_TYPE type, uint32_t NumBits>
-struct TypeTraits : PackTraits<NumBits>
-{
-    static const SWR_TYPE MyType = type;
-    static float          toFloat() { return 0.0; }
-    static float          fromFloat()
-    {
-        SWR_NOT_IMPL;
-        return 0.0;
-    }
-    static simdscalar convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for UINT8
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
-{
-    // 8-bit SWR_TYPE_UINT channel; derives from PackTraits<8>.
-    // toFloat() returns 0; fromFloat() and convertSrgb() invoke SWR_NOT_IMPL and return zero.
-    static const SWR_TYPE MyType = SWR_TYPE_UINT;
-    static float          toFloat() { return 0.0; }
-    static float          fromFloat()
-    {
-        SWR_NOT_IMPL;
-        return 0.0;
-    }
-    static simdscalar convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for SINT8
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
-{
-    // 8-bit SWR_TYPE_SINT channel; derives from PackTraits<8, true>.
-    // toFloat() returns 0; fromFloat() and convertSrgb() invoke SWR_NOT_IMPL and return zero.
-    static const SWR_TYPE MyType = SWR_TYPE_SINT;
-    static float          toFloat() { return 0.0; }
-    static float          fromFloat()
-    {
-        SWR_NOT_IMPL;
-        return 0.0;
-    }
-    static simdscalar convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for UINT16
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
-{
-    // 16-bit SWR_TYPE_UINT channel; derives from PackTraits<16>.
-    // toFloat() returns 0; fromFloat() and convertSrgb() invoke SWR_NOT_IMPL and return zero.
-    static const SWR_TYPE MyType = SWR_TYPE_UINT;
-    static float          toFloat() { return 0.0; }
-    static float          fromFloat()
-    {
-        SWR_NOT_IMPL;
-        return 0.0;
-    }
-    static simdscalar convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for SINT16
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
-{
-    // 16-bit SWR_TYPE_SINT channel; derives from PackTraits<16, true>.
-    // toFloat() returns 0; fromFloat() and convertSrgb() invoke SWR_NOT_IMPL and return zero.
-    static const SWR_TYPE MyType = SWR_TYPE_SINT;
-    static float          toFloat() { return 0.0; }
-    static float          fromFloat()
-    {
-        SWR_NOT_IMPL;
-        return 0.0;
-    }
-    static simdscalar convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for UINT32
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
-{
-    // 32-bit SWR_TYPE_UINT channel; derives from PackTraits<32>.
-    // toFloat() returns 0; fromFloat() and convertSrgb() invoke SWR_NOT_IMPL and return zero.
-    static const SWR_TYPE MyType = SWR_TYPE_UINT;
-    static float          toFloat() { return 0.0; }
-    static float          fromFloat()
-    {
-        SWR_NOT_IMPL;
-        return 0.0;
-    }
-    static simdscalar convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for SINT32
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
-{
-    // 32-bit SWR_TYPE_SINT channel; derives from the same PackTraits<32> base as the
-    // 32-bit UINT specialization.
-    // toFloat() returns 0; fromFloat() and convertSrgb() invoke SWR_NOT_IMPL and return zero.
-    static const SWR_TYPE MyType = SWR_TYPE_SINT;
-    static float          toFloat() { return 0.0; }
-    static float          fromFloat()
-    {
-        SWR_NOT_IMPL;
-        return 0.0;
-    }
-    static simdscalar convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for UNORM5
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
-{
-    // 5-bit SWR_TYPE_UNORM channel; derives from PackTraits<5>.
-    // fromFloat() returns 31 (= 2^5 - 1) and toFloat() returns its reciprocal; presumably
-    // these are the multipliers between the integer range and [0, 1] - confirm at call sites.
-    // convertSrgb() invokes SWR_NOT_IMPL and returns zero.
-    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float          toFloat() { return 1.0f / 31.0f; }
-    static float          fromFloat() { return 31.0f; }
-    static simdscalar     convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for UNORM6
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
-{
-    // 6-bit SWR_TYPE_UNORM channel; derives from PackTraits<6>.
-    // fromFloat() returns 63 (= 2^6 - 1) and toFloat() returns its reciprocal; presumably
-    // these are the multipliers between the integer range and [0, 1] - confirm at call sites.
-    // convertSrgb() invokes SWR_NOT_IMPL and returns zero.
-    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float          toFloat() { return 1.0f / 63.0f; }
-    static float          fromFloat() { return 63.0f; }
-    static simdscalar     convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for UNORM8
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
-{
-    // 8-bit SWR_TYPE_UNORM channel; derives from PackTraits<8>.
-    // fromFloat() returns 255 (= 2^8 - 1) and toFloat() returns its reciprocal; presumably
-    // these are the multipliers between the integer range and [0, 1] - confirm at call sites.
-    // convertSrgb() invokes SWR_NOT_IMPL and returns zero.
-    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float          toFloat() { return 1.0f / 255.0f; }
-    static float          fromFloat() { return 255.0f; }
-    static simdscalar     convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for SNORM8
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
-{
-    // 8-bit SWR_TYPE_SNORM channel; derives from PackTraits<8, true>.
-    // fromFloat() returns 127 (= 2^7 - 1) and toFloat() returns its reciprocal; presumably
-    // these are the multipliers between the integer range and [-1, 1] - confirm at call sites.
-    // convertSrgb() invokes SWR_NOT_IMPL and returns zero.
-    static const SWR_TYPE MyType = SWR_TYPE_SNORM;
-    static float          toFloat() { return 1.0f / 127.0f; }
-    static float          fromFloat() { return 127.0f; }
-    static simdscalar     convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for UNORM16
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
-{
-    // 16-bit SWR_TYPE_UNORM channel; derives from PackTraits<16>.
-    // fromFloat() returns 65535 (= 2^16 - 1) and toFloat() returns its reciprocal; presumably
-    // these are the multipliers between the integer range and [0, 1] - confirm at call sites.
-    // convertSrgb() invokes SWR_NOT_IMPL and returns zero.
-    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float          toFloat() { return 1.0f / 65535.0f; }
-    static float          fromFloat() { return 65535.0f; }
-    static simdscalar     convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for SNORM16
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
-{
-    // 16-bit SWR_TYPE_SNORM channel; derives from PackTraits<16, true>.
-    // fromFloat() returns 32767 (= 2^15 - 1) and toFloat() returns its reciprocal; presumably
-    // these are the multipliers between the integer range and [-1, 1] - confirm at call sites.
-    // convertSrgb() invokes SWR_NOT_IMPL and returns zero.
-    //
-    // MyType must name the type this specialization is keyed on (SNORM), matching the
-    // 8-bit SNORM specialization; it previously reported SWR_TYPE_UNORM.
-    static const SWR_TYPE MyType = SWR_TYPE_SNORM;
-    static float          toFloat() { return 1.0f / 32767.0f; }
-    static float          fromFloat() { return 32767.0f; }
-    static simdscalar     convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for UNORM24
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_UNORM, 24> : PackTraits<32>
-{
-    // 24-bit SWR_TYPE_UNORM channel. Note that it derives from PackTraits<32>, not a
-    // 24-bit PackTraits.
-    // fromFloat() returns 16777215 (= 2^24 - 1) and toFloat() returns its reciprocal.
-    // convertSrgb() invokes SWR_NOT_IMPL and returns zero.
-    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float          toFloat() { return 1.0f / 16777215.0f; }
-    static float          fromFloat() { return 16777215.0f; }
-    static simdscalar     convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-// FLOAT Specializations from here on...
-//////////////////////////////////////////////////////////////////////////
-// Reinterpretation casts between the float (__m128) and integer (__m128i) 128-bit vector
-// types; they wrap the SSE cast intrinsics of the same direction.
-#define TO_M128i(a) _mm_castps_si128(a)
-#define TO_M128(a) _mm_castsi128_ps(a)
-
-// powf() / exp2() are used by the pow helpers that follow.
-#include "math.h"
-
-// Fast approximate power for four packed floats, with the exponent expnum/expden given as
-// template parameters. The value is scaled by a constant, its bit pattern is converted from
-// integer to float (treated as a logarithm), multiplied by expnum/expden, and converted back
-// to an integer bit pattern that is reinterpreted as the float result.
-// coeffnum/coeffden form a rational coefficient that is folded into the pre-correction factor.
-template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden>
-inline static __m128 fastpow(__m128 arg)
-{
-    __m128 ret = arg;
-
-    // Function-local static: one factor per template instantiation, initialized on first use.
-    static const __m128 factor =
-        _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f) *
-                    powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
-
-    // Apply a constant pre-correction factor.
-    ret = _mm_mul_ps(ret, factor);
-
-    // Reinterpret arg as integer to obtain logarithm.
-    // asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
-    ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
-
-    // Multiply logarithm by power.
-    ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden));
-
-    // Convert back to "integer" to exponentiate.
-    // asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
-    ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
-
-    return ret;
-}
-
-// Approximates arg^(5/12) for four packed floats (5/12 == 1/2.4, the sRGB encode exponent).
-// Two estimates built from fastpow<2, 3> are summed with a constant weight, and the result
-// is passed twice through x * rsqrt(x) (an approximate square root), i.e. a 4th root.
-inline static __m128 pow512_4(__m128 arg)
-{
-    // 5/12 is too small, so compute the 4th root of 20/12 instead.
-    // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
-    // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
-    __m128 xf    = fastpow<2, 3, int(0.629960524947437 * 1e9), int(1e9)>(arg);
-    // First estimate: arg * xf (the "1 + 2/3" form from the note above).
-    __m128 xover = _mm_mul_ps(arg, xf);
-
-    // Second estimate: arg^2 * rsqrt(xf) (the "2 - 1/3" form from the note above).
-    __m128 xfm1   = _mm_rsqrt_ps(xf);
-    __m128 x2     = _mm_mul_ps(arg, arg);
-    __m128 xunder = _mm_mul_ps(x2, xfm1);
-
-    // sqrt2 * over + 2 * sqrt2 * under
-    __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
-                             _mm_add_ps(xover, xunder));
-
-    // Each line is x * rsqrt(x), an approximate sqrt(x); applied twice it yields the 4th root.
-    xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
-    xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
-    return xavg;
-}
-
-// Exact (non-approximated) per-lane power: raises each of the four floats in Base to Exp
-// with the C library powf() and reassembles the results in the same lane order.
-inline static __m128 powf_wrapper(__m128 Base, float Exp)
-{
-    // View the by-value vector argument as four scalar lanes.
-    float* pLanes = (float*)(&Base);
-
-    const float lane0 = powf(pLanes[0], Exp);
-    const float lane1 = powf(pLanes[1], Exp);
-    const float lane2 = powf(pLanes[2], Exp);
-    const float lane3 = powf(pLanes[3], Exp);
-
-    // _mm_set_ps takes its arguments from the highest lane down to the lowest.
-    return _mm_set_ps(lane3, lane2, lane1, lane0);
-}
-
-// Converts four packed linear floats to sRGB encoding:
-//   lanes at or below 0.0031308 -> Src * 12.92
-//   other lanes                 -> 1.055 * Src^(1/2.4) - 0.055
-// The power is computed with pow512_4() when KNOB_USE_FAST_SRGB == TRUE, otherwise with
-// powf_wrapper(). Src is taken by non-const reference but is only read here.
-static inline __m128 ConvertFloatToSRGB2(__m128& Src)
-{
-    // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float
-    // value
-    // (_mm_cmpnlt_ps(threshold, Src) is NOT(threshold < Src), so NaN lanes also set the mask.)
-    __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
-
-    // squeeze the mask down to 16 bits (4 bits per DWORD)
-    int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
-
-    __m128 Result;
-
-    // Choose a path depending on whether all, none, or only some lanes hit the linear segment.
-    if (CompareResult == 0xFFFF)
-    {
-        // all DWORDs are <= the threshold
-        Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
-    }
-    else if (CompareResult == 0x0)
-    {
-        // all DWORDs are > the threshold
-        __m128 fSrc_0RGB = Src;
-
-        // --> 1.055f * c^(1.0f/2.4f) - 0.055f
-#if KNOB_USE_FAST_SRGB == TRUE
-        // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
-        __m128 f = pow512_4(fSrc_0RGB);
-#else
-        __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
-#endif
-        f      = _mm_mul_ps(f, _mm_set1_ps(1.055f));
-        Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
-    }
-    else
-    {
-        // some DWORDs are <= the threshold and some are > threshold
-        // Compute both forms for every lane, then select per lane with the compare mask.
-        __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
-
-        __m128 fSrc_0RGB = Src;
-
-        // --> 1.055f * c^(1.0f/2.4f) - 0.055f
-#if KNOB_USE_FAST_SRGB == TRUE
-        // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
-        __m128 f = pow512_4(fSrc_0RGB);
-#else
-        __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
-#endif
-        f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
-        f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
-
-        // Clear the alpha (is garbage after the sub)
-        // NOTE(review): the mask below is all ones in every lane, so this AND leaves f
-        // unchanged and nothing is actually cleared.
-        __m128i i = _mm_and_si128(TO_M128i(f),
-                                  _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
-
-        // Bitwise select: linear result where the mask is set, power-curve result elsewhere.
-        __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
-        __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
-        __m128i CombinedParts    = _mm_or_si128(LessThanPart, GreaterEqualPart);
-
-        Result = TO_M128(CombinedParts);
-    }
-
-    return Result;
-}
-
-// 16-wide counterpart of the __m128 fastpow: fast approximate power with the exponent
-// expnum/expden given as template parameters. The value is scaled by a constant, its bit
-// pattern is converted from integer to float (treated as a logarithm), multiplied by
-// expnum/expden, and converted back to an integer bit pattern reinterpreted as float.
-// coeffnum/coeffden form a rational coefficient that is folded into the pre-correction factor.
-template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden>
-inline static simd16scalar SIMDCALL fastpow(simd16scalar const& value)
-{
-    // Function-local static scalar: one factor per template instantiation; it is broadcast
-    // to all lanes on every call.
-    static const float factor1 = exp2(127.0f * expden / expnum - 127.0f) *
-                                 powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
-
-    // Apply a constant pre-correction factor.
-    simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1));
-
-    // Reinterpret arg as integer to obtain logarithm.
-    // asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
-    result = _simd16_cvtepi32_ps(_simd16_castps_si(result));
-
-    // Multiply logarithm by power.
-    result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden));
-
-    // Convert back to "integer" to exponentiate.
-    // asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
-    result = _simd16_castsi_ps(_simd16_cvtps_epi32(result));
-
-    return result;
-}
-
-// 16-wide counterpart of the __m128 pow512_4: approximates arg^(5/12) (5/12 == 1/2.4, the
-// sRGB encode exponent). Two estimates built from fastpow<2, 3> are summed with a constant
-// weight, and the result is passed twice through x * rsqrt(x), i.e. a 4th root.
-inline static simd16scalar SIMDCALL pow512_4(simd16scalar const& arg)
-{
-    // 5/12 is too small, so compute the 4th root of 20/12 instead.
-    // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
-    // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
-    simd16scalar xf    = fastpow<2, 3, int(0.629960524947437 * 1e9), int(1e9)>(arg);
-    // First estimate: arg * xf (the "1 + 2/3" form from the note above).
-    simd16scalar xover = _simd16_mul_ps(arg, xf);
-
-    // Second estimate: arg^2 * rsqrt(xf) (the "2 - 1/3" form from the note above).
-    simd16scalar xfm1   = _simd16_rsqrt_ps(xf);
-    simd16scalar x2     = _simd16_mul_ps(arg, arg);
-    simd16scalar xunder = _simd16_mul_ps(x2, xfm1);
-
-    // sqrt2 * over + 2 * sqrt2 * under
-    simd16scalar xavg =
-        _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
-                       _simd16_add_ps(xover, xunder));
-
-    // Each line is x * rsqrt(x), an approximate sqrt(x); applied twice it yields the 4th root.
-    xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
-    xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
-
-    return xavg;
-}
-
-// Exact (non-approximated) per-lane power for 16 packed floats: raises every lane of base
-// to exp with the C library powf() and reassembles the results in the same lane order.
-inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar& base, float exp)
-{
-    // View the vector as sixteen scalar lanes.
-    const float* pLanes = reinterpret_cast<const float*>(&base);
-
-    float raised[16];
-    for (uint32_t lane = 0; lane < 16; ++lane)
-    {
-        raised[lane] = powf(pLanes[lane], exp);
-    }
-
-    // _simd16_set_ps is given the lanes from index 15 down to index 0.
-    return _simd16_set_ps(raised[15],
-                          raised[14],
-                          raised[13],
-                          raised[12],
-                          raised[11],
-                          raised[10],
-                          raised[9],
-                          raised[8],
-                          raised[7],
-                          raised[6],
-                          raised[5],
-                          raised[4],
-                          raised[3],
-                          raised[2],
-                          raised[1],
-                          raised[0]);
-}
-
-// float to SRGB conversion formula
-//
-// if (value < 0.0031308f)
-//     value *= 12.92f;
-// else
-//     value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
-//
-// 16-wide float to sRGB conversion. The linear form (value * 12.92) is always computed; the
-// power-curve form is computed only when at least one lane is not below the threshold, and
-// the two are then merged per lane. This overload tests value < 0.0031308 (strictly less).
-static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar& value)
-{
-    // create a mask where the source is < the minimal SRGB float value
-    const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f));
-
-    // if all elements are < the threshold, result = value * 12.92
-    simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(12.92f));
-
-    // 0xFFFF means all 16 mask bits are set, i.e. every lane is below the threshold.
-    if (_simd16_mask2int(mask) != 0xFFFF)
-    {
-        // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055
-#if KNOB_USE_FAST_SRGB == TRUE
-        // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
-        simd16scalar result2 = pow512_4(value);
-#else
-        simd16scalar result2 = powf_wrapper(value, 1.0f / 2.4f);
-#endif
-
-        result2 = _simd16_mul_ps(result2, _simd16_set1_ps(1.055f));
-        result2 = _simd16_sub_ps(result2, _simd16_set1_ps(0.055f));
-
-#if (KNOB_ARCH == KNOB_ARCH_AVX512)
-        // only native AVX512 can directly use the computed mask for the blend operation
-        // (_mm512_mask_blend_ps takes the second source, here `result`, where the mask bit is set)
-        result = _mm512_mask_blend_ps(mask, result2, result);
-#else
-        // The comparison is recomputed as a vector mask for the blend.
-        // NOTE(review): presumably _simd16_blendv_ps selects `result` where the compare is
-        // true, matching the AVX512 path - confirm against the wrapper's definition.
-        result = _simd16_blendv_ps(
-            result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f)));
-#endif
-    }
-
-    return result;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for FLOAT16
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
-{
-    // 16-bit SWR_TYPE_FLOAT channel; derives from PackTraits<16> and declares its own
-    // pack()/unpack() for float32 <-> float16. toFloat()/fromFloat() both return 1.0;
-    // convertSrgb() and both unpack() overloads invoke SWR_NOT_IMPL and return zero.
-    static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
-    static float          toFloat() { return 1.0f; }
-    static float          fromFloat() { return 1.0f; }
-    static simdscalar     convertSrgb(simdscalar& in)
-    {
-        SWR_NOT_IMPL;
-        return _simd_setzero_ps();
-    }
-
-    // Converts 8 packed float32 values to 8 float16 values. On KNOB_ARCH_AVX the conversion
-    // is done with integer bit manipulation (mantissa truncated, small exponents flushed to
-    // zero or turned into subnormals, too-large finite values clamped to 0x7BFF); on other
-    // architectures it uses _mm256_cvtps_ph with _MM_FROUND_TRUNC. Only KNOB_SIMD_WIDTH == 8
-    // is supported; any other width is a compile error.
-    static simdscalar pack(const simdscalar& in)
-    {
-#if KNOB_SIMD_WIDTH == 8
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
-        // input is 8 packed float32, output is 8 packed float16
-        simdscalari src = _simd_castps_si(in);
-
-        static const uint32_t FLOAT_EXP_BITS      = 8;
-        static const uint32_t FLOAT_MANTISSA_BITS = 23;
-        static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
-        static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
-
-        static const uint32_t HALF_EXP_BITS      = 5;
-        static const uint32_t HALF_MANTISSA_BITS = 10;
-        static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
-
-        // minimum exponent required, exponents below this are flushed to 0.
-        static const int32_t HALF_EXP_MIN   = -14;
-        static const int32_t FLOAT_EXP_BIAS = 127;
-        static const int32_t FLOAT_EXP_MIN  = HALF_EXP_MIN + FLOAT_EXP_BIAS;
-        static const int32_t FLOAT_EXP_MIN_FTZ =
-            FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
-
-        // maximum exponent required, exponents above this are set to infinity
-        static const int32_t HALF_EXP_MAX  = 15;
-        static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
-
-        // Broadcast constants: field masks and the exponent limits moved into the float32
-        // exponent field position.
-        const simdscalari vSignMask = _simd_set1_epi32(0x80000000);
-        const simdscalari vExpMask  = _simd_set1_epi32(FLOAT_EXP_MASK);
-        const simdscalari vManMask  = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
-        const simdscalari vExpMin =
-            _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
-        const simdscalari vExpMinFtz =
-            _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
-        const simdscalari vExpMax =
-            _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
-
-        // Split each input into its sign, exponent and mantissa fields (left in place).
-        simdscalari vSign = _simd_and_si(src, vSignMask);
-        simdscalari vExp  = _simd_and_si(src, vExpMask);
-        simdscalari vMan  = _simd_and_si(src, vManMask);
-
-        // Per-lane classification:
-        //   vFTZMask    - exponent below the flush-to-zero limit
-        //   vDenormMask - exponent below the half minimum but not flushed (half subnormal)
-        //   vInfMask    - exponent field all ones (infinity / NaN)
-        //   vClampMask  - exponent above the half maximum but not infinity / NaN
-        simdscalari vFTZMask    = _simd_cmplt_epi32(vExp, vExpMinFtz);
-        simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
-        simdscalari vInfMask    = _simd_cmpeq_epi32(vExpMask, vExp);
-        simdscalari vClampMask  = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));
-
-        // Rebase the exponent: (exp - expMin) + 1, still in the float32 exponent field position.
-        simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin),
-                                               _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
-
-        // pack output 16-bits into the lower 16-bits of each 32-bit channel
-        simdscalari vDst =
-            _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
-        // The mantissa is truncated (low 13 bits dropped), not rounded.
-        vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
-
-        // Flush To Zero
-        vDst = _simd_andnot_si(vFTZMask, vDst);
-        // Apply Infinites / NaN
-        vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
-
-        // Apply clamps
-        // 0x7BFF is the largest finite half value (exponent 0x1E, mantissa all ones).
-        vDst = _simd_andnot_si(vClampMask, vDst);
-        vDst = _simd_or_si(vDst, _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
-
-        // Compute Denormals (subnormals)
-        // Scalar fallback, entered only when at least one lane is flagged in vDenormMask.
-        if (!_mm256_testz_si256(vDenormMask, vDenormMask))
-        {
-            uint32_t* pDenormMask = (uint32_t*)&vDenormMask;
-            uint32_t* pExp        = (uint32_t*)&vExp;
-            uint32_t* pMan        = (uint32_t*)&vMan;
-            uint32_t* pDst        = (uint32_t*)&vDst;
-            for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
-            {
-                if (pDenormMask[i])
-                {
-                    // Need to compute subnormal value
-                    uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
-                    uint32_t mantissa =
-                        pMan[i] | (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s.
-                                                               // Make it explicit
-
-                    // Shift right by the exponent deficit plus the mantissa width difference.
-                    pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) +
-                                           (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
-                }
-            }
-        }
-
-        // Add in sign bits
-        // (bit 31 of the float moves to bit 15 of the half)
-        vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
-
-        // Pack to lower 128-bits
-        vDst = _mm256_castsi128_si256(
-            _mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
-
-        // Disabled debug cross-check against the hardware conversion.
-#if 0
-#if !defined(NDEBUG)
-        simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));
-
-        for (uint32_t i = 0; i < 4; ++i)
-        {
-            SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
-        }
-#endif
-#endif
-
-        return _simd_castsi_ps(vDst);
-
-#else
-        // Non-AVX architectures: use the float32 -> float16 conversion intrinsic, truncating.
-        return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
-#endif
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    // Not implemented: invokes SWR_NOT_IMPL and returns a zero vector.
-    static simdscalar unpack(const simdscalar& in)
-    {
-        // input is 8 packed float16, output is 8 packed float32
-        SWR_NOT_IMPL; // @todo
-        return _simd_setzero_ps();
-    }
-
-    // 16-wide pack: converts the two 8-lane halves of `in` separately and places the two
-    // 128-bit results side by side in the lower half of a zero-initialized simd16 value.
-    static simd16scalar pack(const simd16scalar& in)
-    {
-        simd16scalari result   = _simd16_setzero_si();
-        simdscalari   resultlo = _simd_setzero_si();
-
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
-        // AVX: reuse the 8-wide pack() above and take the low 128 bits of each result.
-        simdscalar simdlo = pack(_simd16_extract_ps(in, 0));
-        simdscalar simdhi = pack(_simd16_extract_ps(in, 1));
-
-        __m128i templo = _simd_extractf128_si(_simd_castps_si(simdlo), 0);
-        __m128i temphi = _simd_extractf128_si(_simd_castps_si(simdhi), 0);
-
-#else
-        __m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC);
-        __m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC);
-
-#endif
-        resultlo = _simd_insertf128_si(resultlo, templo, 0);
-        resultlo = _simd_insertf128_si(resultlo, temphi, 1);
-
-        result = _simd16_insert_si(result, resultlo, 0);
-
-        return _simd16_castsi_ps(result);
-    }
-
-    // Not implemented: invokes SWR_NOT_IMPL and returns a zero vector.
-    static simd16scalar unpack(const simd16scalar& in)
-    {
-        // input is 16 packed float16, output is 16 packed float32
-        SWR_NOT_IMPL; //  @todo
-        return _simd16_setzero_ps();
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// TypeTraits - Format type traits specialization for FLOAT32
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
-{
-    // 32-bit SWR_TYPE_FLOAT channel; derives from PackTraits<32>. This is the only
-    // specialization in this group whose convertSrgb() performs a real conversion.
-    static const SWR_TYPE    MyType = SWR_TYPE_FLOAT;
-    static float             toFloat() { return 1.0f; }
-    static float             fromFloat() { return 1.0f; }
-    // 8-wide sRGB encode: splits the vector into two __m128 halves, converts each with
-    // ConvertFloatToSRGB2, and writes them back. Note that `in` itself is overwritten with
-    // the converted value, which is also returned. Only KNOB_SIMD_WIDTH == 8 is supported.
-    static inline simdscalar convertSrgb(simdscalar& in)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        __m128 srcLo = _mm256_extractf128_ps(in, 0);
-        __m128 srcHi = _mm256_extractf128_ps(in, 1);
-
-        srcLo = ConvertFloatToSRGB2(srcLo);
-        srcHi = ConvertFloatToSRGB2(srcHi);
-
-        in = _mm256_insertf128_ps(in, srcLo, 0);
-        in = _mm256_insertf128_ps(in, srcHi, 1);
-#else
-#error Unsupported vector width
-#endif
-        return in;
-    }
-
-    // 16-wide sRGB encode: forwards to the simd16 ConvertFloatToSRGB2 overload; unlike the
-    // 8-wide version above it does not assign to `in`.
-    static inline simd16scalar convertSrgb(simd16scalar& in) { return ConvertFloatToSRGB2(in); }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FormatIntType - Calculate base integer type for pixel components based
-///                 on total number of bits.  Components can be smaller
-///                 than this type, but the entire pixel must not be
-///                 any smaller than this type.
-//////////////////////////////////////////////////////////////////////////
-// Primary template: selected when bits > 16 (both defaulted flags are false) -> uint32_t.
-// The two bool parameters are computed from `bits` by their default arguments; callers
-// only supply `bits`.
-template <uint32_t bits, bool bits8 = bits <= 8, bool bits16 = bits <= 16>
-struct FormatIntType
-{
-    typedef uint32_t TYPE;
-};
-
-// bits <= 8 (which also implies bits <= 16) -> uint8_t.
-template <uint32_t bits>
-struct FormatIntType<bits, true, true>
-{
-    typedef uint8_t TYPE;
-};
-
-// 8 < bits <= 16 -> uint16_t.
-template <uint32_t bits>
-struct FormatIntType<bits, false, true>
-{
-    typedef uint16_t TYPE;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Format1 - Bitfield for single component formats.
-//////////////////////////////////////////////////////////////////////////
-// Single-component pixel layout: one x-bit field named r. Because this is a union, the
-// g, b and a members declared below each occupy the same storage as r.
-template <uint32_t x>
-union Format1
-{
-    // Storage type chosen from the component's bit count.
-    typedef typename FormatIntType<x>::TYPE TYPE;
-    struct
-    {
-        TYPE r : x;
-    };
-
-    ///@ The following are here to provide full template needed in Formats.
-    struct
-    {
-        TYPE g : x;
-    };
-    struct
-    {
-        TYPE b : x;
-    };
-    struct
-    {
-        TYPE a : x;
-    };
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Format2 - Bitfield for 2 component formats.
-//////////////////////////////////////////////////////////////////////////
-// Two-component pixel layout: r is x bits wide and g is y bits wide. Because this is a
-// union, the second anonymous struct's b and a overlay r and g respectively.
-template <uint32_t x, uint32_t y>
-union Format2
-{
-    // Storage type chosen from the total bit count x + y.
-    typedef typename FormatIntType<x + y>::TYPE TYPE;
-
-    struct
-    {
-        TYPE r : x;
-        TYPE g : y;
-    };
-    struct
-    {
-        ///@ The following are here to provide full template needed in Formats.
-        TYPE b : x;
-        TYPE a : y;
-    };
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Format3 - Bitfield for 3 component formats.
-//////////////////////////////////////////////////////////////////////////
-// Three-component pixel layout: r, g and b are x, y and z bits wide. Because this is a
-// union, the full-width member `a` shares storage with the r/g/b bitfields.
-template <uint32_t x, uint32_t y, uint32_t z>
-union Format3
-{
-    // Storage type chosen from the total bit count x + y + z.
-    typedef typename FormatIntType<x + y + z>::TYPE TYPE;
-
-    struct
-    {
-        TYPE r : x;
-        TYPE g : y;
-        TYPE b : z;
-    };
-    TYPE a; ///@note This is here to provide full template needed in Formats.
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Format4 - Bitfield for 4 component formats.
-//////////////////////////////////////////////////////////////////////////
-// Four-component pixel layout: r, g, b and a are x, y, z and w bits wide. Unlike
-// Format1..Format3 this is a struct, so every component has its own bitfield.
-template <uint32_t x, uint32_t y, uint32_t z, uint32_t w>
-struct Format4
-{
-    // Storage type chosen from the total bit count x + y + z + w.
-    typedef typename FormatIntType<x + y + z + w>::TYPE TYPE;
-
-    TYPE r : x;
-    TYPE g : y;
-    TYPE b : z;
-    TYPE a : w;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Defaults - Default component values
-//////////////////////////////////////////////////////////////////////////
-template <uint32_t x, uint32_t y, uint32_t z, uint32_t w>
-struct Defaults
-{
-    /// Returns the default value for component index comp: 0 -> x, 1 -> y, 2 -> z, 3 -> w.
-    /// comp is used directly as an array index and is not range-checked.
-    INLINE static uint32_t GetDefault(uint32_t comp)
-    {
-        static const uint32_t perComponent[4] = {x, y, z, w};
-
-        const uint32_t value = perComponent[comp];
-        return value;
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// ComponentTraits - Component type traits.
-//////////////////////////////////////////////////////////////////////////
-template <SWR_TYPE X,
-          uint32_t NumBitsX,
-          SWR_TYPE Y        = SWR_TYPE_UNKNOWN,
-          uint32_t NumBitsY = 0,
-          SWR_TYPE Z        = SWR_TYPE_UNKNOWN,
-          uint32_t NumBitsZ = 0,
-          SWR_TYPE W        = SWR_TYPE_UNKNOWN,
-          uint32_t NumBitsW = 0>
-struct ComponentTraits
-{
-    INLINE static SWR_TYPE GetType(uint32_t comp)
-    {
-        static const SWR_TYPE CompType[4]{X, Y, Z, W};
-        return CompType[comp];
-    }
-
-    INLINE static constexpr uint32_t GetConstBPC(uint32_t comp)
-    {
-        return (comp == 3) ? NumBitsW
-                           : ((comp == 2) ? NumBitsZ : ((comp == 1) ? NumBitsY : NumBitsX));
-    }
-
-    INLINE static uint32_t GetBPC(uint32_t comp)
-    {
-        static const uint32_t MyBpc[4]{NumBitsX, NumBitsY, NumBitsZ, NumBitsW};
-        return MyBpc[comp];
-    }
-
-    INLINE static bool isNormalized(uint32_t comp)
-    {
-        switch (comp)
-        {
-        case 0:
-            return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
-        case 1:
-            return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
-        case 2:
-            return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
-        case 3:
-            return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
-        }
-        SWR_INVALID("Invalid component: %d", comp);
-        return false;
-    }
-
-    INLINE static float toFloat(uint32_t comp)
-    {
-        switch (comp)
-        {
-        case 0:
-            return TypeTraits<X, NumBitsX>::toFloat();
-        case 1:
-            return TypeTraits<Y, NumBitsY>::toFloat();
-        case 2:
-            return TypeTraits<Z, NumBitsZ>::toFloat();
-        case 3:
-            return TypeTraits<W, NumBitsW>::toFloat();
-        }
-        SWR_INVALID("Invalid component: %d", comp);
-        return TypeTraits<X, NumBitsX>::toFloat();
-    }
-
-    INLINE static float fromFloat(uint32_t comp)
-    {
-        switch (comp)
-        {
-        case 0:
-            return TypeTraits<X, NumBitsX>::fromFloat();
-        case 1:
-            return TypeTraits<Y, NumBitsY>::fromFloat();
-        case 2:
-            return TypeTraits<Z, NumBitsZ>::fromFloat();
-        case 3:
-            return TypeTraits<W, NumBitsW>::fromFloat();
-        }
-        SWR_INVALID("Invalid component: %d", comp);
-        return TypeTraits<X, NumBitsX>::fromFloat();
-    }
-
-    INLINE static void loadSOA(uint32_t comp, const uint8_t* pSrc, simdscalar& dst)
-    {
-        switch (comp)
-        {
-        case 0:
-            dst = TypeTraits<X, NumBitsX>::loadSOA(pSrc);
-            return;
-        case 1:
-            dst = TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
-            return;
-        case 2:
-            dst = TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
-            return;
-        case 3:
-            dst = TypeTraits<W, NumBitsW>::loadSOA(pSrc);
-            return;
-        }
-        SWR_INVALID("Invalid component: %d", comp);
-        dst = TypeTraits<X, NumBitsX>::loadSOA(pSrc);
-    }
-
-    INLINE static void storeSOA(uint32_t comp, uint8_t* pDst, simdscalar const& src)
-    {
-        switch (comp)
-        {
-        case 0:
-            TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
-            return;
-        case 1:
-            TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
-            return;
-        case 2:
-            TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
-            return;
-        case 3:
-            TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
-            return;
-        }
-        SWR_INVALID("Invalid component: %d", comp);
-    }
-
-    INLINE static simdscalar unpack(uint32_t comp, simdscalar& in)
-    {
-        simdscalar out;
-        switch (comp)
-        {
-        case 0:
-            out = TypeTraits<X, NumBitsX>::unpack(in);
-            break;
-        case 1:
-            out = TypeTraits<Y, NumBitsY>::unpack(in);
-            break;
-        case 2:
-            out = TypeTraits<Z, NumBitsZ>::unpack(in);
-            break;
-        case 3:
-            out = TypeTraits<W, NumBitsW>::unpack(in);
-            break;
-        default:
-            SWR_INVALID("Invalid component: %d", comp);
-            out = in;
-            break;
-        }
-        return out;
-    }
-
-    INLINE static simdscalar pack(uint32_t comp, simdscalar& in)
-    {
-        simdscalar out;
-        switch (comp)
-        {
-        case 0:
-            out = TypeTraits<X, NumBitsX>::pack(in);
-            break;
-        case 1:
-            out = TypeTraits<Y, NumBitsY>::pack(in);
-            break;
-        case 2:
-            out = TypeTraits<Z, NumBitsZ>::pack(in);
-            break;
-        case 3:
-            out = TypeTraits<W, NumBitsW>::pack(in);
-            break;
-        default:
-            SWR_INVALID("Invalid component: %d", comp);
-            out = in;
-            break;
-        }
-        return out;
-    }
-
-    INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar& in)
-    {
-        switch (comp)
-        {
-        case 0:
-            return TypeTraits<X, NumBitsX>::convertSrgb(in);
-        case 1:
-            return TypeTraits<Y, NumBitsY>::convertSrgb(in);
-        case 2:
-            return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
-        case 3:
-            return TypeTraits<W, NumBitsW>::convertSrgb(in);
-        }
-        SWR_INVALID("Invalid component: %d", comp);
-        return TypeTraits<X, NumBitsX>::convertSrgb(in);
-    }
-
-    INLINE static void SIMDCALL loadSOA(uint32_t comp, const uint8_t* pSrc, simd16scalar& dst)
-    {
-        switch (comp)
-        {
-        case 0:
-            dst = TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
-            return;
-        case 1:
-            dst = TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc);
-            return;
-        case 2:
-            dst = TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc);
-            return;
-        case 3:
-            dst = TypeTraits<W, NumBitsW>::loadSOA_16(pSrc);
-            return;
-        }
-        SWR_INVALID("Invalid component: %d", comp);
-        dst = TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
-    }
-
-    INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t* pDst, simd16scalar const& src)
-    {
-        switch (comp)
-        {
-        case 0:
-            TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
-            return;
-        case 1:
-            TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
-            return;
-        case 2:
-            TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
-            return;
-        case 3:
-            TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
-            return;
-        }
-        SWR_INVALID("Invalid component: %d", comp);
-        TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
-    }
-
-    INLINE static simd16scalar unpack(uint32_t comp, simd16scalar& in)
-    {
-        switch (comp)
-        {
-        case 0:
-            return TypeTraits<X, NumBitsX>::unpack(in);
-        case 1:
-            return TypeTraits<Y, NumBitsY>::unpack(in);
-        case 2:
-            return TypeTraits<Z, NumBitsZ>::unpack(in);
-        case 3:
-            return TypeTraits<W, NumBitsW>::unpack(in);
-        }
-        SWR_INVALID("Invalid component: %d", comp);
-        return TypeTraits<X, NumBitsX>::unpack(in);
-    }
-
-    INLINE static simd16scalar pack(uint32_t comp, simd16scalar& in)
-    {
-        switch (comp)
-        {
-        case 0:
-            return TypeTraits<X, NumBitsX>::pack(in);
-        case 1:
-            return TypeTraits<Y, NumBitsY>::pack(in);
-        case 2:
-            return TypeTraits<Z, NumBitsZ>::pack(in);
-        case 3:
-            return TypeTraits<W, NumBitsW>::pack(in);
-        }
-        SWR_INVALID("Invalid component: %d", comp);
-        return TypeTraits<X, NumBitsX>::pack(in);
-    }
-
-    INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar& in)
-    {
-        switch (comp)
-        {
-        case 0:
-            return TypeTraits<X, NumBitsX>::convertSrgb(in);
-        case 1:
-            return TypeTraits<Y, NumBitsY>::convertSrgb(in);
-        case 2:
-            return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
-        case 3:
-            return TypeTraits<W, NumBitsW>::convertSrgb(in);
-        }
-        SWR_INVALID("Invalid component: %d", comp);
-        return TypeTraits<X, NumBitsX>::convertSrgb(in);
-    }
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_utils.h b/src/gallium/drivers/swr/rasterizer/core/format_utils.h
deleted file mode 100644
index 7c0b62f1910..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/format_utils.h
+++ /dev/null
@@ -1,939 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file format_utils.h
- *
- * @brief Utilities used by SWR core related to pixel formats.
- *
- ******************************************************************************/
-#pragma once
-
-#include "core/utils.h"
-#include "common/simdintrin.h"
-
-INLINE
-void vTranspose(simd4scalar& row0, simd4scalar& row1, simd4scalar& row2, simd4scalar& row3)
-{
-    simd4scalari row0i = SIMD128::castps_si(row0);
-    simd4scalari row1i = SIMD128::castps_si(row1);
-    simd4scalari row2i = SIMD128::castps_si(row2);
-    simd4scalari row3i = SIMD128::castps_si(row3);
-
-    simd4scalari vTemp = row2i;
-    row2i              = SIMD128::unpacklo_epi32(row2i, row3i);
-    vTemp              = SIMD128::unpackhi_epi32(vTemp, row3i);
-
-    row3i = row0i;
-    row0i = SIMD128::unpacklo_epi32(row0i, row1i);
-    row3i = SIMD128::unpackhi_epi32(row3i, row1i);
-
-    row1i = row0i;
-    row0i = SIMD128::unpacklo_epi64(row0i, row2i);
-    row1i = SIMD128::unpackhi_epi64(row1i, row2i);
-
-    row2i = row3i;
-    row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
-    row3i = SIMD128::unpackhi_epi64(row3i, vTemp);
-
-    row0 = SIMD128::castsi_ps(row0i);
-    row1 = SIMD128::castsi_ps(row1i);
-    row2 = SIMD128::castsi_ps(row2i);
-    row3 = SIMD128::castsi_ps(row3i);
-}
-
-INLINE
-void vTranspose(simd4scalari& row0, simd4scalari& row1, simd4scalari& row2, simd4scalari& row3)
-{
-    simd4scalari vTemp = row2;
-    row2               = SIMD128::unpacklo_epi32(row2, row3);
-    vTemp              = SIMD128::unpackhi_epi32(vTemp, row3);
-
-    row3 = row0;
-    row0 = SIMD128::unpacklo_epi32(row0, row1);
-    row3 = SIMD128::unpackhi_epi32(row3, row1);
-
-    row1 = row0;
-    row0 = SIMD128::unpacklo_epi64(row0, row2);
-    row1 = SIMD128::unpackhi_epi64(row1, row2);
-
-    row2 = row3;
-    row2 = SIMD128::unpacklo_epi64(row2, vTemp);
-    row3 = SIMD128::unpackhi_epi64(row3, vTemp);
-}
-
-#if KNOB_SIMD_WIDTH == 8
-INLINE
-void vTranspose3x8(simd4scalar (&vDst)[8],
-                   const simdscalar& vSrc0,
-                   const simdscalar& vSrc1,
-                   const simdscalar& vSrc2)
-{
-    simdscalar r0r2       = _simd_unpacklo_ps(vSrc0, vSrc2);              // x0z0x1z1 x4z4x5z5
-    simdscalar r1rx       = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); // y0w0y1w1 y4w4y5w5
-    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);                // x0y0z0w0 x4y4z4w4
-    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);                // x1y1z1w1 x5y5z5w5
-
-    r0r2                  = _simd_unpackhi_ps(vSrc0, vSrc2);              // x2z2x3z3 x6z6x7z7
-    r1rx                  = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); // y2w2y3w3 y6w6y7w7
-    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);                // x2y2z2w2 x6y6z6w6
-    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);                // x3y3z3w3 x7y7z7w7
-
-    vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
-    vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
-    vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
-    vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
-
-    vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
-    vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
-    vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
-    vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
-}
-
-INLINE
-void vTranspose4x8(simd4scalar (&vDst)[8],
-                   const simdscalar& vSrc0,
-                   const simdscalar& vSrc1,
-                   const simdscalar& vSrc2,
-                   const simdscalar& vSrc3)
-{
-    simdscalar r0r2       = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5
-    simdscalar r1rx       = _simd_unpacklo_ps(vSrc1, vSrc3); // y0w0y1w1 y4w4y5w5
-    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);   // x0y0z0w0 x4y4z4w4
-    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);   // x1y1z1w1 x5y5z5w5
-
-    r0r2                  = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7
-    r1rx                  = _simd_unpackhi_ps(vSrc1, vSrc3); // y2w2y3w3 y6w6y7w7
-    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);   // x2y2z2w2 x6y6z6w6
-    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);   // x3y3z3w3 x7y7z7w7
-
-    vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
-    vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
-    vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
-    vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
-
-    vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
-    vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
-    vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
-    vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
-}
-
-INLINE
-void vTranspose4x16(simd16scalar (&dst)[4],
-                    const simd16scalar& src0,
-                    const simd16scalar& src1,
-                    const simd16scalar& src2,
-                    const simd16scalar& src3)
-{
-    const simd16scalari perm =
-        _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
-
-    // pre-permute input to setup the right order after all the unpacking
-
-    simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
-    simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
-    simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
-    simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
-
-    simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
-    simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
-    simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
-    simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);
-
-    dst[0] = _simd16_unpacklo_ps(rblo, galo);
-    dst[1] = _simd16_unpackhi_ps(rblo, galo);
-    dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
-    dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
-}
-
-INLINE
-void vTranspose8x8(simdscalar (&vDst)[8],
-                   const simdscalar& vMask0,
-                   const simdscalar& vMask1,
-                   const simdscalar& vMask2,
-                   const simdscalar& vMask3,
-                   const simdscalar& vMask4,
-                   const simdscalar& vMask5,
-                   const simdscalar& vMask6,
-                   const simdscalar& vMask7)
-{
-    simdscalar __t0  = _simd_unpacklo_ps(vMask0, vMask1);
-    simdscalar __t1  = _simd_unpackhi_ps(vMask0, vMask1);
-    simdscalar __t2  = _simd_unpacklo_ps(vMask2, vMask3);
-    simdscalar __t3  = _simd_unpackhi_ps(vMask2, vMask3);
-    simdscalar __t4  = _simd_unpacklo_ps(vMask4, vMask5);
-    simdscalar __t5  = _simd_unpackhi_ps(vMask4, vMask5);
-    simdscalar __t6  = _simd_unpacklo_ps(vMask6, vMask7);
-    simdscalar __t7  = _simd_unpackhi_ps(vMask6, vMask7);
-    simdscalar __tt0 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0));
-    simdscalar __tt1 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2));
-    simdscalar __tt2 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0));
-    simdscalar __tt3 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2));
-    simdscalar __tt4 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0));
-    simdscalar __tt5 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2));
-    simdscalar __tt6 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0));
-    simdscalar __tt7 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2));
-    vDst[0]          = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
-    vDst[1]          = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
-    vDst[2]          = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
-    vDst[3]          = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
-    vDst[4]          = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
-    vDst[5]          = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
-    vDst[6]          = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
-    vDst[7]          = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
-}
-
-INLINE
-void vTranspose8x8(simdscalar (&vDst)[8],
-                   const simdscalari& vMask0,
-                   const simdscalari& vMask1,
-                   const simdscalari& vMask2,
-                   const simdscalari& vMask3,
-                   const simdscalari& vMask4,
-                   const simdscalari& vMask5,
-                   const simdscalari& vMask6,
-                   const simdscalari& vMask7)
-{
-    vTranspose8x8(vDst,
-                  _simd_castsi_ps(vMask0),
-                  _simd_castsi_ps(vMask1),
-                  _simd_castsi_ps(vMask2),
-                  _simd_castsi_ps(vMask3),
-                  _simd_castsi_ps(vMask4),
-                  _simd_castsi_ps(vMask5),
-                  _simd_castsi_ps(vMask6),
-                  _simd_castsi_ps(vMask7));
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-/// TransposeSingleComponent
-//////////////////////////////////////////////////////////////////////////
-template <uint32_t bpp>
-struct TransposeSingleComponent
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Pass-thru for single component.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
-    }
-
-    INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose8_8_8_8
-//////////////////////////////////////////////////////////////////////////
-struct Transpose8_8_8_8
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
-
-#if KNOB_SIMD_WIDTH == 8
-#if KNOB_ARCH <= KNOB_ARCH_AVX
-        simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg
-        simd4scalari c2c3 =
-            SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa
-        simd4scalari c0c2    = SIMD128::unpacklo_epi64(c0c1, c2c3);            // rrrrrrrrbbbbbbbb
-        simd4scalari c1c3    = SIMD128::unpackhi_epi64(c0c1, c2c3);            // ggggggggaaaaaaaa
-        simd4scalari c01     = SIMD128::unpacklo_epi8(c0c2, c1c3);             // rgrgrgrgrgrgrgrg
-        simd4scalari c23     = SIMD128::unpackhi_epi8(c0c2, c1c3);             // babababababababa
-        simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23);              // rgbargbargbargba
-        simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23);              // rgbargbargbargba
-        SIMD128::store_si((simd4scalari*)pDst, c0123lo);
-        SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
-#else
-        simdscalari dst01 = _simd_shuffle_epi8(src,
-                                               _simd_set_epi32(0x0f078080,
-                                                               0x0e068080,
-                                                               0x0d058080,
-                                                               0x0c048080,
-                                                               0x80800b03,
-                                                               0x80800a02,
-                                                               0x80800901,
-                                                               0x80800800));
-        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
-        dst23             = _simd_shuffle_epi8(dst23,
-                                   _simd_set_epi32(0x80800f07,
-                                                   0x80800e06,
-                                                   0x80800d05,
-                                                   0x80800c04,
-                                                   0x0b038080,
-                                                   0x0a028080,
-                                                   0x09018080,
-                                                   0x08008080));
-        simdscalari dst   = _simd_or_si(dst01, dst23);
-        _simd_store_si((simdscalari*)pDst, dst);
-#endif
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD16_WIDTH == 16
-        // clang-format off
-
-        simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc));      // rrrrrrrrrrrrrrrr
-        simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1);  // gggggggggggggggg
-        simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 2);  // bbbbbbbbbbbbbbbb
-        simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 3);  // aaaaaaaaaaaaaaaa
-
-        simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
-        simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
-        simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
-        simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
-
-        simd16scalari shl1 = _simd16_slli_epi32(cvt1,  8);
-        simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
-        simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
-
-        simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
-
-        _simd16_store_si(reinterpret_cast<simd16scalari*>(pDst), dst);  // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
-
-        // clang-format on
-#else
-#error Unsupported vector width
-#endif
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose8_8_8
-//////////////////////////////////////////////////////////////////////////
-struct Transpose8_8_8
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose8_8
-//////////////////////////////////////////////////////////////////////////
-struct Transpose8_8
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
-
-        simd4scalari rg = src.v4[0];                       // rrrrrrrr gggggggg
-        simd4scalari g  = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg
-        rg              = SIMD128::unpacklo_epi8(rg, g);
-        SIMD128::store_si((simd4scalari*)pDst, rg);
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD16_WIDTH == 16
-        // clang-format off
-
-        simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc));      // rrrrrrrrrrrrrrrr
-        simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1);  // gggggggggggggggg
-
-        simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
-        simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
-
-        simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
-
-        simdscalari dst = _simd_or_si(cvt0, shl1);
-
-        _simd_store_si(reinterpret_cast<simdscalari*>(pDst), dst);  // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
-
-        // clang-format on
-#else
-#error Unsupported vector width
-#endif
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose32_32_32_32
-//////////////////////////////////////////////////////////////////////////
-struct Transpose32_32_32_32
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalar src0 = _simd_load_ps((const float*)pSrc);
-        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
-        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
-        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
-
-        simd4scalar vDst[8];
-        vTranspose4x8(vDst, src0, src1, src2, src3);
-        SIMD128::store_ps((float*)pDst, vDst[0]);
-        SIMD128::store_ps((float*)pDst + 4, vDst[1]);
-        SIMD128::store_ps((float*)pDst + 8, vDst[2]);
-        SIMD128::store_ps((float*)pDst + 12, vDst[3]);
-        SIMD128::store_ps((float*)pDst + 16, vDst[4]);
-        SIMD128::store_ps((float*)pDst + 20, vDst[5]);
-        SIMD128::store_ps((float*)pDst + 24, vDst[6]);
-        SIMD128::store_ps((float*)pDst + 28, vDst[7]);
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD16_WIDTH == 16
-        // clang-format off
-
-        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
-        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
-        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
-        simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 48);
-
-        simd16scalar dst[4];
-
-        vTranspose4x16(dst, src0, src1, src2, src3);
-
-        _simd16_store_ps(reinterpret_cast<float*>(pDst) +  0, dst[0]);
-        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
-        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
-        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
-
-        // clang-format on
-#else
-#error Unsupported vector width
-#endif
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose32_32_32
-//////////////////////////////////////////////////////////////////////////
-struct Transpose32_32_32
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalar src0 = _simd_load_ps((const float*)pSrc);
-        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
-        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
-
-        simd4scalar vDst[8];
-        vTranspose3x8(vDst, src0, src1, src2);
-        SIMD128::store_ps((float*)pDst, vDst[0]);
-        SIMD128::store_ps((float*)pDst + 4, vDst[1]);
-        SIMD128::store_ps((float*)pDst + 8, vDst[2]);
-        SIMD128::store_ps((float*)pDst + 12, vDst[3]);
-        SIMD128::store_ps((float*)pDst + 16, vDst[4]);
-        SIMD128::store_ps((float*)pDst + 20, vDst[5]);
-        SIMD128::store_ps((float*)pDst + 24, vDst[6]);
-        SIMD128::store_ps((float*)pDst + 28, vDst[7]);
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD16_WIDTH == 16
-        // clang-format off
-
-        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
-        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
-        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
-        simd16scalar src3 = _simd16_setzero_ps();
-
-        simd16scalar dst[4];
-
-        vTranspose4x16(dst, src0, src1, src2, src3);
-
-        _simd16_store_ps(reinterpret_cast<float*>(pDst) +  0, dst[0]);
-        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
-        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
-        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
-
-        // clang-format on
-#else
-#error Unsupported vector width
-#endif
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose32_32
-//////////////////////////////////////////////////////////////////////////
-struct Transpose32_32
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        const float* pfSrc  = (const float*)pSrc;
-        simd4scalar  src_r0 = SIMD128::load_ps(pfSrc + 0);
-        simd4scalar  src_r1 = SIMD128::load_ps(pfSrc + 4);
-        simd4scalar  src_g0 = SIMD128::load_ps(pfSrc + 8);
-        simd4scalar  src_g1 = SIMD128::load_ps(pfSrc + 12);
-
-        simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
-        simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
-        simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
-        simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
-
-        float* pfDst = (float*)pDst;
-        SIMD128::store_ps(pfDst + 0, dst0);
-        SIMD128::store_ps(pfDst + 4, dst1);
-        SIMD128::store_ps(pfDst + 8, dst2);
-        SIMD128::store_ps(pfDst + 12, dst3);
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD16_WIDTH == 16
-        // clang-format off
-
-        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));      // rrrrrrrrrrrrrrrr
-        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); // gggggggggggggggg
-
-        simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1);                            // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
-        simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1);                            // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
-
-        simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0)  // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
-        simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2)  // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
-
-        simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0)  // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
-        simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0)  // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
-
-        _simd16_store_ps(reinterpret_cast<float*>(pDst) +  0, dst0);                    // rgrgrgrgrgrgrgrg
-        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst1);                    // rgrgrgrgrgrgrgrg
-
-        // clang-format on
-#else
-#error Unsupported vector width
-#endif
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose16_16_16_16
-//////////////////////////////////////////////////////////////////////////
-struct Transpose16_16_16_16
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
-        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
-
-        simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
-        simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
-        simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
-        simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
-
-        simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
-        simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
-        simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
-        simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
-
-        simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
-        simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
-        simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
-        simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
-
-        SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
-        SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
-        SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
-        SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    // SIMD16 variant of Transpose: reads four consecutive simdscalari channel
-    // planes from pSrc and writes four simdscalari of interleaved pixels to pDst.
-    // The lane comments below assume the unpack operations work within each
-    // 128-bit lane, which is why a final cross-lane permute is needed.
-    INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD16_WIDTH == 16
-        // clang-format off
-
-        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));       // rrrrrrrrrrrrrrrr
-        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1);   // gggggggggggggggg
-        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2);   // bbbbbbbbbbbbbbbb
-        simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 3);   // aaaaaaaaaaaaaaaa
-
-        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                    // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
-        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                    // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
-        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                    // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
-        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                    // ba4 ba5 ba6 ba7 baC baD baE baF
-
-        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                    // rgba0 rgba1 rgba8 rgba9
-        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                    // rgba2 rgba3 rgbaA rgbaB
-        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                    // rgba4 rgba5 rgbaC rgbaD
-        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                    // rgba6 rgba7 rgbaE rgbaF
-
-        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)   // rgba0 rgba1 rgba2 rgba3
-        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)   // rgba4 rgba5 rgba6 rgba7
-        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)   // rgba8 rgba9 rgbaA rgbaB
-        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)   // rgbaC rgbaD rgbaE rgbaF
-
-        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0);         // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1);         // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2);         // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3);         // rgbargbargbargba
-
-        // clang-format on
-#else
-#error Unsupported vector width
-#endif
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose16_16_16
-//////////////////////////////////////////////////////////////////////////
-struct Transpose16_16_16
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
-    ///        Only three planes are read; the fourth component of every
-    ///        output pixel is written as zero, so the output still uses four
-    ///        16-bit slots per pixel.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        // First full-width load; going by the names it packs the r and g planes.
-        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
-
-        simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
-        simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
-        // The b plane is loaded on its own, directly after the first register.
-        simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
-        // There is no a plane in the source: substitute zeros.
-        simd4scalari src_a = SIMD128::setzero_si();
-
-        // Interleave 16-bit elements: r with g, and b with the zero filler.
-        simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
-        simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
-        simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
-        simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
-
-        // Interleave the 32-bit rg and ba pairs into whole pixels.
-        simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
-        simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
-        simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
-        simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
-
-        // Write the four results to consecutive simd4scalari slots of pDst.
-        SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
-        SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
-        SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
-        SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    // SIMD16 variant: reads three simdscalari planes from pSrc, substitutes
-    // zeros for the fourth, and writes four simdscalari of pixels to pDst.
-    // The lane comments assume the unpack operations work within each
-    // 128-bit lane, which is why a final cross-lane permute is needed.
-    INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD16_WIDTH == 16
-        // clang-format off
-
-        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));       // rrrrrrrrrrrrrrrr
-        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1);   // gggggggggggggggg
-        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2);   // bbbbbbbbbbbbbbbb
-        simdscalari src3 = _simd_setzero_si();                                              // a plane absent: all zeros
-
-        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                    // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
-        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                    // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
-        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                    // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
-        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                    // ba4 ba5 ba6 ba7 baC baD baE baF
-
-        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                    // rgba0 rgba1 rgba8 rgba9
-        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                    // rgba2 rgba3 rgbaA rgbaB
-        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                    // rgba4 rgba5 rgbaC rgbaD
-        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                    // rgba6 rgba7 rgbaE rgbaF
-
-        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)  // rgba0 rgba1 rgba2 rgba3
-        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)  // rgba4 rgba5 rgba6 rgba7
-        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)  // rgba8 rgba9 rgbaA rgbaB
-        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)  // rgbaC rgbaD rgbaE rgbaF
-
-        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0);         // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1);         // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2);         // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3);         // rgbargbargbargba
-
-        // clang-format on
-#else
-#error Unsupported vector width
-#endif
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose16_16
-//////////////////////////////////////////////////////////////////////////
-struct Transpose16_16
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
-    ///        One full-width register is read from pSrc and two
-    ///        simd4scalari results are written to pDst.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD_WIDTH == 8
-        // The data is loaded through the float path and reinterpreted as
-        // integers below; no numeric conversion is performed here.
-        simdscalar src = _simd_load_ps((const float*)pSrc);
-
-        // Split the register into its two halves, one per component.
-        simd4scalar comp0 = _simd_extractf128_ps(src, 0);
-        simd4scalar comp1 = _simd_extractf128_ps(src, 1);
-
-        simd4scalari comp0i = SIMD128::castps_si(comp0);
-        simd4scalari comp1i = SIMD128::castps_si(comp1);
-
-        // Interleave the 16-bit elements of the two components.
-        simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
-        simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);
-
-        SIMD128::store_si((simd4scalari*)pDst, resLo);
-        SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
-#else
-#error Unsupported vector width
-#endif
-    }
-
-    // SIMD16 variant: reads two simdscalari planes from pSrc and writes two
-    // simdscalari of interleaved pairs to pDst. The lane comments assume the
-    // unpack operations work within each 128-bit lane, which is why a final
-    // cross-lane permute is needed.
-    INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
-    {
-#if KNOB_SIMD16_WIDTH == 16
-        // clang-format off
-
-        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));       // rrrrrrrrrrrrrrrr
-        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1);   // gggggggggggggggg
-
-        simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1);                    // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
-        simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1);                    // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
-
-        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)   // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
-        simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)   // rg8 rg9 rgA rgB rgC rgD rgE rgF
-
-        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0);         // rgrgrgrgrgrgrgrg
-        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1);         // rgrgrgrgrgrgrgrg
-
-        // clang-format on
-#else
-#error Unsupported vector width
-#endif
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose24_8
-//////////////////////////////////////////////////////////////////////////
-struct Transpose24_8
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief SOA to AOS conversion for packed 24_8 data (not provided).
-    ///        Both entry points are declared deleted, so any call to them
-    ///        is rejected at compile time.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose32_8_24
-//////////////////////////////////////////////////////////////////////////
-struct Transpose32_8_24
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief SOA to AOS conversion for packed 32_8_24 data (not provided).
-    ///        Both entry points are declared deleted, so any call to them
-    ///        is rejected at compile time.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose4_4_4_4
-//////////////////////////////////////////////////////////////////////////
-struct Transpose4_4_4_4
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief SOA to AOS conversion for packed 4_4_4_4 data (not provided).
-    ///        Both entry points are declared deleted, so any call to them
-    ///        is rejected at compile time.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose5_6_5
-//////////////////////////////////////////////////////////////////////////
-struct Transpose5_6_5
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief SOA to AOS conversion for packed 5_6_5 data (not provided).
-    ///        Both entry points are declared deleted, so any call to them
-    ///        is rejected at compile time.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose9_9_9_5
-//////////////////////////////////////////////////////////////////////////
-struct Transpose9_9_9_5
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief SOA to AOS conversion for packed 9_9_9_5 data (not provided).
-    ///        Both entry points are declared deleted, so any call to them
-    ///        is rejected at compile time.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose5_5_5_1
-//////////////////////////////////////////////////////////////////////////
-struct Transpose5_5_5_1
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief SOA to AOS conversion for packed 5_5_5_1 data (not provided).
-    ///        Both entry points are declared deleted, so any call to them
-    ///        is rejected at compile time.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose1_5_5_5
-//////////////////////////////////////////////////////////////////////////
-struct Transpose1_5_5_5
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief SOA to AOS conversion for packed 1_5_5_5 data (not provided).
-    ///        Both entry points are declared deleted, so any call to them
-    ///        is rejected at compile time.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose10_10_10_2
-//////////////////////////////////////////////////////////////////////////
-struct Transpose10_10_10_2
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief SOA to AOS conversion for packed 10_10_10_2 data (not provided).
-    ///        Both entry points are declared deleted, so any call to them
-    ///        is rejected at compile time.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose11_11_10
-//////////////////////////////////////////////////////////////////////////
-struct Transpose11_11_10
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief SOA to AOS conversion for packed 11_11_10 data (not provided).
-    ///        Both entry points are declared deleted, so any call to them
-    ///        is rejected at compile time.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose64
-//////////////////////////////////////////////////////////////////////////
-struct Transpose64
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief SOA to AOS conversion (not provided).
-    ///        Both entry points are declared deleted, so any call to them
-    ///        is rejected at compile time.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose64_64
-//////////////////////////////////////////////////////////////////////////
-struct Transpose64_64
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief SOA to AOS conversion (not provided).
-    ///        Both entry points are declared deleted, so any call to them
-    ///        is rejected at compile time.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose64_64_64
-//////////////////////////////////////////////////////////////////////////
-struct Transpose64_64_64
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief SOA to AOS conversion (not provided).
-    ///        Both entry points are declared deleted, so any call to them
-    ///        is rejected at compile time.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Transpose64_64_64_64
-//////////////////////////////////////////////////////////////////////////
-struct Transpose64_64_64_64
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief SOA to AOS conversion (not provided).
-    ///        Both entry points are declared deleted, so any call to them
-    ///        is rejected at compile time.
-    /// @param pSrc - source data in SOA form
-    /// @param pDst - output data in AOS form
-    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
deleted file mode 100644
index 50ea12e0510..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ /dev/null
@@ -1,2385 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file frontend.cpp
- *
- * @brief Implementation for Frontend which handles vertex processing,
- *        primitive assembly, clipping, binning, etc.
- *
- ******************************************************************************/
-
-#include "api.h"
-#include "frontend.h"
-#include "backend.h"
-#include "context.h"
-#include "rdtsc_core.h"
-#include "utils.h"
-#include "threads.h"
-#include "pa.h"
-#include "clip.h"
-#include "tilemgr.h"
-#include "tessellator.h"
-#include <limits>
-#include <iostream>
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Frontend handler for SwrSync: queues one SYNC work item for the backend.
-/// @param pContext - pointer to SWR context (not read here).
-/// @param pDC - pointer to draw context; supplies the macrotile manager.
-/// @param workerId - thread's worker id (not read here).
-/// @param pUserData - pointer to user data for the sync callback (not read here).
-/// @todo This should go away when we switch this to use compute threading.
-void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
-{
-    BE_WORK syncWork;
-    syncWork.pfnWork = ProcessSyncBE;
-    syncWork.type    = SYNC;
-
-    // The sync item is always queued on macrotile (0, 0).
-    pDC->pTileMgr->enqueue(0, 0, &syncWork);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Frontend handler for SwrDestroyContext: queues SHUTDOWN work items for the backend.
-/// @param pContext - pointer to SWR context; its thread pool sizes the enqueue loops.
-/// @param pDC - pointer to draw context; supplies the macrotile manager.
-/// @param workerId - thread's worker id (not read here).
-/// @param pUserData - pointer to user data for the callback (not read here).
-void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
-{
-    BE_WORK shutdownWork;
-    shutdownWork.pfnWork = ProcessShutdownBE;
-    shutdownWork.type    = SHUTDOWN;
-
-    MacroTileMgr* pMgr = pDC->pTileMgr;
-
-    // One slot per (thread index, numa node) pair, so that every worker
-    // thread receives at least one shutdown item.
-    uint32_t nodeCount = pContext->threadPool.numaMask + 1;
-
-    uint32_t threadIdx = 0;
-    while (threadIdx < pContext->threadPool.numThreads)
-    {
-        uint32_t nodeIdx = 0;
-        while (nodeIdx < nodeCount)
-        {
-            pMgr->enqueue(threadIdx, nodeIdx, &shutdownWork);
-            ++nodeIdx;
-        }
-        ++threadIdx;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Frontend handler for SwrClearRenderTarget: queues a CLEAR work item on every
-///        macrotile touched by the clear rect.
-/// @param pContext - pointer to SWR context (not read here).
-/// @param pDC - pointer to draw context; supplies the macrotile manager.
-/// @param workerId - thread's worker id (not read here).
-/// @param pUserData - pointer to the CLEAR_DESC describing the clear.
-/// @todo This should go away when we switch this to use compute threading.
-void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
-{
-    CLEAR_DESC* pClear = static_cast<CLEAR_DESC*>(pUserData);
-
-    // The same work item is queued for every tile in range.
-    BE_WORK clearWork;
-    clearWork.pfnWork    = ProcessClearBE;
-    clearWork.type       = CLEAR;
-    clearWork.desc.clear = *pClear;
-
-    // Inclusive macrotile range; the -1 treats xmax/ymax as exclusive edges.
-    uint32_t firstTileX = pClear->rect.xmin / KNOB_MACROTILE_X_DIM;
-    uint32_t firstTileY = pClear->rect.ymin / KNOB_MACROTILE_Y_DIM;
-    uint32_t lastTileX  = (pClear->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
-    uint32_t lastTileY  = (pClear->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
-
-    MacroTileMgr* pMgr = pDC->pTileMgr;
-    for (uint32_t tileY = firstTileY; tileY <= lastTileY; ++tileY)
-    {
-        for (uint32_t tileX = firstTileX; tileX <= lastTileX; ++tileX)
-        {
-            pMgr->enqueue(tileX, tileY, &clearWork);
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Frontend handler for SwrStoreTiles: queues a STORETILES work item on every
-///        macrotile touched by the store rect.
-/// @param pContext - pointer to SWR context; supplies the bucket manager for profiling.
-/// @param pDC - pointer to draw context; supplies the macrotile manager and draw id.
-/// @param workerId - thread's worker id (not read here).
-/// @param pUserData - pointer to the STORE_TILES_DESC describing the store.
-/// @todo This should go away when we switch this to use compute threading.
-void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
-{
-    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessStoreTiles, pDC->drawId);
-    MacroTileMgr*     pMgr   = pDC->pTileMgr;
-    STORE_TILES_DESC* pStore = static_cast<STORE_TILES_DESC*>(pUserData);
-
-    // Inclusive macrotile range; the -1 treats xmax/ymax as exclusive edges.
-    uint32_t firstTileX = pStore->rect.xmin / KNOB_MACROTILE_X_DIM;
-    uint32_t lastTileX  = (pStore->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
-    uint32_t firstTileY = pStore->rect.ymin / KNOB_MACROTILE_Y_DIM;
-    uint32_t lastTileY  = (pStore->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
-
-    // The same work item is queued for every tile in range.
-    BE_WORK storeWork;
-    storeWork.pfnWork         = ProcessStoreTilesBE;
-    storeWork.type            = STORETILES;
-    storeWork.desc.storeTiles = *pStore;
-
-    for (uint32_t tileY = firstTileY; tileY <= lastTileY; ++tileY)
-    {
-        for (uint32_t tileX = firstTileX; tileX <= lastTileX; ++tileX)
-        {
-            pMgr->enqueue(tileX, tileY, &storeWork);
-        }
-    }
-
-    RDTSC_END(pContext->pBucketMgr, FEProcessStoreTiles, 0);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief FE handler for SwrInvalidateTiles. Queues a DISCARDINVALIDATETILES work item on
-///        every macrotile selected by the descriptor's rect. When fullTilesOnly is set only
-///        macrotiles lying completely inside the rect are selected; otherwise partially
-///        covered macrotiles are included too. Nothing is queued if no macrotile is selected.
-/// @param pContext - pointer to SWR context.
-/// @param pDC - pointer to draw context.
-/// @param workerId - thread's worker id. Every thread has a unique id.
-/// @param pUserData - pointer to the DISCARD_INVALIDATE_TILES_DESC for this request.
-/// @todo This should go away when we switch this to use compute threading.
-void ProcessDiscardInvalidateTiles(SWR_CONTEXT*  pContext,
-                                   DRAW_CONTEXT* pDC,
-                                   uint32_t      workerId,
-                                   void*         pUserData)
-{
-    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessInvalidateTiles, pDC->drawId);
-    DISCARD_INVALIDATE_TILES_DESC* pDesc    = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
-    MacroTileMgr*                  pTileMgr = pDC->pTileMgr;
-
-    // Tile bounds are kept signed. With fullTilesOnly set, a rect that contains no complete
-    // macrotile yields max < min (for example xmax / DIM - 1 == -1). Held in uint32_t that
-    // value wrapped to 0xFFFFFFFF, survived the clamp below, and made the enqueue loops
-    // run without ever terminating.
-    const int32_t tileDimX = static_cast<int32_t>(KNOB_MACROTILE_X_DIM);
-    const int32_t tileDimY = static_cast<int32_t>(KNOB_MACROTILE_Y_DIM);
-    const int32_t rectXMin = static_cast<int32_t>(pDesc->rect.xmin);
-    const int32_t rectXMax = static_cast<int32_t>(pDesc->rect.xmax);
-    const int32_t rectYMin = static_cast<int32_t>(pDesc->rect.ymin);
-    const int32_t rectYMax = static_cast<int32_t>(pDesc->rect.ymax);
-
-    // compute macro tile bounds for the specified rect (full tiles only: min rounds up,
-    // max rounds down)
-    int32_t macroTileXMin = (rectXMin + tileDimX - 1) / tileDimX;
-    int32_t macroTileXMax = (rectXMax / tileDimX) - 1;
-    int32_t macroTileYMin = (rectYMin + tileDimY - 1) / tileDimY;
-    int32_t macroTileYMax = (rectYMax / tileDimY) - 1;
-
-    if (pDesc->fullTilesOnly == false)
-    {
-        // include partial tiles
-        macroTileXMin = rectXMin / tileDimX;
-        macroTileXMax = (rectXMax - 1) / tileDimX;
-        macroTileYMin = rectYMin / tileDimY;
-        macroTileYMax = (rectYMax - 1) / tileDimY;
-    }
-
-    // Tile indices are passed on as unsigned values, so never start below zero.
-    macroTileXMin = std::max<int32_t>(macroTileXMin, 0);
-    macroTileYMin = std::max<int32_t>(macroTileYMin, 0);
-
-    // Only queue work when the selected range is non-empty in both directions.
-    if ((macroTileXMin <= macroTileXMax) && (macroTileYMin <= macroTileYMax))
-    {
-        SWR_ASSERT(macroTileXMax <= static_cast<int32_t>(KNOB_NUM_HOT_TILES_X));
-        SWR_ASSERT(macroTileYMax <= static_cast<int32_t>(KNOB_NUM_HOT_TILES_Y));
-
-        // NOTE(review): the clamp still admits an index equal to KNOB_NUM_HOT_TILES_X/Y,
-        // as the original did; confirm whether the last valid index is one lower.
-        macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
-        macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
-
-        // load tiles
-        BE_WORK work;
-        work.type                        = DISCARDINVALIDATETILES;
-        work.pfnWork                     = ProcessDiscardInvalidateTilesBE;
-        work.desc.discardInvalidateTiles = *pDesc;
-
-        for (int32_t x = macroTileXMin; x <= macroTileXMax; ++x)
-        {
-            for (int32_t y = macroTileYMin; y <= macroTileYMax; ++y)
-            {
-                pTileMgr->enqueue(static_cast<uint32_t>(x), static_cast<uint32_t>(y), &work);
-            }
-        }
-    }
-
-    RDTSC_END(pContext->pBucketMgr, FEProcessInvalidateTiles, 0);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Converts a vertex (or index) count into the number of whole primitives it forms
-///        for the given topology. Unsupported topologies report through SWR_INVALID and
-///        yield 0.
-/// @param mode - primitive topology for draw operation.
-/// @param numPrims - despite its name, the number of vertices or indices for the draw.
-/// @todo Frontend needs to be refactored. This will go in appropriate place then.
-uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
-{
-    const uint32_t vertCount = numPrims;
-    uint32_t       primCount = 0;
-
-    switch (mode)
-    {
-    // one primitive per vertex
-    case TOP_POINT_LIST:
-    case TOP_LINE_LOOP:
-        primCount = vertCount;
-        break;
-
-    // independent lists: fixed number of vertices per primitive
-    case TOP_LINE_LIST:
-        primCount = vertCount / 2;
-        break;
-    case TOP_TRIANGLE_LIST:
-    case TOP_RECT_LIST:
-        primCount = vertCount / 3;
-        break;
-    case TOP_QUAD_LIST:
-    case TOP_LINE_LIST_ADJ:
-        primCount = vertCount / 4;
-        break;
-    case TOP_TRI_LIST_ADJ:
-        primCount = vertCount / 6;
-        break;
-
-    // strips and fans: guard against too few vertices before subtracting
-    case TOP_LINE_STRIP:
-    case TOP_TRIANGLE_DISC:
-        primCount = (vertCount >= 2) ? vertCount - 1 : 0;
-        break;
-    case TOP_TRIANGLE_STRIP:
-    case TOP_TRIANGLE_FAN:
-        primCount = (vertCount >= 3) ? vertCount - 2 : 0;
-        break;
-    case TOP_LISTSTRIP_ADJ:
-        primCount = (vertCount >= 3) ? vertCount - 3 : 0;
-        break;
-    case TOP_QUAD_STRIP:
-        primCount = (vertCount >= 4) ? (vertCount - 2) / 2 : 0;
-        break;
-    case TOP_TRI_STRIP_ADJ:
-        primCount = (vertCount >= 4) ? (vertCount / 2) - 2 : 0;
-        break;
-
-    // patch lists: control point count is the offset from TOP_PATCHLIST_BASE
-    case TOP_PATCHLIST_1:
-    case TOP_PATCHLIST_2:
-    case TOP_PATCHLIST_3:
-    case TOP_PATCHLIST_4:
-    case TOP_PATCHLIST_5:
-    case TOP_PATCHLIST_6:
-    case TOP_PATCHLIST_7:
-    case TOP_PATCHLIST_8:
-    case TOP_PATCHLIST_9:
-    case TOP_PATCHLIST_10:
-    case TOP_PATCHLIST_11:
-    case TOP_PATCHLIST_12:
-    case TOP_PATCHLIST_13:
-    case TOP_PATCHLIST_14:
-    case TOP_PATCHLIST_15:
-    case TOP_PATCHLIST_16:
-    case TOP_PATCHLIST_17:
-    case TOP_PATCHLIST_18:
-    case TOP_PATCHLIST_19:
-    case TOP_PATCHLIST_20:
-    case TOP_PATCHLIST_21:
-    case TOP_PATCHLIST_22:
-    case TOP_PATCHLIST_23:
-    case TOP_PATCHLIST_24:
-    case TOP_PATCHLIST_25:
-    case TOP_PATCHLIST_26:
-    case TOP_PATCHLIST_27:
-    case TOP_PATCHLIST_28:
-    case TOP_PATCHLIST_29:
-    case TOP_PATCHLIST_30:
-    case TOP_PATCHLIST_31:
-    case TOP_PATCHLIST_32:
-        primCount = vertCount / (mode - TOP_PATCHLIST_BASE);
-        break;
-
-    // topologies this function does not handle
-    case TOP_POLYGON:
-    case TOP_POINT_LIST_BF:
-    case TOP_LINE_STRIP_CONT:
-    case TOP_LINE_STRIP_BF:
-    case TOP_LINE_STRIP_CONT_BF:
-    case TOP_TRIANGLE_FAN_NOSTIPPLE:
-    case TOP_TRI_STRIP_REVERSE:
-    case TOP_PATCHLIST_BASE:
-    case TOP_UNKNOWN:
-        SWR_INVALID("Unsupported topology: %d", mode);
-        primCount = 0;
-        break;
-    }
-
-    return primCount;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Converts a primitive count into the number of vertices needed to draw it with
-///        the given topology. Unsupported topologies report through SWR_INVALID and
-///        yield 0.
-/// @param mode - primitive topology for draw operation.
-/// @param numPrims - number of primitives for draw.
-uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
-{
-    const bool anyPrims  = (numPrims != 0);
-    uint32_t   vertCount = 0;
-
-    switch (mode)
-    {
-    // one vertex per primitive
-    case TOP_POINT_LIST:
-    case TOP_LINE_LOOP:
-        vertCount = numPrims;
-        break;
-
-    // independent lists: fixed number of vertices per primitive
-    case TOP_LINE_LIST:
-        vertCount = numPrims * 2;
-        break;
-    case TOP_TRIANGLE_LIST:
-    case TOP_RECT_LIST:
-        vertCount = numPrims * 3;
-        break;
-    case TOP_QUAD_LIST:
-    case TOP_LINE_LIST_ADJ:
-        vertCount = numPrims * 4;
-        break;
-    case TOP_TRI_LIST_ADJ:
-        vertCount = numPrims * 6;
-        break;
-
-    // strips and fans: the extra lead-in vertices only apply when there is
-    // at least one primitive
-    case TOP_LINE_STRIP:
-    case TOP_TRIANGLE_DISC:
-        vertCount = anyPrims ? numPrims + 1 : 0;
-        break;
-    case TOP_TRIANGLE_STRIP:
-    case TOP_TRIANGLE_FAN:
-        vertCount = anyPrims ? numPrims + 2 : 0;
-        break;
-    case TOP_LISTSTRIP_ADJ:
-        vertCount = anyPrims ? numPrims + 3 : 0;
-        break;
-    case TOP_QUAD_STRIP:
-        vertCount = anyPrims ? numPrims * 2 + 2 : 0;
-        break;
-    case TOP_TRI_STRIP_ADJ:
-        vertCount = anyPrims ? (numPrims + 2) * 2 : 0;
-        break;
-
-    // patch lists: control point count is the offset from TOP_PATCHLIST_BASE
-    case TOP_PATCHLIST_1:
-    case TOP_PATCHLIST_2:
-    case TOP_PATCHLIST_3:
-    case TOP_PATCHLIST_4:
-    case TOP_PATCHLIST_5:
-    case TOP_PATCHLIST_6:
-    case TOP_PATCHLIST_7:
-    case TOP_PATCHLIST_8:
-    case TOP_PATCHLIST_9:
-    case TOP_PATCHLIST_10:
-    case TOP_PATCHLIST_11:
-    case TOP_PATCHLIST_12:
-    case TOP_PATCHLIST_13:
-    case TOP_PATCHLIST_14:
-    case TOP_PATCHLIST_15:
-    case TOP_PATCHLIST_16:
-    case TOP_PATCHLIST_17:
-    case TOP_PATCHLIST_18:
-    case TOP_PATCHLIST_19:
-    case TOP_PATCHLIST_20:
-    case TOP_PATCHLIST_21:
-    case TOP_PATCHLIST_22:
-    case TOP_PATCHLIST_23:
-    case TOP_PATCHLIST_24:
-    case TOP_PATCHLIST_25:
-    case TOP_PATCHLIST_26:
-    case TOP_PATCHLIST_27:
-    case TOP_PATCHLIST_28:
-    case TOP_PATCHLIST_29:
-    case TOP_PATCHLIST_30:
-    case TOP_PATCHLIST_31:
-    case TOP_PATCHLIST_32:
-        vertCount = numPrims * (mode - TOP_PATCHLIST_BASE);
-        break;
-
-    // topologies this function does not handle
-    case TOP_POLYGON:
-    case TOP_POINT_LIST_BF:
-    case TOP_LINE_STRIP_CONT:
-    case TOP_LINE_STRIP_BF:
-    case TOP_LINE_STRIP_CONT_BF:
-    case TOP_TRIANGLE_FAN_NOSTIPPLE:
-    case TOP_TRI_STRIP_REVERSE:
-    case TOP_PATCHLIST_BASE:
-    case TOP_UNKNOWN:
-        SWR_INVALID("Unsupported topology: %d", mode);
-        vertCount = 0;
-        break;
-    }
-
-    return vertCount;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Returns how many vertices make up one primitive of the given topology.
-///        Unsupported topologies report through SWR_INVALID and yield 0.
-/// @param topology - topology
-/// @param includeAdjVerts - when true, adjacency topologies count their adjacent
-///        vertices as well (4 for line adjacency, 6 for triangle adjacency)
-uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
-{
-    // Adjacency topologies are answered up front when adjacent verts are wanted;
-    // everything else falls through to the base counts below.
-    if (includeAdjVerts)
-    {
-        switch (topology)
-        {
-        case TOP_LINE_LIST_ADJ:
-        case TOP_LISTSTRIP_ADJ:
-            return 4;
-        case TOP_TRI_LIST_ADJ:
-        case TOP_TRI_STRIP_ADJ:
-            return 6;
-        default:
-            break;
-        }
-    }
-
-    switch (topology)
-    {
-    case TOP_POINT_LIST:
-    case TOP_POINT_LIST_BF:
-        return 1;
-
-    case TOP_LINE_LIST:
-    case TOP_LINE_STRIP:
-    case TOP_LINE_LIST_ADJ:
-    case TOP_LINE_LOOP:
-    case TOP_LINE_STRIP_CONT:
-    case TOP_LINE_STRIP_BF:
-    case TOP_LISTSTRIP_ADJ:
-        return 2;
-
-    case TOP_TRIANGLE_LIST:
-    case TOP_TRIANGLE_STRIP:
-    case TOP_TRIANGLE_FAN:
-    case TOP_TRI_LIST_ADJ:
-    case TOP_TRI_STRIP_ADJ:
-    case TOP_TRI_STRIP_REVERSE:
-    case TOP_RECT_LIST:
-        return 3;
-
-    case TOP_QUAD_LIST:
-    case TOP_QUAD_STRIP:
-        return 4;
-
-    // patch lists: control point count is the offset from TOP_PATCHLIST_BASE
-    case TOP_PATCHLIST_1:
-    case TOP_PATCHLIST_2:
-    case TOP_PATCHLIST_3:
-    case TOP_PATCHLIST_4:
-    case TOP_PATCHLIST_5:
-    case TOP_PATCHLIST_6:
-    case TOP_PATCHLIST_7:
-    case TOP_PATCHLIST_8:
-    case TOP_PATCHLIST_9:
-    case TOP_PATCHLIST_10:
-    case TOP_PATCHLIST_11:
-    case TOP_PATCHLIST_12:
-    case TOP_PATCHLIST_13:
-    case TOP_PATCHLIST_14:
-    case TOP_PATCHLIST_15:
-    case TOP_PATCHLIST_16:
-    case TOP_PATCHLIST_17:
-    case TOP_PATCHLIST_18:
-    case TOP_PATCHLIST_19:
-    case TOP_PATCHLIST_20:
-    case TOP_PATCHLIST_21:
-    case TOP_PATCHLIST_22:
-    case TOP_PATCHLIST_23:
-    case TOP_PATCHLIST_24:
-    case TOP_PATCHLIST_25:
-    case TOP_PATCHLIST_26:
-    case TOP_PATCHLIST_27:
-    case TOP_PATCHLIST_28:
-    case TOP_PATCHLIST_29:
-    case TOP_PATCHLIST_30:
-    case TOP_PATCHLIST_31:
-    case TOP_PATCHLIST_32:
-        return topology - TOP_PATCHLIST_BASE;
-
-    default:
-        break;
-    }
-
-    SWR_INVALID("Unsupported topology: %d", topology);
-    return 0;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Generate mask from remaining work.
-/// @param numItemsRemaining - Number of items still to be worked on by a
-///        SIMD; values at or above KNOB_SIMD_WIDTH are clamped to it.
-/// @return _simd_vmask_ps() of a bitmask with the low numActive bits set,
-///         cast to an integer SIMD value.
-static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
-{
-    uint32_t numActive =
-        (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
-    // One bit per active item, packed from bit 0 upwards; 0 items -> empty mask.
-    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
-    return _simd_castps_si(_simd_vmask_ps(mask));
-}
-
-/// @brief SIMD16 counterpart of GenerateMask: generate mask from remaining work.
-/// @param numItemsRemaining - Number of items still to be worked on; values
-///        at or above KNOB_SIMD16_WIDTH are clamped to it.
-/// @return _simd16_vmask_ps() of a bitmask with the low numActive bits set,
-///         cast to an integer SIMD16 value.
-static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
-{
-    uint32_t numActive =
-        (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
-    // One bit per active item, packed from bit 0 upwards; 0 items -> empty mask.
-    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
-    return _simd16_castps_si(_simd16_vmask_ps(mask));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief StreamOut - Streams vertex data out to SO buffers.
-///        Generally, we are only streaming out a SIMDs worth of triangles.
-/// @param pDC - pointer to draw context.
-/// @param pa - primitive assembler holding the prims to stream out; the
-///        number of prims processed is pa.NumPrims().
-/// @param workerId - thread's worker id. Every thread has a unique id.
-/// @param pPrimData - buffer that receives the assembled attributes of one
-///        prim before the streamout function is called on it.
-/// @param streamIndex - selects the stream mask, attribute offset, streamout
-///        function and per-stream stats used below.
-static void StreamOut(
-    DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex)
-{
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);
-
-    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
-
-    const API_STATE&           state   = GetApiState(pDC);
-    const SWR_STREAMOUT_STATE& soState = state.soState;
-
-    // Adjacent verts are never streamed out (includeAdjVerts == false).
-    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
-
-    // The pPrimData buffer is sparse in that each vertex is laid out with room for all
-    // SWR_VTX_NUM_SLOTS attributes (4 floats each); this is that per-vertex stride in dwords.
-    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
-
-    SWR_STREAMOUT_CONTEXT soContext = {0};
-
-    // Setup buffer state pointers.
-    for (uint32_t i = 0; i < 4; ++i)
-    {
-        soContext.pBuffer[i] = &state.soBuffer[i];
-    }
-
-    uint32_t numPrims = pa.NumPrims();
-
-    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
-    {
-        unsigned long slot = 0;
-        uint64_t soMask = soState.streamMasks[streamIndex];
-
-        // Write all entries into primitive data buffer for SOS.
-        // Walks the set bits of soMask from lowest to highest; each visited bit is cleared
-        // at the bottom of the loop.
-        while (_BitScanForward64(&slot, soMask))
-        {
-            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
-            uint32_t    paSlot = slot + soState.vertexAttribOffset[streamIndex];
-            pa.AssembleSingle(paSlot, primIndex, attrib);
-
-            // Attribute offset is relative offset from start of vertex, in dwords.
-            // The PA is read at (slot + vertexAttribOffset[streamIndex]) while prim data is
-            // written at slot itself.
-            // NOTE(review): this comment used to say the write index is (slot - 1); the code
-            // below uses slot unchanged, so that remark did not match the code.
-            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
-
-            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
-            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
-            {
-                uint32_t* pPrimDataAttrib =
-                    pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
-
-                // _mm_store_ps is the aligned store; presumably pPrimData is 16-byte
-                // aligned - confirm at the allocation site.
-                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
-            }
-
-            soMask &= ~(uint64_t(1) << slot);
-        }
-
-        // Update pPrimData pointer
-        soContext.pPrimData = pPrimData;
-
-        // Call SOS
-        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
-                   "Trying to execute uninitialized streamout jit function.");
-        state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext);
-    }
-
-    // Update SO write offset. The driver provides memory for the update.
-    for (uint32_t i = 0; i < 4; ++i)
-    {
-        if (state.soBuffer[i].pWriteOffset)
-        {
-            // nullTileAccessed is filled in by the translate callback but not read here.
-            bool  nullTileAccessed = false;
-            void* pWriteOffset     = pDC->pContext->pfnTranslateGfxptrForWrite(
-                GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed, pWorkerData);
-            // streamOffset is scaled by sizeof(uint32_t) before being published.
-            *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
-        }
-
-        if (state.soBuffer[i].soWriteEnable)
-        {
-            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
-            pDC->dynState.SoWriteOffsetDirty[i] = true;
-        }
-    }
-
-    pDC->dynState.soPrims += soContext.numPrimsWritten;
-
-    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
-    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
-
-    RDTSC_END(pDC->pContext->pBucketMgr, FEStreamout, 1);
-}
-
-#if USE_SIMD16_FRONTEND
-//////////////////////////////////////////////////////////////////////////
-/// Is value an even number (a multiple of two)
-///
-/// @param value - value to test; T needs to support operator& with an int
-/// @return true when the lowest bit of value is clear
-template <typename T>
-INLINE static bool IsEven(T value)
-{
-    return (value & 1) == 0;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Round up value to an even number (a multiple of two)
-///
-/// @param value - value to round
-/// @return value if it is already even, otherwise value + 1 (computed as
-///         (value + 1) with the lowest bit cleared)
-template <typename T>
-INLINE static T RoundUpEven(T value)
-{
-    return (value + 1) & ~1;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Round down value to an even number (a multiple of two)
-///
-/// @param value - value to round
-/// @return value with its lowest bit cleared
-template <typename T>
-INLINE static T RoundDownEven(T value)
-{
-    return value & ~1;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
-///
-/// vertexCount is in terms of the source simdvertexes and must be even
-/// (NOTE(review): the loop below also tolerates an odd count - the last
-/// simd16vertex then keeps the zero fill in its second half)
-///
-/// attribCount will limit the vector copies to those attribs specified
-///
-/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
-///
-/// @param vertex_simd16 - destination; element (i >> 1) receives source
-///        vertexes i and i + 1
-/// @param vertex - source simdvertex array
-/// @param vertexCount - number of source simdvertexes
-/// @param attribCount - number of attribs copied per vertex; asserted to be
-///        at most SWR_VTX_NUM_SLOTS
-void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex*     vertex_simd16,
-                                           const simdvertex* vertex,
-                                           uint32_t          vertexCount,
-                                           uint32_t          attribCount)
-{
-    SWR_ASSERT(vertex);
-    SWR_ASSERT(vertex_simd16);
-    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);
-
-    // Staging copy: a pair is fully built here before being written out.
-    simd16vertex temp;
-
-    for (uint32_t i = 0; i < vertexCount; i += 2)
-    {
-        for (uint32_t j = 0; j < attribCount; j += 1)
-        {
-            // k walks the 4 components of attrib j.
-            for (uint32_t k = 0; k < 4; k += 1)
-            {
-                // First vertex of the pair goes to insert position 0 of a zeroed simd16.
-                temp.attrib[j][k] =
-                    _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
-
-                // Second vertex (if there is one) goes to insert position 1.
-                if ((i + 1) < vertexCount)
-                {
-                    temp.attrib[j][k] =
-                        _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
-                }
-            }
-        }
-
-        for (uint32_t j = 0; j < attribCount; j += 1)
-        {
-            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
-        }
-    }
-}
-
-#endif
-//////////////////////////////////////////////////////////////////////////
-/// @brief Computes number of invocations. The current index represents
-///        the start of the SIMD. The max index represents how many work
-///        items there are in total. If there is less than a SIMD's width of
-///        work left then return the remaining amount of work.
-/// @param curIndex - The start index for the SIMD.
-/// @param maxIndex - The last index for all work items.
-/// @return min(maxIndex - curIndex, SIMD width), where the width is
-///         KNOB_SIMD16_WIDTH when USE_SIMD16_FRONTEND is set and
-///         KNOB_SIMD_WIDTH otherwise.
-static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex)
-{
-    // Unsigned subtraction: assumes curIndex <= maxIndex - TODO confirm at call sites.
-    uint32_t remainder = (maxIndex - curIndex);
-#if USE_SIMD16_FRONTEND
-    return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
-#else
-    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
-#endif
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
-///        The geometry shader will loop over each active streamout buffer, assembling
-///        primitives for the downstream stages. When multistream output is enabled,
-///        the generated stream ID buffer from the GS needs to be converted to a cut
-///        buffer for the primitive assembler.
-///
-///        The input is read as 2-bit stream IDs, four per byte, lowest bits
-///        first. The output holds one bit per vert, eight per byte, lowest
-///        bit first; a bit is set when that vert's stream ID differs from
-///        the requested stream.
-/// @param stream - stream id to generate the cut buffer for; asserted to be
-///        below MAX_SO_STREAMS
-/// @param pStreamIdBase - pointer to the stream ID buffer; two bytes are read
-///        for every output byte
-/// @param numEmittedVerts - Number of total verts emitted by the GS
-/// @param pCutBuffer - output buffer to write cuts to; receives
-///        AlignUp(numEmittedVerts, 8) / 8 bytes
-void ProcessStreamIdBuffer(uint32_t stream,
-                           uint8_t* pStreamIdBase,
-                           uint32_t numEmittedVerts,
-                           uint8_t* pCutBuffer)
-{
-    SWR_ASSERT(stream < MAX_SO_STREAMS);
-
-    uint32_t numOutputBytes = AlignUp(numEmittedVerts, 8) / 8;
-
-    for (uint32_t b = 0; b < numOutputBytes; ++b)
-    {
-        // First input byte of the pair -> output bits 0..3.
-        uint8_t curInputByte = pStreamIdBase[2 * b];
-        uint8_t outByte      = 0;
-        for (uint32_t i = 0; i < 4; ++i)
-        {
-            if ((curInputByte & 0x3) != stream)
-            {
-                outByte |= (1 << i);
-            }
-            curInputByte >>= 2;
-        }
-
-        // Second input byte of the pair -> output bits 4..7.
-        curInputByte = pStreamIdBase[2 * b + 1];
-        for (uint32_t i = 0; i < 4; ++i)
-        {
-            if ((curInputByte & 0x3) != stream)
-            {
-                outByte |= (1 << (i + 4));
-            }
-            curInputByte >>= 2;
-        }
-
-        *pCutBuffer++ = outByte;
-    }
-}
-
-// Buffers that are allocated if GS is enabled (filled in by AllocateGsBuffers)
-struct GsBuffers
-{
-    // Storage for the GS vertex inputs; GeometryShaderStage uses it as a simdvector array.
-    uint8_t* pGsIn;
-    // One GS output buffer per SIMD lane; GeometryShaderStage indexes it by input prim.
-    uint8_t* pGsOut[KNOB_SIMD_WIDTH];
-    // Destination of TransposeSOAtoAOS, then handed to the GS output primitive assembler.
-    uint8_t* pGsTransposed;
-    // Scratch cut buffer written by ProcessStreamIdBuffer; nullptr when the GS is single stream.
-    void*    pStreamCutBuffer;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
-///        NOTE(review): as coded, pSrc is read with a per-vertex stride of
-///        numAttribs * 4 floats and pDst is written as SIMD-wide x/y/z/w
-///        vectors per attribute, so the SOA/AOS wording here may be
-///        inverted - confirm against the GS output layout.
-/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
-/// assembler
-/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
-/// @param numVerts - Number of vertices outputted by the GS
-/// @param numAttribs - Number of attributes per vertex
-template <typename SIMD_T, uint32_t SimdWidth>
-void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
-{
-    // Bytes per source vertex (4 floats per attribute) and bytes per destination
-    // SIMD batch (4 SIMD-wide floats per attribute).
-    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
-    uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;
-
-    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
-
-    // Lane i gathers from source vertex i of the current batch.
-    // Offsets are in bytes; presumably mask_i32gather_ps applies them unscaled - confirm.
-    for (uint32_t i = 0; i < SimdWidth; ++i)
-    {
-        gatherOffsets[i] = srcVertexStride * i;
-    }
-    auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);
-
-    uint32_t numSimd        = AlignUp(numVerts, SimdWidth) / SimdWidth;
-    uint32_t remainingVerts = numVerts;
-
-    for (uint32_t s = 0; s < numSimd; ++s)
-    {
-        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
-        uint8_t* pDstBase = pDst + s * dstVertexStride;
-
-        // Compute mask to prevent src overflow
-        uint32_t mask = std::min(remainingVerts, SimdWidth);
-        mask          = GenMask(mask);
-        auto vMask    = SIMD_T::vmask_ps(mask);
-        auto viMask   = SIMD_T::castps_si(vMask);
-
-        for (uint32_t a = 0; a < numAttribs; ++a)
-        {
-            // Gather the x, y, z and w components of attribute a across the batch.
-            auto attribGatherX = SIMD_T::mask_i32gather_ps(
-                SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
-            auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
-                                                           (const float*)(pSrcBase + sizeof(float)),
-                                                           vGatherOffsets,
-                                                           vMask);
-            auto attribGatherZ =
-                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
-                                          (const float*)(pSrcBase + sizeof(float) * 2),
-                                          vGatherOffsets,
-                                          vMask);
-            auto attribGatherW =
-                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
-                                          (const float*)(pSrcBase + sizeof(float) * 3),
-                                          vGatherOffsets,
-                                          vMask);
-
-            // Store the four component vectors back to back, using the same lane mask.
-            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
-            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
-            SIMD_T::maskstore_ps(
-                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
-            SIMD_T::maskstore_ps(
-                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);
-
-            // Advance to the next attribute in both layouts.
-            pSrcBase += sizeof(float) * 4;
-            pDstBase += sizeof(Float<SIMD_T>) * 4;
-        }
-        // Wraps around (unsigned) after a partial final batch, but is not read again then.
-        remainingVerts -= SimdWidth;
-    }
-}
-
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Implements GS stage.
-/// @param pDC - pointer to draw context.
-/// @param workerId - thread's worker id. Every thread has a unique id.
-/// @param pa - The primitive assembly object.
-/// @param pGsBuffers - GS input/output/transpose/cut buffers (see GsBuffers)
-/// @param pSoPrimData - prim data buffer passed through to StreamOut
-/// @param numPrims_simd8 - (USE_SIMD16_FRONTEND only) number of input prims
-///        to run; replaces pa.NumPrims() in that configuration
-/// @param primID - primitive IDs for the input prims, one per SIMD lane
-template <typename HasStreamOutT, typename HasRastT>
-static void GeometryShaderStage(DRAW_CONTEXT* pDC,
-                                uint32_t      workerId,
-                                PA_STATE&     pa,
-                                GsBuffers*    pGsBuffers,
-                                uint32_t*     pSoPrimData,
-#if USE_SIMD16_FRONTEND
-                                uint32_t numPrims_simd8,
-#endif
-                                simdscalari const& primID)
-{
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEGeometryShader, pDC->drawId);
-
-    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
-
-    const API_STATE&    state  = GetApiState(pDC);
-    const SWR_GS_STATE* pState = &state.gsState;
-    SWR_GS_CONTEXT      gsContext;
-
-    // Stand-in cut buffer used below when the GS has no control data (controlDataSize == 0).
-    static uint8_t sNullBuffer[128] = {0};
-
-    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
-    {
-        gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
-    }
-    gsContext.pVerts      = (simdvector*)pGsBuffers->pGsIn;
-    gsContext.PrimitiveID = primID;
-
-    // Adjacent verts are included: the GS sees them as part of its input prim.
-    uint32_t   numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
-    simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
-
-    // assemble all attributes for the input primitive
-    gsContext.inputVertStride = pState->inputVertStride;
-    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
-    {
-        uint32_t attribOffset = slot + pState->vertexAttribOffset;
-        pa.Assemble(attribOffset, attrib);
-
-        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
-        {
-            gsContext.pVerts[attribOffset + pState->inputVertStride * i] = attrib[i];
-        }
-    }
-
-    // record valid prims from the frontend to avoid over binning the newly generated
-    // prims from the GS
-#if USE_SIMD16_FRONTEND
-    uint32_t numInputPrims = numPrims_simd8;
-#else
-    uint32_t numInputPrims = pa.NumPrims();
-#endif
-
-    // Run the shader once per GS instance; each instance writes allocationSize bytes
-    // further into every lane's output buffer.
-    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
-    {
-        gsContext.InstanceID = instance;
-        gsContext.mask       = GenerateMask(numInputPrims);
-
-        // execute the geometry shader
-        state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
-        AR_EVENT(GSStats((HANDLE)&gsContext.stats));
-
-        for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
-        {
-            gsContext.pStreams[i] += pState->allocationSize;
-        }
-    }
-
-    // set up new binner and state for the GS output topology
-    // NOTE(review): pfnClipFunc stays nullptr when HasRastT is false or the output
-    // topology is not one of the four handled cases; it is only called under
-    // HasRastT::value further down.
-#if USE_SIMD16_FRONTEND
-    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
-    if (HasRastT::value)
-    {
-        switch (pState->outputTopology)
-        {
-        case TOP_RECT_LIST:
-            pfnClipFunc = ClipRectangles_simd16;
-            break;
-        case TOP_TRIANGLE_STRIP:
-            pfnClipFunc = ClipTriangles_simd16;
-            break;
-        case TOP_LINE_STRIP:
-            pfnClipFunc = ClipLines_simd16;
-            break;
-        case TOP_POINT_LIST:
-            pfnClipFunc = ClipPoints_simd16;
-            break;
-        default:
-            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
-        }
-    }
-
-#else
-    PFN_PROCESS_PRIMS pfnClipFunc   = nullptr;
-    if (HasRastT::value)
-    {
-        switch (pState->outputTopology)
-        {
-        case TOP_RECT_LIST:
-            pfnClipFunc = ClipRectangles;
-            break;
-        case TOP_TRIANGLE_STRIP:
-            pfnClipFunc = ClipTriangles;
-            break;
-        case TOP_LINE_STRIP:
-            pfnClipFunc = ClipLines;
-            break;
-        case TOP_POINT_LIST:
-            pfnClipFunc = ClipPoints;
-            break;
-        default:
-            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
-        }
-    }
-
-#endif
-    // foreach input prim:
-    // - setup a new PA based on the emitted verts for that prim
-    // - loop over the new verts, calling PA to assemble each prim
-    uint32_t* pPrimitiveId = (uint32_t*)&primID;
-
-    uint32_t totalPrimsGenerated = 0;
-    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
-    {
-        uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];
-
-        // Vertex count is either emitted by shader or static
-        // NOTE(review): the count is read once per input prim, from the start of the
-        // lane's buffer, and then reused for every instance in the loop below.
-        uint32_t vertexCount = 0;
-        if (pState->staticVertexCount)
-        {
-            vertexCount = pState->staticVertexCount;
-        }
-        else
-        {
-            // If emitted in shader, it should be the stored in the first dword of the output buffer
-            vertexCount = *(uint32_t*)pInstanceBase;
-        }
-
-        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
-        {
-            uint32_t numEmittedVerts = vertexCount;
-            if (numEmittedVerts == 0)
-            {
-                continue;
-            }
-
-            // Locate this instance's control data and vertex data inside the lane's buffer.
-            uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
-            uint8_t* pCutBase =
-                pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
-            uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;
-
-#if USE_SIMD16_FRONTEND
-            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
-                                                          pVertexBaseAOS,
-                                                          vertexCount,
-                                                          pState->outputVertexSize);
-#else
-            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
-                                                        pVertexBaseAOS,
-                                                        vertexCount,
-                                                        pState->outputVertexSize);
-#endif
-
-            uint32_t numAttribs = state.feNumAttributes;
-
-            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
-            {
-                bool     processCutVerts = false;
-                uint8_t* pCutBuffer      = pCutBase;
-
-                // assign default stream ID, only relevant when GS is outputting a single stream
-                uint32_t streamID = 0;
-                if (pState->isSingleStream)
-                {
-                    // Single stream: only the iteration matching singleStreamID does any work,
-                    // and the GS control data is used directly as the cut buffer.
-                    processCutVerts = true;
-                    streamID        = pState->singleStreamID;
-                    if (streamID != stream)
-                        continue;
-                }
-                else
-                {
-                    // early exit if this stream is not enabled for streamout
-                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
-                    {
-                        continue;
-                    }
-
-                    // multi-stream output, need to translate StreamID buffer to a cut buffer
-                    ProcessStreamIdBuffer(
-                        stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
-                    pCutBuffer      = (uint8_t*)pGsBuffers->pStreamCutBuffer;
-                    processCutVerts = false;
-                }
-
-#if USE_SIMD16_FRONTEND
-                PA_STATE_CUT gsPa(pDC,
-                                  (uint8_t*)pGsBuffers->pGsTransposed,
-                                  numEmittedVerts,
-                                  pState->outputVertexSize,
-                                  reinterpret_cast<simd16mask*>(pCutBuffer),
-                                  numEmittedVerts,
-                                  numAttribs,
-                                  pState->outputTopology,
-                                  processCutVerts,
-                                  pa.numVertsPerPrim);
-
-#else
-                PA_STATE_CUT gsPa(pDC,
-                                  (uint8_t*)pGsBuffers->pGsTransposed,
-                                  numEmittedVerts,
-                                  pState->outputVertexSize,
-                                  pCutBuffer,
-                                  numEmittedVerts,
-                                  numAttribs,
-                                  pState->outputTopology,
-                                  processCutVerts,
-                                  pa.numVertsPerPrim);
-
-#endif
-                while (gsPa.GetNextStreamOutput())
-                {
-                    do
-                    {
-#if USE_SIMD16_FRONTEND
-                        simd16vector attrib_simd16[3];
-
-                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);
-
-#else
-                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
-
-#endif
-                        if (assemble)
-                        {
-                            totalPrimsGenerated += gsPa.NumPrims();
-
-                            if (HasStreamOutT::value)
-                            {
-#if ENABLE_AVX512_SIMD16
-                                gsPa.useAlternateOffset = false;
-#endif
-                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
-                            }
-
-                            // Only the stream selected by streamToRasterizer goes on to clipping.
-                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
-                            {
-#if USE_SIMD16_FRONTEND
-                                simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
-
-                                // Gather data from the SGV slot (VERTEX_SGV_SLOT) if provided.
-                                simd16scalari vViewportIdx = SIMD16::setzero_si();
-                                simd16scalari vRtIdx       = SIMD16::setzero_si();
-                                SIMD16::Vec4  svgAttrib[4];
-
-                                if (state.backendState.readViewportArrayIndex ||
-                                    state.backendState.readRenderTargetArrayIndex)
-                                {
-                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
-                                }
-
-                                if (state.backendState.readViewportArrayIndex)
-                                {
-                                    vViewportIdx =
-                                        SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
-                                    gsPa.viewportArrayActive = true;
-                                }
-                                if (state.backendState.readRenderTargetArrayIndex)
-                                {
-                                    vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
-                                    gsPa.rtArrayActive = true;
-                                }
-
-                                {
-                                    // OOB VPAI indices => forced to zero.
-                                    // (Applied unconditionally in this SIMD16 path.)
-                                    vViewportIdx =
-                                        SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
-                                    simd16scalari vNumViewports =
-                                        SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                    simd16scalari vClearMask =
-                                        SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
-                                    vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
-
-                                    gsPa.useAlternateOffset = false;
-                                    pfnClipFunc(pDC,
-                                                gsPa,
-                                                workerId,
-                                                attrib_simd16,
-                                                GenMask(gsPa.NumPrims()),
-                                                vPrimId,
-                                                vViewportIdx,
-                                                vRtIdx);
-                                }
-#else
-                                simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
-
-                                // Gather data from the SGV slot (VERTEX_SGV_SLOT) if provided.
-                                simdscalari vViewportIdx = SIMD::setzero_si();
-                                simdscalari vRtIdx       = SIMD::setzero_si();
-                                SIMD::Vec4  svgAttrib[4];
-
-                                if (state.backendState.readViewportArrayIndex ||
-                                    state.backendState.readRenderTargetArrayIndex)
-                                {
-                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
-                                }
-
-                                if (state.backendState.readViewportArrayIndex)
-                                {
-                                    vViewportIdx =
-                                        SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
-
-                                    // OOB VPAI indices => forced to zero.
-                                    vViewportIdx =
-                                        SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
-                                    simdscalari vNumViewports =
-                                        SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                    simdscalari vClearMask =
-                                        SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
-                                    vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
-                                    gsPa.viewportArrayActive = true;
-                                }
-                                if (state.backendState.readRenderTargetArrayIndex)
-                                {
-                                    vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
-                                    gsPa.rtArrayActive = true;
-                                }
-
-                                pfnClipFunc(pDC,
-                                            gsPa,
-                                            workerId,
-                                            attrib,
-                                            GenMask(gsPa.NumPrims()),
-                                            vPrimId,
-                                            vViewportIdx,
-                                            vRtIdx);
-#endif
-                            }
-                        }
-                    } while (gsPa.NextPrim());
-                }
-            }
-        }
-    }
-
-    // update GS pipeline stats
-    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
-    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
-    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
-    RDTSC_END(pDC->pContext->pBucketMgr, FEGeometryShader, 1);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Allocate GS buffers
-///        All allocations come from the draw context's arena with 32-byte
-///        alignment.
-/// @param pDC - pointer to draw context.
-/// @param state - API state
-/// @param vertsPerPrim - number of verts per input prim; scales the size of
-///        the GS input buffer
-/// @param pGsBuffers - receives the allocated buffer pointers
-template <typename SIMD_T, uint32_t SIMD_WIDTH>
-static INLINE void AllocateGsBuffers(DRAW_CONTEXT*    pDC,
-                                     const API_STATE& state,
-                                     uint32_t         vertsPerPrim,
-                                     GsBuffers*       pGsBuffers)
-{
-    auto pArena = pDC->pArena;
-    SWR_ASSERT(pArena != nullptr);
-    SWR_ASSERT(state.gsState.gsEnable);
-
-    const SWR_GS_STATE& gsState = state.gsState;
-
-    // Allocate storage for vertex inputs
-    uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
-    pGsBuffers->pGsIn           = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);
-
-    // Allocate arena space to hold GS output verts
-    // (one allocationSize-sized region per GS instance, for each SIMD lane)
-    const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;
-
-    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
-    {
-        pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
-    }
-
-    // Allocate storage for transposed GS output
-    // (sized for maxNumVerts rounded up to whole SIMD batches)
-    uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
-    uint32_t transposedBufferSize =
-        numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
-    pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);
-
-    // Allocate storage to hold temporary stream->cut buffer, if necessary
-    if (state.gsState.isSingleStream)
-    {
-        pGsBuffers->pStreamCutBuffer = nullptr;
-    }
-    else
-    {
-        pGsBuffers->pStreamCutBuffer =
-            (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Contains all data generated by the HS and passed to the
-/// tessellator and DS.
-struct TessellationThreadLocalData
-{
-    SWR_HS_CONTEXT hsContext;
-    // Tessellator context memory and its size; both are handed to TSInitCtx
-    // by TessellationStages.
-    void*          pTxCtx;
-    size_t         tsCtxSize;
-
-    // Presumably the HS output buffer and its allocated size - confirm at the use sites.
-    uint8_t*    pHSOutput;
-    size_t      hsOutputAllocSize;
-
-    // Presumably the DS output buffer and its allocated size - confirm at the use sites.
-    simdscalar* pDSOutput;
-    size_t      dsOutputAllocSize;
-};
-
-THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Allocate tessellation data for this worker thread.
-INLINE
-static void AllocateTessellationData(SWR_CONTEXT* pContext)
-{
-    /// @TODO - Don't use thread local storage.  Use Worker local storage instead.
-    if (gt_pTessellationThreadData == nullptr)
-    {
-        gt_pTessellationThreadData =
-            (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
-        memset((void*)gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Implements Tessellation Stages.
-/// @param pDC - pointer to draw context.
-/// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param pa - The primitive assembly object.
-/// @param pGsOut - output stream for GS
-template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT>
-static void TessellationStages(DRAW_CONTEXT* pDC,
-                               uint32_t      workerId,
-                               PA_STATE&     pa,
-                               GsBuffers*    pGsBuffers,
-                               uint32_t*     pSoPrimData,
-#if USE_SIMD16_FRONTEND
-                               uint32_t numPrims_simd8,
-#endif
-                               simdscalari const& primID)
-{
-    const API_STATE&    state   = GetApiState(pDC);
-    const SWR_TS_STATE& tsState = state.tsState;
-    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
-
-    SWR_ASSERT(gt_pTessellationThreadData);
-
-    HANDLE tsCtx = TSInitCtx(tsState.domain,
-                             tsState.partitioning,
-                             tsState.tsOutputTopology,
-                             gt_pTessellationThreadData->pTxCtx,
-                             gt_pTessellationThreadData->tsCtxSize);
-    if (tsCtx == nullptr)
-    {
-        gt_pTessellationThreadData->pTxCtx =
-            AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
-        tsCtx = TSInitCtx(tsState.domain,
-                          tsState.partitioning,
-                          tsState.tsOutputTopology,
-                          gt_pTessellationThreadData->pTxCtx,
-                          gt_pTessellationThreadData->tsCtxSize);
-    }
-    SWR_ASSERT(tsCtx);
-
-#if USE_SIMD16_FRONTEND
-    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
-    if (HasRastT::value)
-    {
-        switch (tsState.postDSTopology)
-        {
-        case TOP_TRIANGLE_LIST:
-            pfnClipFunc = ClipTriangles_simd16;
-            break;
-        case TOP_LINE_LIST:
-            pfnClipFunc = ClipLines_simd16;
-            break;
-        case TOP_POINT_LIST:
-            pfnClipFunc = ClipPoints_simd16;
-            break;
-        default:
-            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
-        }
-    }
-
-#else
-    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
-    if (HasRastT::value)
-    {
-        switch (tsState.postDSTopology)
-        {
-        case TOP_TRIANGLE_LIST:
-            pfnClipFunc = ClipTriangles;
-            break;
-        case TOP_LINE_LIST:
-            pfnClipFunc = ClipLines;
-            break;
-        case TOP_POINT_LIST:
-            pfnClipFunc = ClipPoints;
-            break;
-        default:
-            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
-        }
-    }
-
-#endif
-    SWR_HS_CONTEXT& hsContext       = gt_pTessellationThreadData->hsContext;
-    hsContext.PrimitiveID           = primID;
-    hsContext.outputSize = tsState.hsAllocationSize;
-
-    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
-    // Max storage for one attribute for an entire simdprimitive
-    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
-
-    // Assemble position separately
-    // TESS_TODO: this could be avoided - fix it
-    pa.Assemble(VERTEX_POSITION_SLOT, simdattrib);
-    for (uint32_t i = 0; i < numVertsPerPrim; ++i) {
-        hsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = simdattrib[i];
-    }
-
-    // assemble all attributes for the input primitives
-    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
-    {
-        uint32_t attribSlot = tsState.srcVertexAttribOffset + slot;
-        pa.Assemble(attribSlot, simdattrib);
-
-        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
-        {
-            hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i];
-        }
-    }
-
-    // Allocate HS output storage
-    uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize;
-
-    if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize)
-    {
-        AlignedFree(gt_pTessellationThreadData->pHSOutput);
-        gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64);
-        gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize;
-    }
-
-    hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput;
-
-#if defined(_DEBUG)
-    //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
-#endif
-    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
-
-#if USE_SIMD16_FRONTEND
-    uint32_t numPrims = numPrims_simd8;
-#else
-    uint32_t numPrims = pa.NumPrims();
-#endif
-    hsContext.mask = GenerateMask(numPrims);
-
-    // Run the HS
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEHullShader, pDC->drawId);
-    state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext);
-    RDTSC_END(pDC->pContext->pBucketMgr, FEHullShader, 0);
-
-    UPDATE_STAT_FE(HsInvocations, numPrims);
-    AR_EVENT(HSStats((HANDLE)&hsContext.stats));
-
-    const uint32_t* pPrimId = (const uint32_t*)&primID;
-
-    for (uint32_t p = 0; p < numPrims; ++p)
-    {
-        ScalarPatch* pCPout = (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p);
-
-        SWR_TESSELLATION_FACTORS tessFactors;
-        tessFactors                    = hsContext.pCPout[p].tessFactors;
-
-          // Run Tessellator
-        SWR_TS_TESSELLATED_DATA tsData = {0};
-        RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId);
-        TSTessellate(tsCtx, tessFactors, tsData);
-        AR_EVENT(TessPrimCount(1));
-        RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0);
-
-        if (tsData.NumPrimitives == 0)
-        {
-            continue;
-        }
-        SWR_ASSERT(tsData.NumDomainPoints);
-
-        // Allocate DS Output memory
-        uint32_t requiredDSVectorInvocations =
-            AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
-#if USE_SIMD16_FRONTEND
-        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) *
-                                   tsState.dsAllocationSize; // simd8 -> simd16, padding
-#else
-        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
-        size_t requiredAllocSize       = sizeof(simdvector) * requiredDSOutputVectors;
-#endif
-        if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
-        {
-            AlignedFree(gt_pTessellationThreadData->pDSOutput);
-            gt_pTessellationThreadData->pDSOutput =
-                (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
-            gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
-        }
-        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
-        SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);
-
-#if defined(_DEBUG)
-        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
-#endif
-
-        // Run Domain Shader
-        SWR_DS_CONTEXT dsContext;
-        dsContext.PrimitiveID           = pPrimId[p];
-        dsContext.pCpIn                 = pCPout;
-        dsContext.pDomainU              = (simdscalar*)tsData.pDomainPointsU;
-        dsContext.pDomainV              = (simdscalar*)tsData.pDomainPointsV;
-        dsContext.pOutputData           = gt_pTessellationThreadData->pDSOutput;
-        dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
-#if USE_SIMD16_FRONTEND
-        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
-#else
-        dsContext.vectorStride         = requiredDSVectorInvocations;
-#endif
-
-        uint32_t dsInvocations = 0;
-
-        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations;
-             ++dsContext.vectorOffset)
-        {
-            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
-
-            RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEDomainShader, pDC->drawId);
-            state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext);
-            RDTSC_END(pDC->pContext->pBucketMgr, FEDomainShader, 0);
-
-            AR_EVENT(DSStats((HANDLE)&dsContext.stats));
-
-            dsInvocations += KNOB_SIMD_WIDTH;
-        }
-        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
-
-#if USE_SIMD16_FRONTEND
-        SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16
-
-#endif
-        PA_TESS tessPa(
-            pDC,
-#if USE_SIMD16_FRONTEND
-            reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16
-            dsContext.vectorStride / 2,                                   // simd8 -> simd16
-#else
-            dsContext.pOutputData,
-            dsContext.vectorStride,
-#endif
-            SWR_VTX_NUM_SLOTS,
-            tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
-            tsData.ppIndices,
-            tsData.NumPrimitives,
-            tsState.postDSTopology,
-            NumVertsPerPrim(tsState.postDSTopology, false));
-
-        while (tessPa.HasWork())
-        {
-#if USE_SIMD16_FRONTEND
-            const uint32_t numPrims    = tessPa.NumPrims();
-            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
-            const uint32_t numPrims_hi =
-                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
-
-            const simd16scalari primID    = _simd16_set1_epi32(dsContext.PrimitiveID);
-            const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
-            const simdscalari   primID_hi = _simd16_extract_si(primID, 1);
-
-#endif
-            if (HasGeometryShaderT::value)
-            {
-#if USE_SIMD16_FRONTEND
-                tessPa.useAlternateOffset = false;
-                GeometryShaderStage<HasStreamOutT, HasRastT>(
-                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);
-
-                if (numPrims_hi)
-                {
-                    tessPa.useAlternateOffset = true;
-                    GeometryShaderStage<HasStreamOutT, HasRastT>(
-                        pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
-                }
-#else
-                GeometryShaderStage<HasStreamOutT, HasRastT>(
-                    pDC,
-                    workerId,
-                    tessPa,
-                    pGsBuffers,
-                    pSoPrimData,
-                    _simd_set1_epi32(dsContext.PrimitiveID));
-#endif
-            }
-            else
-            {
-                if (HasStreamOutT::value)
-                {
-#if ENABLE_AVX512_SIMD16
-                    tessPa.useAlternateOffset = false;
-#endif
-                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
-                }
-
-                if (HasRastT::value)
-                {
-#if USE_SIMD16_FRONTEND
-                    simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
-#else
-                    simdvector prim[3]; // Only deal with triangles, lines, or points
-#endif
-                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
-                    bool assemble =
-#if USE_SIMD16_FRONTEND
-                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
-#else
-                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
-#endif
-                    RDTSC_END(pDC->pContext->pBucketMgr, FEPAAssemble, 1);
-                    SWR_ASSERT(assemble);
-
-                    SWR_ASSERT(pfnClipFunc);
-#if USE_SIMD16_FRONTEND
-                    // Gather data from the SVG if provided.
-                    simd16scalari vViewportIdx = SIMD16::setzero_si();
-                    simd16scalari vRtIdx       = SIMD16::setzero_si();
-                    SIMD16::Vec4 svgAttrib[4] = {SIMD16::setzero_ps()};
-
-                    if (state.backendState.readViewportArrayIndex ||
-                        state.backendState.readRenderTargetArrayIndex)
-                    {
-                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
-                    }
-
-                    if (state.backendState.readViewportArrayIndex)
-                    {
-                        vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
-                        tessPa.viewportArrayActive = true;
-                    }
-                    if (state.backendState.readRenderTargetArrayIndex)
-                    {
-                        vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
-                        tessPa.rtArrayActive = true;
-                    }
-
-
-                    {
-                        // OOB VPAI indices => forced to zero.
-                        vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
-                        simd16scalari vNumViewports =
-                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                        simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
-                        vViewportIdx             = SIMD16::and_si(vClearMask, vViewportIdx);
-
-                        tessPa.useAlternateOffset = false;
-                        pfnClipFunc(pDC,
-                                    tessPa,
-                                    workerId,
-                                    prim_simd16,
-                                    GenMask(numPrims),
-                                    primID,
-                                    vViewportIdx,
-                                    vRtIdx);
-                    }
-#else
-                    // Gather data from the SGV if provided.
-                    simdscalari vViewportIdx = SIMD::setzero_si();
-                    simdscalari vRtIdx       = SIMD::setzero_si();
-                    SIMD::Vec4  svgAttrib[4];
-
-                    if (state.backendState.readViewportArrayIndex ||
-                        state.backendState.readRenderTargetArrayIndex)
-                    {
-                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
-                    }
-
-                    if (state.backendState.readViewportArrayIndex)
-                    {
-                        vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
-
-                        // OOB VPAI indices => forced to zero.
-                        vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
-                        simdscalari vNumViewports  = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                        simdscalari vClearMask     = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
-                        vViewportIdx               = SIMD::and_si(vClearMask, vViewportIdx);
-                        tessPa.viewportArrayActive = true;
-                    }
-                    if (state.backendState.readRenderTargetArrayIndex)
-                    {
-                        vRtIdx               = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
-                        tessPa.rtArrayActive = true;
-                    }
-                    pfnClipFunc(pDC,
-                                tessPa,
-                                workerId,
-                                prim,
-                                GenMask(tessPa.NumPrims()),
-                                _simd_set1_epi32(dsContext.PrimitiveID),
-                                vViewportIdx,
-                                vRtIdx);
-#endif
-                }
-            }
-
-            tessPa.NextPrim();
-
-        } // while (tessPa.HasWork())
-    }     // for (uint32_t p = 0; p < numPrims; ++p)
-
-#if USE_SIMD16_FRONTEND
-    if (gt_pTessellationThreadData->pDSOutput != nullptr)
-    {
-        AlignedFree(gt_pTessellationThreadData->pDSOutput);
-        gt_pTessellationThreadData->pDSOutput = nullptr;
-    }
-    gt_pTessellationThreadData->dsOutputAllocSize = 0;
-
-#endif
-    TSDestroyCtx(tsCtx);
-}
-
-THREAD PA_STATE::SIMDVERTEX* gpVertexStore = nullptr;
-THREAD uint32_t gVertexStoreSize           = 0;
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief FE handler for SwrDraw.
-/// @tparam IsIndexedT - Is indexed drawing enabled
-/// @tparam HasTessellationT - Is tessellation enabled
-/// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
-/// @tparam HasStreamOutT - Is stream-out enabled
-/// @tparam HasRastT - Is rasterization enabled
-/// @param pContext - pointer to SWR context.
-/// @param pDC - pointer to draw context.
-/// @param workerId - thread's worker id.
-/// @param pUserData - Pointer to DRAW_WORK
-template <typename IsIndexedT,
-          typename IsCutIndexEnabledT,
-          typename HasTessellationT,
-          typename HasGeometryShaderT,
-          typename HasStreamOutT,
-          typename HasRastT>
-void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
-{
-#if KNOB_ENABLE_TOSS_POINTS
-    if (KNOB_TOSS_QUEUE_FE)
-    {
-        return;
-    }
-#endif
-
-    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessDraw, pDC->drawId);
-
-    void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
-
-    DRAW_WORK&       work  = *(DRAW_WORK*)pUserData;
-    const API_STATE& state = GetApiState(pDC);
-
-    uint32_t indexSize = 0;
-    uint32_t endVertex = work.numVerts;
-
-    gfxptr_t xpLastRequestedIndex = 0;
-    if (IsIndexedT::value)
-    {
-        switch (work.type)
-        {
-        case R32_UINT:
-            indexSize = sizeof(uint32_t);
-            break;
-        case R16_UINT:
-            indexSize = sizeof(uint16_t);
-            break;
-        case R8_UINT:
-            indexSize = sizeof(uint8_t);
-            break;
-        default:
-            SWR_INVALID("Invalid work.type: %d", work.type);
-        }
-        xpLastRequestedIndex = work.xpIB + endVertex * indexSize;
-    }
-    else
-    {
-        // No cuts, prune partial primitives.
-        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
-    }
-
-#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
-    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
-#endif
-
-    GsBuffers gsBuffers;
-    if (HasGeometryShaderT::value)
-    {
-#if USE_SIMD16_FRONTEND
-        AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(
-            pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
-#else
-        AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(
-            pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
-#endif
-    }
-
-    if (HasTessellationT::value)
-    {
-        SWR_ASSERT(state.tsState.tsEnable == true);
-        SWR_ASSERT(state.pfnHsFunc != nullptr);
-        SWR_ASSERT(state.pfnDsFunc != nullptr);
-
-        AllocateTessellationData(pContext);
-    }
-    else
-    {
-        SWR_ASSERT(state.tsState.tsEnable == false);
-        SWR_ASSERT(state.pfnHsFunc == nullptr);
-        SWR_ASSERT(state.pfnDsFunc == nullptr);
-    }
-
-    // allocate space for streamout input prim data
-    uint32_t* pSoPrimData = nullptr;
-    if (HasStreamOutT::value)
-    {
-        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
-    }
-
-    const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
-#if USE_SIMD16_FRONTEND
-    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
-#else
-    uint32_t          simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
-#endif
-
-    SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
-
-    // Compute storage requirements for vertex store
-    // TODO: allocation needs to be rethought for better cut support
-    uint32_t numVerts        = vertexCount + 2; // Need extra space for PA state machine
-    uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
-
-    // grow the vertex store for the PA as necessary
-    if (gVertexStoreSize < vertexStoreSize)
-    {
-        if (gpVertexStore != nullptr)
-        {
-            AlignedFree(gpVertexStore);
-            gpVertexStore = nullptr;
-        }
-
-        SWR_ASSERT(gpVertexStore == nullptr);
-
-        gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX*>(AlignedMalloc(vertexStoreSize, 64));
-        gVertexStoreSize = vertexStoreSize;
-
-        SWR_ASSERT(gpVertexStore != nullptr);
-    }
-
-    // choose primitive assembler
-
-    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC,
-                                                         state.topology,
-                                                         work.numVerts,
-                                                         gpVertexStore,
-                                                         numVerts,
-                                                         state.frontendState.vsVertexSize,
-                                                         GetNumVerts(state.topology, 1));
-    PA_STATE&                                  pa = paFactory.GetPA();
-
-#if USE_SIMD16_FRONTEND
-#if USE_SIMD16_SHADERS
-    simd16vertex vin;
-#else
-    simdvertex vin_lo;
-    simdvertex vin_hi;
-#endif
-    SWR_VS_CONTEXT vsContext_lo;
-    SWR_VS_CONTEXT vsContext_hi;
-
-#if USE_SIMD16_SHADERS
-    vsContext_lo.pVin = reinterpret_cast<simdvertex*>(&vin);
-    vsContext_hi.pVin = reinterpret_cast<simdvertex*>(&vin);
-#else
-    vsContext_lo.pVin = &vin_lo;
-    vsContext_hi.pVin = &vin_hi;
-#endif
-    vsContext_lo.AlternateOffset = 0;
-    vsContext_hi.AlternateOffset = 1;
-
-    SWR_FETCH_CONTEXT fetchInfo_lo = {0};
-
-    fetchInfo_lo.pStreams      = &state.vertexBuffers[0];
-    fetchInfo_lo.StartInstance = work.startInstance;
-    fetchInfo_lo.StartVertex   = 0;
-
-    if (IsIndexedT::value)
-    {
-        fetchInfo_lo.BaseVertex = work.baseVertex;
-
-        // if the entire index buffer isn't being consumed, set the last index
-        // so that fetches < a SIMD wide will be masked off
-        fetchInfo_lo.xpLastIndex = state.indexBuffer.xpIndices + state.indexBuffer.size;
-        if (xpLastRequestedIndex < fetchInfo_lo.xpLastIndex)
-        {
-            fetchInfo_lo.xpLastIndex = xpLastRequestedIndex;
-        }
-    }
-    else
-    {
-        fetchInfo_lo.StartVertex = work.startVertex;
-    }
-
-    SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
-
-    const simd16scalari vScale =
-        _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-
-    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
-    {
-        uint32_t i = 0;
-
-        simd16scalari vIndex;
-
-        if (IsIndexedT::value)
-        {
-            fetchInfo_lo.xpIndices = work.xpIB;
-            fetchInfo_hi.xpIndices =
-                fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH
-        }
-        else
-        {
-            vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
-
-            fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);
-
-            int32_t* sysAddr = reinterpret_cast<int32_t*>(&vIndex);
-            sysAddr += KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH
-
-            fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), sysAddr);
-        }
-
-        fetchInfo_lo.CurInstance = instanceNum;
-        fetchInfo_hi.CurInstance = instanceNum;
-
-        vsContext_lo.InstanceID = instanceNum;
-        vsContext_hi.InstanceID = instanceNum;
-
-        while (pa.HasWork())
-        {
-            // GetNextVsOutput currently has the side effect of updating some PA state machine
-            // state. So we need to keep this outside of (i < endVertex) check.
-
-            simdmask* pvCutIndices_lo = nullptr;
-            simdmask* pvCutIndices_hi = nullptr;
-
-            if (IsIndexedT::value)
-            {
-                // simd16mask <=> simdmask[2]
-
-                pvCutIndices_lo = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[0];
-                pvCutIndices_hi = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[1];
-            }
-
-            simd16vertex& vout = pa.GetNextVsOutput();
-
-            vsContext_lo.pVout = reinterpret_cast<simdvertex*>(&vout);
-            vsContext_hi.pVout = reinterpret_cast<simdvertex*>(&vout);
-
-            if (i < endVertex)
-            {
-                if (!IsIndexedT::value)
-                {
-                    fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
-                    uint32_t offset;
-                    offset = std::min(endVertex - i, (uint32_t)KNOB_SIMD16_WIDTH);
-                    offset *= 4; // convert from index to address
-#if USE_SIMD16_SHADERS
-                    fetchInfo_lo.xpLastIndex += offset;
-#else
-                    fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t)KNOB_SIMD_WIDTH);
-                    uint32_t offset2 =
-                        std::min(offset, (uint32_t)KNOB_SIMD16_WIDTH) - KNOB_SIMD_WIDTH;
-                    assert(offset >= 0);
-                    fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
-                    fetchInfo_hi.xpLastIndex += offset2;
-#endif
-                }
-                // 1. Execute FS/VS for a single SIMD.
-                RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
-#if USE_SIMD16_SHADERS
-                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin);
-#else
-                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo);
-
-                if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
-                {
-                    state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi);
-                }
-#endif
-                RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);
-
-                // forward fetch generated vertex IDs to the vertex shader
-#if USE_SIMD16_SHADERS
-#if USE_SIMD16_VS
-                vsContext_lo.VertexID16 =
-                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
-                vsContext_lo.VertexID16 =
-                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
-#else
-                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
-                vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
-#endif
-#else
-                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
-                vsContext_hi.VertexID = fetchInfo_hi.VertexID;
-#endif
-
-                // Setup active mask for vertex shader.
-#if USE_SIMD16_VS
-                vsContext_lo.mask16 = GenerateMask16(endVertex - i);
-#else
-                vsContext_lo.mask     = GenerateMask(endVertex - i);
-                vsContext_hi.mask     = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
-#endif
-
-                // forward cut mask to the PA
-                if (IsIndexedT::value)
-                {
-#if USE_SIMD16_SHADERS
-                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
-                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
-#else
-                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
-                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
-#endif
-                }
-
-                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
-
-#if KNOB_ENABLE_TOSS_POINTS
-                if (!KNOB_TOSS_FETCH)
-#endif
-                {
-                    RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
-#if USE_SIMD16_VS
-                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
-                    AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
-#else
-                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
-                    AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
-
-                    if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
-                    {
-                        state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
-                        AR_EVENT(VSStats((HANDLE)&vsContext_hi.stats));
-                    }
-#endif
-                    RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);
-
-                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
-                }
-            }
-
-            // 2. Assemble primitives given the last two SIMD.
-            do
-            {
-                simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
-
-                RDTSC_START(pContext->pBucketMgr, FEPAAssemble);
-                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
-                RDTSC_STOP(pContext->pBucketMgr, FEPAAssemble, 1, 0);
-
-#if KNOB_ENABLE_TOSS_POINTS
-                if (!KNOB_TOSS_FETCH)
-#endif
-                {
-#if KNOB_ENABLE_TOSS_POINTS
-                    if (!KNOB_TOSS_VS)
-#endif
-                    {
-                        if (assemble)
-                        {
-                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
-
-                            const uint32_t numPrims = pa.NumPrims();
-                            const uint32_t numPrims_lo =
-                                std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
-                            const uint32_t numPrims_hi =
-                                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
-
-                            const simd16scalari primID    = pa.GetPrimID(work.startPrimID);
-                            const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
-                            const simdscalari   primID_hi = _simd16_extract_si(primID, 1);
-
-                            if (HasTessellationT::value)
-                            {
-                                pa.useAlternateOffset = false;
-                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
-                                    pDC,
-                                    workerId,
-                                    pa,
-                                    &gsBuffers,
-                                    pSoPrimData,
-                                    numPrims_lo,
-                                    primID_lo);
-
-                                if (numPrims_hi)
-                                {
-                                    pa.useAlternateOffset = true;
-                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
-                                        pDC,
-                                        workerId,
-                                        pa,
-                                        &gsBuffers,
-                                        pSoPrimData,
-                                        numPrims_hi,
-                                        primID_hi);
-                                }
-                            }
-                            else if (HasGeometryShaderT::value)
-                            {
-                                pa.useAlternateOffset = false;
-                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
-                                                                             workerId,
-                                                                             pa,
-                                                                             &gsBuffers,
-                                                                             pSoPrimData,
-                                                                             numPrims_lo,
-                                                                             primID_lo);
-
-                                if (numPrims_hi)
-                                {
-                                    pa.useAlternateOffset = true;
-                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
-                                                                                 workerId,
-                                                                                 pa,
-                                                                                 &gsBuffers,
-                                                                                 pSoPrimData,
-                                                                                 numPrims_hi,
-                                                                                 primID_hi);
-                                }
-                            }
-                            else
-                            {
-                                // If streamout is enabled then stream vertices out to memory.
-                                if (HasStreamOutT::value)
-                                {
-                                    pa.useAlternateOffset = false;
-                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
-                                }
-
-                                if (HasRastT::value)
-                                {
-                                    SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
-                                    // Gather data from the SVG if provided.
-                                    simd16scalari vpai = SIMD16::setzero_si();
-                                    simd16scalari rtai = SIMD16::setzero_si();
-                                    SIMD16::Vec4  svgAttrib[4];
-
-                                    if (state.backendState.readViewportArrayIndex ||
-                                        state.backendState.readRenderTargetArrayIndex)
-                                    {
-                                        pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
-                                    }
-
-                                    if (state.backendState.readViewportArrayIndex)
-                                    {
-                                        vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
-                                        pa.viewportArrayActive = true;
-                                    }
-                                    if (state.backendState.readRenderTargetArrayIndex)
-                                    {
-                                        rtai =
-                                            SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
-                                        pa.rtArrayActive = true;
-                                    }
-
-                                    {
-                                        // OOB VPAI indices => forced to zero.
-                                        vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
-                                        simd16scalari vNumViewports =
-                                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                        simd16scalari vClearMask =
-                                            SIMD16::cmplt_epi32(vpai, vNumViewports);
-                                        vpai = SIMD16::and_si(vClearMask, vpai);
-
-                                        pa.useAlternateOffset = false;
-                                        pDC->pState->pfnProcessPrims_simd16(pDC,
-                                                                            pa,
-                                                                            workerId,
-                                                                            prim_simd16,
-                                                                            GenMask(numPrims),
-                                                                            primID,
-                                                                            vpai,
-                                                                            rtai);
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            } while (pa.NextPrim());
-
-            if (IsIndexedT::value)
-            {
-                fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
-                fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
-            }
-            else
-            {
-                vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
-            }
-
-            i += KNOB_SIMD16_WIDTH;
-        }
-
-        pa.Reset();
-    }
-
-#else
-    SWR_VS_CONTEXT    vsContext;
-    SWR_FETCH_CONTEXT fetchInfo = {0};
-
-    fetchInfo.pStreams      = &state.vertexBuffers[0];
-    fetchInfo.StartInstance = work.startInstance;
-    fetchInfo.StartVertex   = 0;
-
-    if (IsIndexedT::value)
-    {
-        fetchInfo.BaseVertex = work.baseVertex;
-
-        // if the entire index buffer isn't being consumed, set the last index
-        // so that fetches < a SIMD wide will be masked off
-        fetchInfo.pLastIndex =
-            (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
-        if (xpLastRequestedIndex < fetchInfo.pLastIndex)
-        {
-            fetchInfo.pLastIndex = xpLastRequestedIndex;
-        }
-    }
-    else
-    {
-        fetchInfo.StartVertex = work.startVertex;
-    }
-
-    const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-
-    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
-    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
-    {
-        simdscalari vIndex;
-        uint32_t    i = 0;
-
-        if (IsIndexedT::value)
-        {
-            fetchInfo.pIndices = work.pIB;
-        }
-        else
-        {
-            vIndex             = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
-            fetchInfo.pIndices = (const int32_t*)&vIndex;
-        }
-
-        fetchInfo.CurInstance = instanceNum;
-        vsContext.InstanceID  = instanceNum;
-
-        while (pa.HasWork())
-        {
-            // GetNextVsOutput currently has the side effect of updating some PA state machine
-            // state. So we need to keep this outside of (i < endVertex) check.
-            simdmask* pvCutIndices = nullptr;
-            if (IsIndexedT::value)
-            {
-                pvCutIndices = &pa.GetNextVsIndices();
-            }
-
-            simdvertex& vout = pa.GetNextVsOutput();
-            vsContext.pVin   = &vout;
-            vsContext.pVout  = &vout;
-
-            if (i < endVertex)
-            {
-                // 1. Execute FS/VS for a single SIMD.
-                RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
-                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
-                RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);
-
-                // forward fetch generated vertex IDs to the vertex shader
-                vsContext.VertexID = fetchInfo.VertexID;
-
-                // Setup active mask for vertex shader.
-                vsContext.mask = GenerateMask(endVertex - i);
-
-                // forward cut mask to the PA
-                if (IsIndexedT::value)
-                {
-                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
-                }
-
-                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
-
-#if KNOB_ENABLE_TOSS_POINTS
-                if (!KNOB_TOSS_FETCH)
-#endif
-                {
-                    RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
-                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext);
-                    RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);
-
-                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
-                    AR_EVENT(VSStats((HANDLE)&vsContext.stats));
-                }
-            }
-
-            // 2. Assemble primitives given the last two SIMD.
-            do
-            {
-                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
-                // PaAssemble returns false if there is not enough verts to assemble.
-                RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
-                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
-                RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1);
-
-#if KNOB_ENABLE_TOSS_POINTS
-                if (!KNOB_TOSS_FETCH)
-#endif
-                {
-#if KNOB_ENABLE_TOSS_POINTS
-                    if (!KNOB_TOSS_VS)
-#endif
-                    {
-                        if (assemble)
-                        {
-                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
-
-                            if (HasTessellationT::value)
-                            {
-                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
-                                    pDC,
-                                    workerId,
-                                    pa,
-                                    &gsBuffers,
-                                    pSoPrimData,
-                                    pa.GetPrimID(work.startPrimID));
-                            }
-                            else if (HasGeometryShaderT::value)
-                            {
-                                GeometryShaderStage<HasStreamOutT, HasRastT>(
-                                    pDC,
-                                    workerId,
-                                    pa,
-                                    &gsBuffers,
-                                    pSoPrimData,
-                                    pa.GetPrimID(work.startPrimID));
-                            }
-                            else
-                            {
-                                // If streamout is enabled then stream vertices out to memory.
-                                if (HasStreamOutT::value)
-                                {
-                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
-                                }
-
-                                if (HasRastT::value)
-                                {
-                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
-
-                                    // Gather data from the SVG if provided.
-                                    simdscalari vViewportIdx = SIMD::setzero_si();
-                                    simdscalari vRtIdx       = SIMD::setzero_si();
-                                    SIMD::Vec4  svgAttrib[4];
-
-                                    if (state.backendState.readViewportArrayIndex ||
-                                        state.backendState.readRenderTargetArrayIndex)
-                                    {
-                                        pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
-                                    }
-
-                                    if (state.backendState.readViewportArrayIndex)
-                                    {
-                                        vViewportIdx =
-                                            SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
-
-                                        // OOB VPAI indices => forced to zero.
-                                        vViewportIdx =
-                                            SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
-                                        simdscalari vNumViewports =
-                                            SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                        simdscalari vClearMask =
-                                            SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
-                                        vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
-                                        pa.viewportArrayActive = true;
-                                    }
-                                    if (state.backendState.readRenderTargetArrayIndex)
-                                    {
-                                        vRtIdx =
-                                            SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
-                                        pa.rtArrayActive = true;
-                                    }
-
-                                    pDC->pState->pfnProcessPrims(pDC,
-                                                                 pa,
-                                                                 workerId,
-                                                                 prim,
-                                                                 GenMask(pa.NumPrims()),
-                                                                 pa.GetPrimID(work.startPrimID),
-                                                                 vViewportIdx,
-                                                                 vRtIdx);
-                                }
-                            }
-                        }
-                    }
-                }
-            } while (pa.NextPrim());
-
-            if (IsIndexedT::value)
-            {
-                fetchInfo.pIndices =
-                    (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
-            }
-            else
-            {
-                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
-            }
-
-            i += KNOB_SIMD_WIDTH;
-        }
-        pa.Reset();
-    }
-
-#endif
-
-    RDTSC_END(pContext->pBucketMgr, FEProcessDraw, numPrims * work.numInstances);
-}
-
-struct FEDrawChooser
-{
-    typedef PFN_FE_WORK_FUNC FuncType;
-
-    template <typename... ArgsB>
-    static FuncType GetFunc()
-    {
-        return ProcessDraw<ArgsB...>;
-    }
-};
-
-// Selector for correct templated Draw front-end function
-PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
-                                    bool IsCutIndexEnabled,
-                                    bool HasTessellation,
-                                    bool HasGeometryShader,
-                                    bool HasStreamOut,
-                                    bool HasRasterization)
-{
-    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed,
-                                                       IsCutIndexEnabled,
-                                                       HasTessellation,
-                                                       HasGeometryShader,
-                                                       HasStreamOut,
-                                                       HasRasterization);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
deleted file mode 100644
index a6d9fb5ba52..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ /dev/null
@@ -1,448 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file frontend.h
- *
- * @brief Definitions for Frontend which handles vertex processing,
- *        primitive assembly, clipping, binning, etc.
- *
- ******************************************************************************/
-#pragma once
-#include "context.h"
-#include "common/simdintrin.h"
-#include <type_traits>
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Helper macro to generate a bitmask
-static INLINE uint32_t
-              GenMask(uint32_t numBits)
-{
-    SWR_ASSERT(
-        numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
-    return ((1U << numBits) - 1);
-}
-
-// Calculates the A and B coefficients for the 3 edges of the triangle
-//
-// maths for edge equations:
-//   standard form of a line in 2d
-//   Ax + By + C = 0
-//   A = y0 - y1
-//   B = x1 - x0
-//   C = x0y1 - x1y0
-INLINE
-void triangleSetupAB(const __m128 vX, const __m128 vY, __m128& vA, __m128& vB)
-{
-    // vYsub = y1 y2 y0 dc
-    __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
-    // vY =    y0 y1 y2 dc
-    vA = _mm_sub_ps(vY, vYsub);
-
-    // Result:
-    // A[0] = y0 - y1
-    // A[1] = y1 - y2
-    // A[2] = y2 - y0
-
-    // vXsub = x1 x2 x0 dc
-    __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));
-    // vX =    x0 x1 x2 dc
-    vB = _mm_sub_ps(vXsub, vX);
-
-    // Result:
-    // B[0] = x1 - x0
-    // B[1] = x2 - x1
-    // B[2] = x0 - x2
-}
-
-INLINE
-void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i& vA, __m128i& vB)
-{
-    // generate edge equations
-    // A = y0 - y1
-    // B = x1 - x0
-    // C = x0y1 - x1y0
-    __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
-    vA            = _mm_sub_epi32(vY, vYsub);
-
-    __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
-    vB            = _mm_sub_epi32(vXsub, vX);
-}
-
-INLINE
-void triangleSetupABIntVertical(const simdscalari vX[3],
-                                const simdscalari vY[3],
-                                simdscalari (&vA)[3],
-                                simdscalari (&vB)[3])
-{
-    // A = y0 - y1
-    // B = x1 - x0
-    vA[0] = _simd_sub_epi32(vY[0], vY[1]);
-    vA[1] = _simd_sub_epi32(vY[1], vY[2]);
-    vA[2] = _simd_sub_epi32(vY[2], vY[0]);
-
-    vB[0] = _simd_sub_epi32(vX[1], vX[0]);
-    vB[1] = _simd_sub_epi32(vX[2], vX[1]);
-    vB[2] = _simd_sub_epi32(vX[0], vX[2]);
-}
-
-#if ENABLE_AVX512_SIMD16
-INLINE
-void triangleSetupABIntVertical(const simd16scalari vX[3],
-                                const simd16scalari vY[3],
-                                simd16scalari (&vA)[3],
-                                simd16scalari (&vB)[3])
-{
-    // A = y0 - y1
-    // B = x1 - x0
-    vA[0] = _simd16_sub_epi32(vY[0], vY[1]);
-    vA[1] = _simd16_sub_epi32(vY[1], vY[2]);
-    vA[2] = _simd16_sub_epi32(vY[2], vY[0]);
-
-    vB[0] = _simd16_sub_epi32(vX[1], vX[0]);
-    vB[1] = _simd16_sub_epi32(vX[2], vX[1]);
-    vB[2] = _simd16_sub_epi32(vX[0], vX[2]);
-}
-
-#endif
-// Calculate the determinant of the triangle
-// 2 vectors between the 3 points: P, Q
-// Px = x0-x2, Py = y0-y2
-// Qx = x1-x2, Qy = y1-y2
-//       |Px Qx|
-// det = |     | = PxQy - PyQx
-//       |Py Qy|
-// simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
-//               try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
-//               : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
-//               : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
-//               : B[2]*A[1] - A[2]*B[1]
-INLINE
-float calcDeterminantInt(const __m128i vA, const __m128i vB)
-{
-    // vAShuf = [A1, A0, A2, A0]
-    __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
-    // vBShuf = [B2, B0, B1, B0]
-    __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
-    // vMul = [A1*B2, B1*A2]
-    __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf);
-
-    // shuffle upper to lower
-    // vMul2 = [B1*A2, B1*A2]
-    __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
-    // vMul = [A1*B2 - B1*A2]
-    vMul = _mm_sub_epi64(vMul, vMul2);
-
-    int64_t result;
-    _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));
-
-    double dResult = (double)result;
-    dResult        = dResult * (1.0 / FIXED_POINT16_SCALE);
-
-    return (float)dResult;
-}
-
-INLINE
-void calcDeterminantIntVertical(const simdscalari vA[3],
-                                const simdscalari vB[3],
-                                simdscalari*      pvDet)
-{
-    // refer to calcDeterminantInt comment for calculation explanation
-
-    // A1*B2
-    simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5
-    simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7
-
-    simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
-    simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);
-
-    simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5
-    simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7
-
-    // B1*A2
-    simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
-    simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);
-
-    simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
-    simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);
-
-    simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
-    simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);
-
-    // A1*B2 - A2*B1
-    simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
-    simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
-
-    // shuffle 0 1 4 5 2 3 6 7 -> 0 1 2 3
-    simdscalari vResultLo = _simd_permute2f128_si(detLo, detHi, 0x20);
-
-    // shuffle 0 1 4 5 2 3 6 7 -> 4 5 6 7
-    simdscalari vResultHi = _simd_permute2f128_si(detLo, detHi, 0x31);
-
-    pvDet[0] = vResultLo;
-    pvDet[1] = vResultHi;
-}
-
-#if ENABLE_AVX512_SIMD16
-INLINE
-void calcDeterminantIntVertical(const simd16scalari vA[3],
-                                const simd16scalari vB[3],
-                                simd16scalari*      pvDet)
-{
-    // refer to calcDeterminantInt comment for calculation explanation
-
-    // A1*B2
-    simd16scalari vA1_lo =
-        _simd16_unpacklo_epi32(vA[1], vA[1]); // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b)
-    simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // X 2 X 3 X 6 X 7 X A X B X E X F
-
-    simd16scalari vB2_lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
-    simd16scalari vB2_hi = _simd16_unpackhi_epi32(vB[2], vB[2]);
-
-    simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo); // 0 1 4 5 8 9 C D (64b)
-    simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi); // 2 3 6 7 A B E F
-
-    // B1*A2
-    simd16scalari vA2_lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
-    simd16scalari vA2_hi = _simd16_unpackhi_epi32(vA[2], vA[2]);
-
-    simd16scalari vB1_lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
-    simd16scalari vB1_hi = _simd16_unpackhi_epi32(vB[1], vB[1]);
-
-    simd16scalari vA2B1_lo = _simd16_mul_epi32(vA2_lo, vB1_lo);
-    simd16scalari vA2B1_hi = _simd16_mul_epi32(vA2_hi, vB1_hi);
-
-    // A1*B2 - A2*B1
-    simd16scalari difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo); // 0 1 4 5 8 9 C D (64b)
-    simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi); // 2 3 6 7 A B E F
-
-    // (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE
-    simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44); // 0 1 4 5 2 3 6 7 (64b)
-    simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE); // 8 9 C D A B E F
-
-    // (3, 1, 2, 0) = 11 01 10 00 = 0xD8
-    pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8); // 0 1 2 3 4 5 6 7 (64b)
-    pvDet[1] = _simd16_permute2f128_si(temphi, temphi, 0xD8); // 8 9 A B C D E F
-}
-
-#endif
-INLINE
-void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128& vB, __m128& vC)
-{
-    // C = -Ax - By
-    vC         = _mm_mul_ps(vA, vX);
-    __m128 vCy = _mm_mul_ps(vB, vY);
-    vC         = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
-    vC         = _mm_sub_ps(vC, vCy);
-}
-
-template <uint32_t NumVerts>
-INLINE void viewportTransform(simdvector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
-{
-    simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]);
-    simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]);
-    simdscalar m11 = _simd_load1_ps(&vpMatrices.m11[0]);
-    simdscalar m31 = _simd_load1_ps(&vpMatrices.m31[0]);
-    simdscalar m22 = _simd_load1_ps(&vpMatrices.m22[0]);
-    simdscalar m32 = _simd_load1_ps(&vpMatrices.m32[0]);
-
-    for (uint32_t i = 0; i < NumVerts; ++i)
-    {
-        v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
-        v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
-        v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
-    }
-}
-
-#if USE_SIMD16_FRONTEND
-template <uint32_t NumVerts>
-INLINE void viewportTransform(simd16vector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
-{
-    const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]);
-    const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]);
-    const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]);
-    const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]);
-    const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]);
-    const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]);
-
-    for (uint32_t i = 0; i < NumVerts; ++i)
-    {
-        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
-        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
-        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
-    }
-}
-
-#endif
-template <uint32_t NumVerts>
-INLINE void viewportTransform(simdvector*                  v,
-                              const SWR_VIEWPORT_MATRICES& vpMatrices,
-                              simdscalari const&           vViewportIdx)
-{
-    // perform a gather of each matrix element based on the viewport array indexes
-    simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
-    simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
-    simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
-    simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
-    simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
-    simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);
-
-    for (uint32_t i = 0; i < NumVerts; ++i)
-    {
-        v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
-        v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
-        v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
-    }
-}
-
-#if USE_SIMD16_FRONTEND
-template <uint32_t NumVerts>
-INLINE void viewportTransform(simd16vector*                v,
-                              const SWR_VIEWPORT_MATRICES& vpMatrices,
-                              simd16scalari const&         vViewportIdx)
-{
-    // perform a gather of each matrix element based on the viewport array indexes
-    const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
-    const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
-    const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
-    const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
-    const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
-    const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);
-
-    for (uint32_t i = 0; i < NumVerts; ++i)
-    {
-        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
-        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
-        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
-    }
-}
-
-#endif
-INLINE
-void calcBoundingBoxInt(const __m128i& vX, const __m128i& vY, SWR_RECT& bbox)
-{
-    // Need horizontal fp min here
-    __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1));
-    __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2));
-
-    __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1));
-    __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2));
-
-    __m128i vMinX = _mm_min_epi32(vX, vX1);
-    vMinX         = _mm_min_epi32(vMinX, vX2);
-
-    __m128i vMaxX = _mm_max_epi32(vX, vX1);
-    vMaxX         = _mm_max_epi32(vMaxX, vX2);
-
-    __m128i vMinY = _mm_min_epi32(vY, vY1);
-    vMinY         = _mm_min_epi32(vMinY, vY2);
-
-    __m128i vMaxY = _mm_max_epi32(vY, vY1);
-    vMaxY         = _mm_max_epi32(vMaxY, vY2);
-
-    bbox.xmin = _mm_extract_epi32(vMinX, 0);
-    bbox.xmax = _mm_extract_epi32(vMaxX, 0);
-    bbox.ymin = _mm_extract_epi32(vMinY, 0);
-    bbox.ymax = _mm_extract_epi32(vMaxY, 0);
-}
-
-INLINE
-bool CanUseSimplePoints(DRAW_CONTEXT* pDC)
-{
-    const API_STATE& state = GetApiState(pDC);
-
-    return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X &&
-            state.rastState.pointSize == 1.0f && !state.rastState.pointParam &&
-            !state.rastState.pointSpriteEnable && !state.backendState.clipDistanceMask);
-}
-
-INLINE
-bool vHasNaN(const __m128& vec)
-{
-    const __m128  result = _mm_cmpunord_ps(vec, vec);
-    const int32_t mask   = _mm_movemask_ps(result);
-    return (mask != 0);
-}
-
-uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
-uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
-
-// ProcessDraw front-end function.  All combinations of parameter values are available
-PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
-                                    bool IsCutIndexEnabled,
-                                    bool HasTessellation,
-                                    bool HasGeometryShader,
-                                    bool HasStreamOut,
-                                    bool HasRasterization);
-
-void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
-void ProcessStoreTiles(SWR_CONTEXT*  pContext,
-                       DRAW_CONTEXT* pDC,
-                       uint32_t      workerId,
-                       void*         pUserData);
-void ProcessDiscardInvalidateTiles(SWR_CONTEXT*  pContext,
-                                   DRAW_CONTEXT* pDC,
-                                   uint32_t      workerId,
-                                   void*         pUserData);
-void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
-void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
-
-PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
-#if USE_SIMD16_FRONTEND
-PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative);
-#endif
-
-struct PA_STATE_BASE; // forward decl
-void BinPoints(DRAW_CONTEXT*      pDC,
-               PA_STATE&          pa,
-               uint32_t           workerId,
-               simdvector         prims[3],
-               uint32_t           primMask,
-               simdscalari const& primID,
-               simdscalari const& viewportIdx,
-               simdscalari const& rtIdx);
-void BinLines(DRAW_CONTEXT*      pDC,
-              PA_STATE&          pa,
-              uint32_t           workerId,
-              simdvector         prims[3],
-              uint32_t           primMask,
-              simdscalari const& primID,
-              simdscalari const& viewportIdx,
-              simdscalari const& rtIdx);
-#if USE_SIMD16_FRONTEND
-void SIMDCALL BinPoints_simd16(DRAW_CONTEXT*        pDC,
-                               PA_STATE&            pa,
-                               uint32_t             workerId,
-                               simd16vector         prims[3],
-                               uint32_t             primMask,
-                               simd16scalari const& primID,
-                               simd16scalari const& viewportIdx,
-                               simd16scalari const& rtIdx);
-void SIMDCALL BinLines_simd16(DRAW_CONTEXT*        pDC,
-                              PA_STATE&            pa,
-                              uint32_t             workerId,
-                              simd16vector         prims[3],
-                              uint32_t             primMask,
-                              simd16scalari const& primID,
-                              simd16scalari const& viewportIdx,
-                              simd16scalari const& rtIdx);
-#endif
-
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h
deleted file mode 100644
index 798e5684025..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/knobs.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file knobs.h
- *
- * @brief Static (Compile-Time) Knobs for Core.
- *
- ******************************************************************************/
-#pragma once
-
-#include <stdint.h>
-#include <gen_knobs.h>
-
-#define KNOB_ARCH_AVX 0
-#define KNOB_ARCH_AVX2 1
-#define KNOB_ARCH_AVX512 2
-
-///////////////////////////////////////////////////////////////////////////////
-// AVX512 Support
-///////////////////////////////////////////////////////////////////////////////
-
-#define ENABLE_AVX512_SIMD16 1
-#define USE_SIMD16_FRONTEND 1
-#define USE_SIMD16_SHADERS 1 // requires USE_SIMD16_FRONTEND
-#define USE_SIMD16_VS 1      // requires USE_SIMD16_SHADERS
-
-///////////////////////////////////////////////////////////////////////////////
-// Architecture validation
-///////////////////////////////////////////////////////////////////////////////
-#if !defined(KNOB_ARCH)
-#define KNOB_ARCH KNOB_ARCH_AVX
-#endif
-
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
-#define KNOB_ARCH_ISA AVX
-#define KNOB_ARCH_STR "AVX"
-#elif (KNOB_ARCH == KNOB_ARCH_AVX2)
-#define KNOB_ARCH_ISA AVX2
-#define KNOB_ARCH_STR "AVX2"
-#elif (KNOB_ARCH == KNOB_ARCH_AVX512)
-#define KNOB_ARCH_ISA AVX512F
-#define KNOB_ARCH_STR "AVX512"
-#else
-#error "Unknown architecture"
-#endif
-
-#define KNOB_SIMD_WIDTH 8
-#define KNOB_SIMD_BYTES 32
-
-#define KNOB_SIMD16_WIDTH 16
-#define KNOB_SIMD16_BYTES 64
-
-#define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING")
-
-///////////////////////////////////////////////////////////////////////////////
-// Configuration knobs
-///////////////////////////////////////////////////////////////////////////////
-// Maximum supported number of active vertex buffer streams
-#define KNOB_NUM_STREAMS 32
-
-// Maximum supported active viewports and scissors
-#define KNOB_NUM_VIEWPORTS_SCISSORS 16
-
-// Guardband range used by the clipper
-#define KNOB_GUARDBAND_WIDTH 32768.0f
-#define KNOB_GUARDBAND_HEIGHT 32768.0f
-
-// Scratch space requirements per worker. Currently only used for TGSM sizing for some stages
-#define KNOB_WORKER_SCRATCH_SPACE_SIZE (32 * 1024)
-
-///////////////////////////////
-// Macro tile configuration
-///////////////////////////////
-
-// raster tile dimensions
-#define KNOB_TILE_X_DIM 8
-#define KNOB_TILE_X_DIM_SHIFT 3
-#define KNOB_TILE_Y_DIM 8
-#define KNOB_TILE_Y_DIM_SHIFT 3
-
-// fixed macrotile pixel dimension for now, eventually will be
-// dynamically set based on tile format and pixel size
-#define KNOB_MACROTILE_X_DIM 32
-#define KNOB_MACROTILE_Y_DIM 32
-#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 13
-#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 13
-#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8)
-#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8)
-#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT)
-#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT)
-
-// total # of hot tiles available. This should be enough to
-// fully render a 16kx16k 128bpp render target
-#define KNOB_NUM_HOT_TILES_X 512
-#define KNOB_NUM_HOT_TILES_Y 512
-#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT
-#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT
-#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT
-
-// Max scissor rectangle
-#define KNOB_MAX_SCISSOR_X KNOB_NUM_HOT_TILES_X* KNOB_MACROTILE_X_DIM
-#define KNOB_MAX_SCISSOR_Y KNOB_NUM_HOT_TILES_Y* KNOB_MACROTILE_Y_DIM
-
-#if KNOB_SIMD_WIDTH == 8 && KNOB_TILE_X_DIM < 4
-#error "incompatible width/tile dimensions"
-#endif
-
-#if ENABLE_AVX512_SIMD16
-#if KNOB_SIMD16_WIDTH == 16 && KNOB_TILE_X_DIM < 8
-#error "incompatible width/tile dimensions"
-#endif
-#endif
-
-#if KNOB_SIMD_WIDTH == 8
-#define SIMD_TILE_X_DIM 4
-#define SIMD_TILE_Y_DIM 2
-#else
-#error "Invalid simd width"
-#endif
-
-#if ENABLE_AVX512_SIMD16
-#if KNOB_SIMD16_WIDTH == 16
-#define SIMD16_TILE_X_DIM 8
-#define SIMD16_TILE_Y_DIM 2
-#else
-#error "Invalid simd width"
-#endif
-#endif
-
-///////////////////////////////////////////////////////////////////////////////
-// Optimization knobs
-///////////////////////////////////////////////////////////////////////////////
-#define KNOB_USE_FAST_SRGB TRUE
-
-// enables cut-aware primitive assembler
-#define KNOB_ENABLE_CUT_AWARE_PA TRUE
-
-// enables early rasterization (useful for small triangles)
-#if !defined(KNOB_ENABLE_EARLY_RAST)
-#define KNOB_ENABLE_EARLY_RAST 1
-#endif
-
-#if KNOB_ENABLE_EARLY_RAST
-#define ER_SIMD_TILE_X_SHIFT 2
-#define ER_SIMD_TILE_Y_SHIFT 2
-#endif
-
-///////////////////////////////////////////////////////////////////////////////
-// Debug knobs
-///////////////////////////////////////////////////////////////////////////////
-//#define KNOB_ENABLE_RDTSC
-
-// Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs.
-#if !defined(KNOB_ENABLE_TOSS_POINTS)
-#define KNOB_ENABLE_TOSS_POINTS 0
-#endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
deleted file mode 100644
index f8797a8f2bc..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file knobs_init.h
- *
- * @brief Dynamic Knobs Initialization for Core.
- *
- ******************************************************************************/
-#pragma once
-
-#include <core/knobs.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <stdio.h>
-
-// Assume the type is compatible with a 32-bit integer
-template <typename T>
-static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue)
-{
-    uint32_t value    = 0;
-    char*    pStopped = nullptr;
-    value             = strtoul(pOverride, &pStopped, 0);
-    if (pStopped != pOverride)
-    {
-        knobValue = static_cast<T>(value);
-    }
-}
-
-static inline void ConvertEnvToKnob(const char* pOverride, bool& knobValue)
-{
-    size_t len = strlen(pOverride);
-    if (len == 1)
-    {
-        auto c = tolower(pOverride[0]);
-        if (c == 'y' || c == 't' || c == '1')
-        {
-            knobValue = true;
-            return;
-        }
-        if (c == 'n' || c == 'f' || c == '0')
-        {
-            knobValue = false;
-            return;
-        }
-    }
-
-    // Try converting to a number and casting to bool
-    uint32_t value    = 0;
-    char*    pStopped = nullptr;
-    value             = strtoul(pOverride, &pStopped, 0);
-    if (pStopped != pOverride)
-    {
-        knobValue = value != 0;
-    }
-}
-
-static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue)
-{
-    float value = knobValue;
-    if (sscanf(pOverride, "%f", &value))
-    {
-        knobValue = value;
-    }
-}
-
-static inline void ConvertEnvToKnob(const char* pOverride, std::string& knobValue)
-{
-    knobValue = pOverride;
-}
-
-template <typename T>
-static inline void InitKnob(T& knob)
-{
-    // Read environment variables
-    const char* pOverride = getenv(knob.Name());
-
-    if (pOverride)
-    {
-        auto knobValue = knob.DefaultValue();
-        ConvertEnvToKnob(pOverride, knobValue);
-        knob.Value(knobValue);
-    }
-    else
-    {
-        // Set default value
-        knob.Value(knob.DefaultValue());
-    }
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h
deleted file mode 100644
index 3b23974a7f4..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/multisample.h
+++ /dev/null
@@ -1,459 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file multisample.h
- *
- ******************************************************************************/
-
-#pragma once
-
-#include "context.h"
-#include "format_traits.h"
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief convenience typedef for testing for single sample case
-typedef std::integral_constant<int, 1> SingleSampleT;
-
-INLINE
-SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples)
-{
-    switch (numSamples)
-    {
-    case 1:
-        return SWR_MULTISAMPLE_1X;
-    case 2:
-        return SWR_MULTISAMPLE_2X;
-    case 4:
-        return SWR_MULTISAMPLE_4X;
-    case 8:
-        return SWR_MULTISAMPLE_8X;
-    case 16:
-        return SWR_MULTISAMPLE_16X;
-    default:
-        assert(0);
-        return SWR_MULTISAMPLE_1X;
-    }
-}
-
-// hardcoded offsets based on Direct3d standard multisample positions
-// 8 x 8 pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner
-// coords are 0.8 fixed point offsets from (0, 0)
-template <SWR_MULTISAMPLE_COUNT sampleCount, bool isCenter = false>
-struct MultisampleTraits
-{
-    INLINE static float       X(uint32_t sampleNum) = delete;
-    INLINE static float       Y(uint32_t sampleNum) = delete;
-    INLINE static simdscalari FullSampleMask()      = delete;
-
-    static const uint32_t numSamples = 0;
-};
-
-template <>
-struct MultisampleTraits<SWR_MULTISAMPLE_1X, false>
-{
-    INLINE static float       X(uint32_t sampleNum) { return samplePosX[sampleNum]; };
-    INLINE static float       Y(uint32_t sampleNum) { return samplePosY[sampleNum]; };
-    INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); };
-
-    static const uint32_t              numSamples         = 1;
-    static const uint32_t              numCoverageSamples = 1;
-    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_1X;
-    static constexpr uint32_t          samplePosXi[1]     = {0x80};
-    static constexpr uint32_t          samplePosYi[1]     = {0x80};
-    static constexpr float             samplePosX[1]      = {0.5f};
-    static constexpr float             samplePosY[1]      = {0.5f};
-};
-
-template <>
-struct MultisampleTraits<SWR_MULTISAMPLE_1X, true>
-{
-    INLINE static float       X(uint32_t sampleNum) { return 0.5f; };
-    INLINE static float       Y(uint32_t sampleNum) { return 0.5f; };
-    INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); };
-
-    static const uint32_t              numSamples         = 1;
-    static const uint32_t              numCoverageSamples = 1;
-    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_1X;
-    static constexpr uint32_t          samplePosXi[1]     = {0x80};
-    static constexpr uint32_t          samplePosYi[1]     = {0x80};
-    static constexpr float             samplePosX[1]      = {0.5f};
-    static constexpr float             samplePosY[1]      = {0.5f};
-};
-
-template <>
-struct MultisampleTraits<SWR_MULTISAMPLE_2X, false>
-{
-    INLINE static float X(uint32_t sampleNum)
-    {
-        SWR_ASSERT(sampleNum < numSamples);
-        return samplePosX[sampleNum];
-    };
-    INLINE static float Y(uint32_t sampleNum)
-    {
-        SWR_ASSERT(sampleNum < numSamples);
-        return samplePosY[sampleNum];
-    };
-    INLINE static simdscalari FullSampleMask()
-    {
-        static const simdscalari mask = _simd_set1_epi32(0x3);
-        return mask;
-    }
-
-    static const uint32_t              numSamples         = 2;
-    static const uint32_t              numCoverageSamples = 2;
-    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_2X;
-    static constexpr uint32_t          samplePosXi[2]     = {0xC0, 0x40};
-    static constexpr uint32_t          samplePosYi[2]     = {0xC0, 0x40};
-    static constexpr float             samplePosX[2]      = {0.75f, 0.25f};
-    static constexpr float             samplePosY[2]      = {0.75f, 0.25f};
-};
-
-template <>
-struct MultisampleTraits<SWR_MULTISAMPLE_2X, true>
-{
-    INLINE static float       X(uint32_t sampleNum) { return 0.5f; };
-    INLINE static float       Y(uint32_t sampleNum) { return 0.5f; };
-    INLINE static simdscalari FullSampleMask()
-    {
-        static const simdscalari mask = _simd_set1_epi32(0x3);
-        return mask;
-    }
-    static const uint32_t              numSamples         = 2;
-    static const uint32_t              numCoverageSamples = 1;
-    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_2X;
-    static constexpr uint32_t          samplePosXi[2]     = {0x80, 0x80};
-    static constexpr uint32_t          samplePosYi[2]     = {0x80, 0x80};
-    static constexpr float             samplePosX[2]      = {0.5f, 0.5f};
-    static constexpr float             samplePosY[2]      = {0.5f, 0.5f};
-};
-
-template <>
-struct MultisampleTraits<SWR_MULTISAMPLE_4X, false>
-{
-    INLINE static float X(uint32_t sampleNum)
-    {
-        SWR_ASSERT(sampleNum < numSamples);
-        return samplePosX[sampleNum];
-    };
-    INLINE static float Y(uint32_t sampleNum)
-    {
-        SWR_ASSERT(sampleNum < numSamples);
-        return samplePosY[sampleNum];
-    };
-    INLINE static simdscalari FullSampleMask()
-    {
-        static const simdscalari mask = _simd_set1_epi32(0xF);
-        return mask;
-    }
-
-    static const uint32_t              numSamples         = 4;
-    static const uint32_t              numCoverageSamples = 4;
-    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_4X;
-    static constexpr uint32_t          samplePosXi[4]     = {0x60, 0xE0, 0x20, 0xA0};
-    static constexpr uint32_t          samplePosYi[4]     = {0x20, 0x60, 0xA0, 0xE0};
-    static constexpr float             samplePosX[4]      = {0.375f, 0.875f, 0.125f, 0.625f};
-    static constexpr float             samplePosY[4]      = {0.125f, 0.375f, 0.625f, 0.875f};
-};
-
-template <>
-struct MultisampleTraits<SWR_MULTISAMPLE_4X, true>
-{
-    INLINE static float       X(uint32_t sampleNum) { return 0.5f; };
-    INLINE static float       Y(uint32_t sampleNum) { return 0.5f; };
-    INLINE static simdscalari FullSampleMask()
-    {
-        static const simdscalari mask = _simd_set1_epi32(0xF);
-        return mask;
-    }
-
-    static const uint32_t              numSamples         = 4;
-    static const uint32_t              numCoverageSamples = 1;
-    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_4X;
-    static constexpr uint32_t          samplePosXi[4]     = {0x80, 0x80, 0x80, 0x80};
-    static constexpr uint32_t          samplePosYi[4]     = {0x80, 0x80, 0x80, 0x80};
-    static constexpr float             samplePosX[4]      = {0.5f, 0.5f, 0.5f, 0.5f};
-    static constexpr float             samplePosY[4]      = {0.5f, 0.5f, 0.5f, 0.5f};
-};
-
-template <>
-struct MultisampleTraits<SWR_MULTISAMPLE_8X, false>
-{
-    INLINE static float X(uint32_t sampleNum)
-    {
-        SWR_ASSERT(sampleNum < numSamples);
-        return samplePosX[sampleNum];
-    };
-    INLINE static float Y(uint32_t sampleNum)
-    {
-        SWR_ASSERT(sampleNum < numSamples);
-        return samplePosY[sampleNum];
-    };
-    INLINE static simdscalari FullSampleMask()
-    {
-        static const simdscalari mask = _simd_set1_epi32(0xFF);
-        return mask;
-    }
-
-    static const uint32_t              numSamples         = 8;
-    static const uint32_t              numCoverageSamples = 8;
-    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_8X;
-    static constexpr uint32_t samplePosXi[8] = {0x90, 0x70, 0xD0, 0x50, 0x30, 0x10, 0xB0, 0xF0};
-    static constexpr uint32_t samplePosYi[8] = {0x50, 0xB0, 0x90, 0x30, 0xD0, 0x70, 0xF0, 0x10};
-    static constexpr float    samplePosX[8]  = {
-        0.5625f, 0.4375f, 0.8125f, 0.3125f, 0.1875f, 0.0625f, 0.6875f, 0.9375f};
-    static constexpr float samplePosY[8] = {
-        0.3125f, 0.6875f, 0.5625f, 0.1875f, 0.8125f, 0.4375f, 0.9375f, 0.0625f};
-};
-
-template <>
-struct MultisampleTraits<SWR_MULTISAMPLE_8X, true>
-{
-    INLINE static float       X(uint32_t sampleNum) { return 0.5f; };
-    INLINE static float       Y(uint32_t sampleNum) { return 0.5f; };
-    INLINE static simdscalari FullSampleMask()
-    {
-        static const simdscalari mask = _simd_set1_epi32(0xFF);
-        return mask;
-    }
-    static const uint32_t              numSamples         = 8;
-    static const uint32_t              numCoverageSamples = 1;
-    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_8X;
-    static constexpr uint32_t samplePosXi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
-    static constexpr uint32_t samplePosYi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
-    static constexpr float    samplePosX[8]  = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
-    static constexpr float    samplePosY[8]  = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
-};
-
-template <>
-struct MultisampleTraits<SWR_MULTISAMPLE_16X, false>
-{
-    INLINE static float X(uint32_t sampleNum)
-    {
-        SWR_ASSERT(sampleNum < numSamples);
-        return samplePosX[sampleNum];
-    };
-    INLINE static float Y(uint32_t sampleNum)
-    {
-        SWR_ASSERT(sampleNum < numSamples);
-        return samplePosY[sampleNum];
-    };
-    INLINE static simdscalari FullSampleMask()
-    {
-        static const simdscalari mask = _simd_set1_epi32(0xFFFF);
-        return mask;
-    }
-
-    static const uint32_t              numSamples         = 16;
-    static const uint32_t              numCoverageSamples = 16;
-    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_16X;
-    static constexpr uint32_t          samplePosXi[16]    = {0x90,
-                                                 0x70,
-                                                 0x50,
-                                                 0xC0,
-                                                 0x30,
-                                                 0xA0,
-                                                 0xD0,
-                                                 0xB0,
-                                                 0x60,
-                                                 0x80,
-                                                 0x40,
-                                                 0x20,
-                                                 0x00,
-                                                 0xF0,
-                                                 0xE0,
-                                                 0x10};
-    static constexpr uint32_t          samplePosYi[16]    = {0x90,
-                                                 0x50,
-                                                 0xA0,
-                                                 0x70,
-                                                 0x60,
-                                                 0xD0,
-                                                 0xB0,
-                                                 0x30,
-                                                 0xE0,
-                                                 0x10,
-                                                 0x20,
-                                                 0xC0,
-                                                 0x80,
-                                                 0x40,
-                                                 0xF0,
-                                                 0x00};
-    static constexpr float             samplePosX[16]     = {0.5625f,
-                                             0.4375f,
-                                             0.3125f,
-                                             0.7500f,
-                                             0.1875f,
-                                             0.6250f,
-                                             0.8125f,
-                                             0.6875f,
-                                             0.3750f,
-                                             0.5000f,
-                                             0.2500f,
-                                             0.1250f,
-                                             0.0000f,
-                                             0.9375f,
-                                             0.8750f,
-                                             0.0625f};
-    static constexpr float             samplePosY[16]     = {0.5625f,
-                                             0.3125f,
-                                             0.6250f,
-                                             0.4375f,
-                                             0.3750f,
-                                             0.8125f,
-                                             0.6875f,
-                                             0.1875f,
-                                             0.8750f,
-                                             0.0625f,
-                                             0.1250f,
-                                             0.7500f,
-                                             0.5000f,
-                                             0.2500f,
-                                             0.9375f,
-                                             0.0000f};
-};
-
-template <>
-struct MultisampleTraits<SWR_MULTISAMPLE_16X, true>
-{
-    INLINE static float       X(uint32_t sampleNum) { return 0.5f; };
-    INLINE static float       Y(uint32_t sampleNum) { return 0.5f; };
-    INLINE static simdscalari FullSampleMask()
-    {
-        static const simdscalari mask = _simd_set1_epi32(0xFFFF);
-        return mask;
-    }
-    static const uint32_t              numSamples         = 16;
-    static const uint32_t              numCoverageSamples = 1;
-    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_16X;
-    static constexpr uint32_t          samplePosXi[16]    = {0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80};
-    static constexpr uint32_t          samplePosYi[16]    = {0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80,
-                                                 0x80};
-    static constexpr float             samplePosX[16]     = {0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f};
-    static constexpr float             samplePosY[16]     = {0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f,
-                                             0.5f};
-};
-
-INLINE
-bool isNonStandardPattern(const SWR_MULTISAMPLE_COUNT sampleCount,
-                          const SWR_MULTISAMPLE_POS&  samplePos)
-{
-    // detect if we're using standard or center sample patterns
-    const uint32_t *standardPosX, *standardPosY;
-    switch (sampleCount)
-    {
-    case SWR_MULTISAMPLE_1X:
-        standardPosX = MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosXi;
-        standardPosY = MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosYi;
-        break;
-    case SWR_MULTISAMPLE_2X:
-        standardPosX = MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosXi;
-        standardPosY = MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosYi;
-        break;
-    case SWR_MULTISAMPLE_4X:
-        standardPosX = MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosXi;
-        standardPosY = MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosYi;
-        break;
-    case SWR_MULTISAMPLE_8X:
-        standardPosX = MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosXi;
-        standardPosY = MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosYi;
-        break;
-    case SWR_MULTISAMPLE_16X:
-        standardPosX = MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosXi;
-        standardPosY = MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosYi;
-        break;
-    default:
-        break;
-    }
-
-    // scan sample pattern for standard or center
-    uint32_t numSamples  = GetNumSamples(sampleCount);
-    bool     bIsStandard = true;
-    if (numSamples > 1)
-    {
-        for (uint32_t i = 0; i < numSamples; i++)
-        {
-            bIsStandard =
-                (standardPosX[i] == samplePos.Xi(i)) || (standardPosY[i] == samplePos.Yi(i));
-            if (!bIsStandard)
-                break;
-        }
-    }
-    return !bIsStandard;
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
deleted file mode 100644
index adfc1414bae..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ /dev/null
@@ -1,1676 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file pa.h
- *
- * @brief Definitions for primitive assembly.
- *        N primitives are assembled at a time, where N is the SIMD width.
- *        A state machine, that is specific for a given topology, drives the
- *        assembly of vertices into triangles.
- *
- ******************************************************************************/
-#pragma once
-
-#include "frontend.h"
-
-// Abstract base for the primitive assemblers. It carries the description of the
-// vertex stream being assembled and the SIMD type aliases selected at build time by
-// USE_SIMD16_FRONTEND, and declares the interface the derived assemblers implement.
-struct PA_STATE
-{
-#if USE_SIMD16_FRONTEND
-    // 16-wide front end: aliases map onto the simd16 types
-    enum
-    {
-        SIMD_WIDTH      = KNOB_SIMD16_WIDTH,
-        SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
-        SIMD_WIDTH_LOG2 = 4
-    };
-
-    typedef simd16mask SIMDMASK;
-
-    typedef simd16scalar SIMDSCALAR;
-    typedef simd16vector SIMDVECTOR;
-    typedef simd16vertex SIMDVERTEX;
-
-    typedef simd16scalari SIMDSCALARI;
-
-#else
-    // default front end: aliases map onto the native simd types
-    enum
-    {
-        SIMD_WIDTH      = KNOB_SIMD_WIDTH,
-        SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
-        SIMD_WIDTH_LOG2 = 3
-    };
-
-    typedef simdmask SIMDMASK;
-
-    typedef simdscalar SIMDSCALAR;
-    typedef simdvector SIMDVECTOR;
-    typedef simdvertex SIMDVERTEX;
-
-    typedef simdscalari SIMDSCALARI;
-
-#endif
-    DRAW_CONTEXT* pDC{nullptr};         // draw context
-    uint8_t*      pStreamBase{nullptr}; // vertex stream
-    uint32_t      streamSizeInVerts{0}; // total size of the input stream in verts
-    uint32_t      vertexStride{0};      // stride of a vertex in simdvector units
-
-    // The topology the binner will use. In some cases the FE changes the topology from the api
-    // state.
-    PRIMITIVE_TOPOLOGY binTopology{TOP_UNKNOWN};
-
-#if ENABLE_AVX512_SIMD16
-    // selects which half of a simd16 result is consumed by the simd8 code paths
-    // (see the uses in the derived assemblers)
-    bool useAlternateOffset{false};
-#endif
-
-    bool     viewportArrayActive{false};
-    bool     rtArrayActive{false};
-    uint32_t numVertsPerPrim{0};
-
-    // Default construction leaves every member at its in-class initializer.
-    PA_STATE() {}
-
-    // Binds the assembler to a draw context and a vertex stream.
-    PA_STATE(DRAW_CONTEXT* in_pDC,
-             uint8_t*      in_pStreamBase,
-             uint32_t      in_streamSizeInVerts,
-             uint32_t      in_vertexStride,
-             uint32_t      in_numVertsPerPrim) :
-        pDC(in_pDC),
-        pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts),
-        vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim)
-    {
-    }
-
-    // This type is used polymorphically, so destruction through a PA_STATE pointer or
-    // reference must reach the derived destructor.
-    virtual ~PA_STATE() = default;
-
-    // True while the assembler still has primitives to produce.
-    virtual bool HasWork() = 0;
-    // Access to one attribute slot of one simd vertex in the stream.
-    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
-#if ENABLE_AVX512_SIMD16
-    virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
-#endif
-    // Assembles one SIMD of primitives for attribute 'slot' into verts[]; the return
-    // value tells the caller whether verts[] was filled.
-    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
-#if ENABLE_AVX512_SIMD16
-    virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0;
-#endif
-    // Assembles the single primitive 'primIndex' of the current SIMD for attribute 'slot'.
-    virtual void        AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
-    virtual bool        NextPrim()                                                             = 0;
-    virtual SIMDVERTEX& GetNextVsOutput()                                                      = 0;
-    virtual bool        GetNextStreamOutput()                                                  = 0;
-    virtual SIMDMASK&   GetNextVsIndices()                                                     = 0;
-    virtual uint32_t    NumPrims()                                                             = 0;
-    virtual void        Reset()                                                                = 0;
-    virtual SIMDSCALARI GetPrimID(uint32_t startID)                                            = 0;
-};
-
-// The Optimized PA is a state machine that assembles triangles from vertex shader simd
-// output. Here is the sequence
-//    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
-//    2. Execute PA function to assemble and bin triangles.
-//        a.    The PA function is a set of functions that collectively make up the
-//            state machine for a given topology.
-//                1.    We use a state index to track which PA function to call.
-//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
-//                1.    We call this the current and previous simd vertex.
-//                2.    The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
-//                    order to assemble the second triangle, for a triangle list, we'll need the
-//                    last vertex from the previous simd and the first 2 vertices from the current
-//                    simd.
-//                3. At times the PA can assemble multiple triangles from the 2 simd vertices.
-//
-// This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws
-// without cuts.
-// Optimized (non cut-aware) primitive assembler. Assembly is driven by a chain of
-// per-topology function pointers: the current function assembles a SIMD of
-// primitives and registers its successor through SetNextState(), and NextPrim()
-// latches that successor.
-struct PA_STATE_OPT : public PA_STATE
-{
-    uint32_t numPrims{0};         // Total number of primitives for draw.
-    uint32_t numPrimsComplete{0}; // Total number of complete primitives.
-
-    uint32_t numSimdPrims{0}; // Number of prims in current simd.
-
-    uint32_t       cur{0};   // index to current VS output.
-    uint32_t       prev{0};  // index to prev VS output. Not really needed in the state.
-    const uint32_t first{0}; // index to first VS output. Used for tri fan and line loop.
-
-    uint32_t counter{0};   // state counter
-    bool     reset{false}; // reset state
-
-    uint32_t    primIDIncr{0}; // how much to increment for each vector (typically vector / {1, 2})
-    SIMDSCALARI primID;
-
-    // Signatures of the per-topology state functions.
-    typedef bool (*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-#if ENABLE_AVX512_SIMD16
-    typedef bool (*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-#endif
-    typedef void (*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa,
-                                       uint32_t      slot,
-                                       uint32_t      primIndex,
-                                       simd4scalar   verts[]);
-
-    PFN_PA_FUNC pfnPaFunc{nullptr}; // PA state machine function for assembling 4 triangles.
-#if ENABLE_AVX512_SIMD16
-    PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{nullptr};
-#endif
-    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{
-        nullptr}; // PA state machine function for assembling single triangle.
-    PFN_PA_FUNC pfnPaFuncReset{nullptr}; // initial state to set on reset
-#if ENABLE_AVX512_SIMD16
-    PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{nullptr};
-#endif
-
-    // state used to advance the PA when Next is called
-    PFN_PA_FUNC pfnPaNextFunc{nullptr};
-#if ENABLE_AVX512_SIMD16
-    PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{nullptr};
-#endif
-    uint32_t nextNumSimdPrims{0};
-    uint32_t nextNumPrimsIncrement{0};
-    bool     nextReset{false};
-    bool     isStreaming{false};
-
-    SIMDMASK junkIndices{0}; // temporary index store for unused virtual function
-
-    PA_STATE_OPT() {}
-    // Defined out of line; not visible in this header.
-    PA_STATE_OPT(DRAW_CONTEXT*      pDC,
-                 uint32_t           numPrims,
-                 uint8_t*           pStream,
-                 uint32_t           streamSizeInVerts,
-                 uint32_t           vertexStride,
-                 bool               in_isStreaming,
-                 uint32_t           numVertsPerPrim,
-                 PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
-
-    // Work remains until every primitive of the draw has been completed.
-    bool HasWork() { return (this->numPrimsComplete < this->numPrims) ? true : false; }
-
-    // Returns attribute 'slot' of simd vertex 'index', treating the stream as an
-    // array of simdvector with vertexStride entries per simd vertex.
-    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
-    {
-        SWR_ASSERT(slot < vertexStride);
-        uint32_t    offset     = index * vertexStride + slot;
-        simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
-        return vertexSlot;
-    }
-
-#if ENABLE_AVX512_SIMD16
-    // simd16 counterpart of GetSimdVector(): same indexing, simd16vector elements.
-    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
-    {
-        SWR_ASSERT(slot < vertexStride);
-        uint32_t      offset     = index * vertexStride + slot;
-        simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
-        return vertexSlot;
-    }
-
-#endif
-    // Assembles 4 triangles. Each simdvector is a single vertex from 4
-    // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
-    bool Assemble(uint32_t slot, simdvector verts[]) { return this->pfnPaFunc(*this, slot, verts); }
-
-#if ENABLE_AVX512_SIMD16
-    // simd16 variant; dispatches to the current simd16 state function.
-    bool Assemble(uint32_t slot, simd16vector verts[])
-    {
-        return this->pfnPaFunc_simd16(*this, slot, verts);
-    }
-
-#endif
-    // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
-    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
-    {
-        return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
-    }
-
-    // Latches the state registered by the last SetNextState() call and reports
-    // whether the current simd vertices still hold primitives to assemble.
-    bool NextPrim()
-    {
-        this->pfnPaFunc = this->pfnPaNextFunc;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
-#endif
-        this->numSimdPrims = this->nextNumSimdPrims;
-        this->numPrimsComplete += this->nextNumPrimsIncrement;
-        this->reset = this->nextReset;
-
-        // a streaming PA never honors a requested reset
-        if (this->isStreaming)
-        {
-            this->reset = false;
-        }
-
-        bool morePrims = false;
-
-        if (this->numSimdPrims > 0)
-        {
-            // more primitives can be produced without a new VS output
-            morePrims = true;
-            this->numSimdPrims--;
-        }
-        else
-        {
-            // move on to the next VS output; a reset restarts the counter at 0
-            this->counter = (this->reset) ? 0 : (this->counter + 1);
-            this->reset   = false;
-        }
-
-        if (!HasWork())
-        {
-            morePrims = false; // no more to do
-        }
-
-        return morePrims;
-    }
-
-    // Selects the simd vertex the next VS output goes to, updates prev/cur to match,
-    // and returns a reference to that simd vertex in the stream.
-    SIMDVERTEX& GetNextVsOutput()
-    {
-        const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;
-
-        // increment cur and prev indices
-        if (counter < numSimdVerts)
-        {
-            // prev undefined for first state
-            prev = cur;
-            cur  = counter;
-        }
-        else
-        {
-            // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in
-            // the buffer
-            uint32_t temp = prev;
-
-            prev = cur;
-            cur  = temp;
-        }
-
-        SWR_ASSERT(cur < numSimdVerts);
-        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];
-
-        return *(SIMDVERTEX*)pVertex;
-    }
-
-    SIMDMASK& GetNextVsIndices()
-    {
-        // unused in optimized PA, pass tmp buffer back
-        return junkIndices;
-    }
-
-    // Advances prev/cur to the simd vertex selected by the state counter and reports
-    // whether primitives remain.
-    bool GetNextStreamOutput()
-    {
-        this->prev = this->cur;
-        this->cur  = this->counter;
-
-        return HasWork();
-    }
-
-    // Number of primitives in the SIMD being assembled: SIMD_WIDTH, reduced by the
-    // overshoot when the pending increment would run past the draw's primitive count.
-    uint32_t NumPrims()
-    {
-        return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims)
-                   ? (SIMD_WIDTH -
-                      (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims))
-                   : SIMD_WIDTH;
-    }
-
-    // Registers the state NextPrim() will latch. Note that the single-primitive
-    // function takes effect immediately rather than on the next NextPrim().
-    void SetNextState(PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
-                      PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
-                      uint32_t                         numSimdPrims      = 0,
-                      uint32_t                         numPrimsIncrement = 0,
-                      bool                             reset             = false)
-    {
-        this->pfnPaNextFunc         = pfnPaNextFunc;
-        this->nextNumSimdPrims      = numSimdPrims;
-        this->nextNumPrimsIncrement = numPrimsIncrement;
-        this->nextReset             = reset;
-
-        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
-    }
-
-#if ENABLE_AVX512_SIMD16
-    // Same as SetNextState(), additionally registering the simd16 state function.
-    void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
-                             PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
-                             PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
-                             uint32_t                         numSimdPrims      = 0,
-                             uint32_t                         numPrimsIncrement = 0,
-                             bool                             reset             = false)
-    {
-        this->pfnPaNextFunc_simd16  = pfnPaNextFunc_simd16;
-        this->pfnPaNextFunc         = pfnPaNextFunc;
-        this->nextNumSimdPrims      = numSimdPrims;
-        this->nextNumPrimsIncrement = numPrimsIncrement;
-        this->nextReset             = reset;
-
-        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
-    }
-
-#endif
-    // Returns the assembler to its initial state: the reset state functions become
-    // current and all progress counters are cleared.
-    void Reset()
-    {
-#if ENABLE_AVX512_SIMD16
-        useAlternateOffset = false;
-
-#endif
-        this->pfnPaFunc = this->pfnPaFuncReset;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
-#endif
-        this->numPrimsComplete = 0;
-        this->numSimdPrims     = 0;
-        this->cur              = 0;
-        this->prev             = 0;
-        this->counter          = 0;
-        this->reset            = false;
-    }
-
-    // Primitive IDs for the current SIMD: the per-lane primID pattern plus
-    // startID + primIDIncr * (number of completed SIMDs).
-    SIMDSCALARI GetPrimID(uint32_t startID)
-    {
-#if USE_SIMD16_FRONTEND
-        return _simd16_add_epi32(
-            this->primID,
-            _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
-#else
-        return _simd_add_epi32(
-            this->primID,
-            _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
-#endif
-    }
-};
-
-// helper C wrappers to avoid having to rewrite all the PA topology state functions
-// Free-function form of PA_STATE_OPT::SetNextState(); forwards every argument to pa.
-INLINE void SetNextPaState(PA_STATE_OPT&                    pa,
-                           PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
-                           PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
-                           uint32_t                         numSimdPrims      = 0,
-                           uint32_t                         numPrimsIncrement = 0,
-                           bool                             reset             = false)
-{
-    pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
-}
-
-#if ENABLE_AVX512_SIMD16
-// Free-function form of PA_STATE_OPT::SetNextState_simd16(); forwards every argument to pa.
-INLINE void SetNextPaState_simd16(PA_STATE_OPT&                    pa,
-                                  PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
-                                  PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
-                                  PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
-                                  uint32_t                         numSimdPrims      = 0,
-                                  uint32_t                         numPrimsIncrement = 0,
-                                  bool                             reset             = false)
-{
-    pa.SetNextState_simd16(
-        pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
-}
-
-#endif
-// Free-function form of PA_STATE::GetSimdVector(); dispatches virtually on pa.
-INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
-{
-    simdvector& vertexSlot = pa.GetSimdVector(index, slot);
-    return vertexSlot;
-}
-
-#if ENABLE_AVX512_SIMD16
-// Free-function form of PA_STATE::GetSimdVector_simd16(); dispatches virtually on pa.
-INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
-{
-    simd16vector& vertexSlot = pa.GetSimdVector_simd16(index, slot);
-    return vertexSlot;
-}
-
-#endif
-// Cut-aware primitive assembler.
-struct PA_STATE_CUT : public PA_STATE
-{
-    SIMDMASK* pCutIndices{nullptr};  // cut indices buffer, 1 bit per vertex
-    uint32_t  numVerts{0};           // number of vertices available in buffer store
-    uint32_t  numAttribs{0};         // number of attributes
-    int32_t   numRemainingVerts{0};  // number of verts remaining to be assembled
-    uint32_t  numVertsToAssemble{0}; // total number of verts to assemble for the draw
-#if ENABLE_AVX512_SIMD16
-    OSALIGNSIMD16(uint32_t)
-    indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
-#else
-    OSALIGNSIMD(uint32_t)
-    indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
-#endif
-    SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
-    uint32_t    numPrimsAssembled{0};             // number of primitives that are fully assembled
-    uint32_t    headVertex{0};      // current unused vertex slot in vertex buffer store
-    uint32_t    tailVertex{0};      // beginning vertex currently assembling
-    uint32_t    curVertex{0};       // current unprocessed vertex
-    uint32_t    startPrimId{0};     // starting prim id
-    SIMDSCALARI vPrimId;            // vector of prim ID
-    bool        needOffsets{false}; // need to compute gather offsets for current SIMD
-    uint32_t    vertsPerPrim{0};
-    bool        processCutVerts{
-        false}; // vertex indices with cuts should be processed as normal, otherwise they
-                // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
-                // while the GS sends valid verts for every index
-
-    simdvector junkVector; // junk simdvector for unimplemented API
-#if ENABLE_AVX512_SIMD16
-    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
-#endif
-
-    // Topology state tracking
-    uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
-    uint32_t curIndex{0};
-    bool     reverseWinding{false}; // indicates reverse winding for strips
-    int32_t  adjExtraVert{0};       // extra vert uses for tristrip w/ adj
-
-    typedef void (PA_STATE_CUT::*PFN_PA_FUNC)(uint32_t vert, bool finish);
-    PFN_PA_FUNC pfnPa{nullptr}; // per-topology function that processes a single vert
-
-    PA_STATE_CUT() {}
-    // Binds the cut-aware assembler to a vertex store and selects the per-vertex
-    // processing function for the topology.
-    //
-    // in_streamSizeInVerts - capacity of the vertex store; also becomes numVerts, the
-    //                        wrap point for the head/cur vertex cursors.
-    // in_pIndices          - cut index masks, one SIMDMASK per SIMD of verts.
-    // in_numVerts          - number of verts to assemble for the draw.
-    // in_processCutVerts   - whether verts flagged as cuts are still fed to the
-    //                        topology function.
-    PA_STATE_CUT(DRAW_CONTEXT*      pDC,
-                 uint8_t*           in_pStream,
-                 uint32_t           in_streamSizeInVerts,
-                 uint32_t           in_vertexStride,
-                 SIMDMASK*          in_pIndices,
-                 uint32_t           in_numVerts,
-                 uint32_t           in_numAttribs,
-                 PRIMITIVE_TOPOLOGY topo,
-                 bool               in_processCutVerts,
-                 uint32_t           in_numVertsPerPrim) :
-        PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
-    {
-        numVerts        = in_streamSizeInVerts;
-        numAttribs      = in_numAttribs;
-        binTopology     = topo;
-        needOffsets     = false;
-        processCutVerts = in_processCutVerts;
-
-        numVertsToAssemble = numRemainingVerts = in_numVerts;
-        numPrimsAssembled                      = 0;
-        headVertex = tailVertex = curVertex = 0;
-
-        curIndex    = 0;
-        pCutIndices = in_pIndices;
-        memset(indices, 0, sizeof(indices));
-        // initial prim IDs are the values 0..SIMD_WIDTH-1
-#if USE_SIMD16_FRONTEND
-        vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-#else
-        vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-#endif
-        reverseWinding = false;
-        adjExtraVert   = -1; // -1 means no extra adjacency vert is pending
-
-        bool gsEnabled = pDC->pState->state.gsState.gsEnable;
-        vertsPerPrim   = NumVertsPerPrim(topo, gsEnabled);
-
-        // pick the per-vertex state function; adjacency topologies have a separate
-        // variant for draws without a geometry shader
-        switch (topo)
-        {
-        case TOP_TRIANGLE_LIST:
-            pfnPa = &PA_STATE_CUT::ProcessVertTriList;
-            break;
-        case TOP_TRI_LIST_ADJ:
-            pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj
-                              : &PA_STATE_CUT::ProcessVertTriListAdjNoGs;
-            break;
-        case TOP_TRIANGLE_STRIP:
-            pfnPa = &PA_STATE_CUT::ProcessVertTriStrip;
-            break;
-        case TOP_TRI_STRIP_ADJ:
-            if (gsEnabled)
-            {
-                pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
-            }
-            else
-            {
-                pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
-            }
-            break;
-
-        case TOP_POINT_LIST:
-            pfnPa = &PA_STATE_CUT::ProcessVertPointList;
-            break;
-        case TOP_LINE_LIST:
-            pfnPa = &PA_STATE_CUT::ProcessVertLineList;
-            break;
-        case TOP_LINE_LIST_ADJ:
-            pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj
-                              : &PA_STATE_CUT::ProcessVertLineListAdjNoGs;
-            break;
-        case TOP_LINE_STRIP:
-            pfnPa = &PA_STATE_CUT::ProcessVertLineStrip;
-            break;
-        case TOP_LISTSTRIP_ADJ:
-            pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj
-                              : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs;
-            break;
-        case TOP_RECT_LIST:
-            pfnPa = &PA_STATE_CUT::ProcessVertRectList;
-            break;
-        default:
-            // pfnPa keeps its nullptr initializer for any other topology
-            assert(0 && "Unimplemented topology");
-        }
-    }
-
-    // Returns the simd vertex at the current head of the vertex store and advances
-    // the head by one SIMD, wrapping at numVerts. Gather offsets are marked stale.
-    SIMDVERTEX& GetNextVsOutput()
-    {
-        // simd vertex that the head currently points into (taken before advancing)
-        uint32_t simdIndex = headVertex / SIMD_WIDTH;
-
-        headVertex  = (headVertex + SIMD_WIDTH) % numVerts;
-        needOffsets = true;
-
-        SIMDVECTOR* pStream = (SIMDVECTOR*)pStreamBase;
-        return *(SIMDVERTEX*)&pStream[simdIndex * vertexStride];
-    }
-
-    // Returns the cut mask belonging to the simd vertex at the current head.
-    SIMDMASK& GetNextVsIndices()
-    {
-        return pCutIndices[headVertex / SIMD_WIDTH];
-    }
-
-    // Not supported by the cut-aware PA: asserts and hands back a placeholder vector
-    // so the virtual interface is still satisfied. Both arguments are ignored.
-    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
-    {
-        // unused
-        SWR_ASSERT(0 && "Not implemented");
-        return junkVector;
-    }
-
-#if ENABLE_AVX512_SIMD16
-    // Not supported by the cut-aware PA: asserts and hands back a placeholder vector
-    // so the virtual interface is still satisfied. Both arguments are ignored.
-    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
-    {
-        // unused
-        SWR_ASSERT(0 && "Not implemented");
-        return junkVector_simd16;
-    }
-
-#endif
-    // Advances the head by one SIMD (without wrapping), marks the gather offsets
-    // stale, and reports whether work remains.
-    bool GetNextStreamOutput()
-    {
-        needOffsets = true;
-        headVertex += SIMD_WIDTH;
-        return HasWork();
-    }
-
-    // Primitive IDs for the current SIMD: startID broadcast to every lane plus the
-    // running per-lane prim ID vector.
-    SIMDSCALARI GetPrimID(uint32_t startID)
-    {
-#if USE_SIMD16_FRONTEND
-        SIMDSCALARI vStartId = _simd16_set1_epi32(startID);
-        return _simd16_add_epi32(vStartId, vPrimId);
-#else
-        SIMDSCALARI vStartId = _simd_set1_epi32(startID);
-        return _simd_add_epi32(vStartId, vPrimId);
-#endif
-    }
-
-    // Rewinds the assembler so the whole draw can be assembled again from vertex 0.
-    void Reset()
-    {
-#if ENABLE_AVX512_SIMD16
-        useAlternateOffset = false;
-
-#endif
-        // rewind the vertex store cursors
-        headVertex = 0;
-        tailVertex = 0;
-        curVertex  = 0;
-
-        // nothing assembled yet; every vert of the draw is outstanding again
-        numRemainingVerts = numVertsToAssemble;
-        numPrimsAssembled = 0;
-
-        // clear the topology tracking state
-        curIndex       = 0;
-        reverseWinding = false;
-        adjExtraVert   = -1;
-
-        // reload the initial prim ID values 0..SIMD_WIDTH-1
-#if USE_SIMD16_FRONTEND
-        vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-#else
-        vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-#endif
-    }
-
-    // Work remains while verts are still outstanding or an extra adjacency vert is
-    // pending (adjExtraVert holds something other than -1).
-    bool HasWork()
-    {
-        const bool vertsOutstanding = numRemainingVerts > 0;
-        return vertsOutstanding || (adjExtraVert != -1);
-    }
-
-    // The store is full when advancing the head by one more SIMD (with wrap-around)
-    // would land on the tail.
-    bool IsVertexStoreFull()
-    {
-        const uint32_t nextHead = (headVertex + SIMD_WIDTH) % numVerts;
-        return nextHead == tailVertex;
-    }
-
-    // Clears the per-topology tracking state so the next vert starts a new primitive run.
-    void RestartTopology()
-    {
-        adjExtraVert   = -1;
-        reverseWinding = false;
-        curIndex       = 0;
-    }
-
-    // Tests the cut bit of 'vertex': the mask is chosen by vertex / SIMD_WIDTH and
-    // the bit within it by the low bits of the vertex number.
-    bool IsCutIndex(uint32_t vertex)
-    {
-        uint32_t maskIndex = vertex / SIMD_WIDTH;
-        uint32_t bitIndex  = vertex & (SIMD_WIDTH - 1);
-        return CheckBit(pCutIndices[maskIndex], bitIndex);
-    }
-
-    // Feeds unprocessed verts to the topology function one at a time. Stops when a
-    // full SIMD of prims has been assembled, when no verts remain for the draw, or
-    // when the current vertex catches up with the head of the vertex store.
-    void ProcessVerts()
-    {
-        while (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0 &&
-               this->curVertex != this->headVertex)
-        {
-            // if cut index, restart topology
-            if (IsCutIndex(this->curVertex))
-            {
-                // a cut vert is only passed to the topology function when the
-                // assembler was created with processCutVerts set
-                if (this->processCutVerts)
-                {
-                    (this->*pfnPa)(this->curVertex, false);
-                }
-                // finish off tri strip w/ adj before restarting topo
-                if (this->adjExtraVert != -1)
-                {
-                    (this->*pfnPa)(this->curVertex, true);
-                }
-                RestartTopology();
-            }
-            else
-            {
-                (this->*pfnPa)(this->curVertex, false);
-            }
-
-            // step to the next vert, wrapping at the end of the vertex store
-            this->curVertex++;
-            if (this->curVertex >= this->numVerts)
-            {
-                this->curVertex = 0;
-            }
-            this->numRemainingVerts--;
-        }
-
-        // special case last primitive for tri strip w/ adj
-        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 &&
-            this->adjExtraVert != -1)
-        {
-            (this->*pfnPa)(this->curVertex, true);
-        }
-    }
-
-    // Closes out the current batch of primitives and prepares for the next one.
-    void Advance()
-    {
-        // no prims are assembled in the new batch yet
-        numPrimsAssembled = 0;
-
-        // verts before the current unprocessed vertex are no longer needed
-        tailVertex = curVertex;
-
-        // move every prim ID forward by one SIMD's worth of primitives
-#if USE_SIMD16_FRONTEND
-        vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
-#else
-        vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
-#endif
-    }
-
-    // Advances to the next batch once the current one is complete, i.e. a full SIMD
-    // of prims was assembled or no verts remain. Always returns false.
-    bool NextPrim()
-    {
-        const bool batchComplete =
-            (numPrimsAssembled == SIMD_WIDTH) || (numRemainingVerts <= 0);
-        if (batchComplete)
-        {
-            Advance();
-        }
-        return false;
-    }
-
-    // Converts the vertex indices collected in indices[v][] into gather offsets,
-    // stored in vOffsets[v], for each vertex v of the primitive. Per lane:
-    //   offset = (index >> SIMD_WIDTH_LOG2) * vertexStride * sizeof(SIMDVECTOR)
-    //          + (index & (SIMD_WIDTH - 1)) * sizeof(float)
-    void ComputeOffsets()
-    {
-        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
-        {
-            uint32_t    vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
-            SIMDSCALARI vIndices          = *(SIMDSCALARI*)&this->indices[v][0];
-
-            // step to simdvertex batch
-            const uint32_t simdShift = SIMD_WIDTH_LOG2;
-#if USE_SIMD16_FRONTEND
-            SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
-            this->vOffsets[v] =
-                _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
-#else
-            SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
-            this->vOffsets[v] =
-                _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
-#endif
-
-            // step to index
-            const uint32_t simdMask = SIMD_WIDTH - 1;
-#if USE_SIMD16_FRONTEND
-            SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
-            this->vOffsets[v]        = _simd16_add_epi32(
-                this->vOffsets[v],
-                _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
-#else
-            SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
-            this->vOffsets[v] =
-                _simd_add_epi32(this->vOffsets[v],
-                                _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
-#endif
-        }
-    }
-
-    // Assembles attribute 'slot' for the current SIMD of primitives into verts[],
-    // one simdvector per vertex of the primitive.
-    //
-    // Returns false, leaving verts[] untouched, when fewer than SIMD_WIDTH prims are
-    // assembled and verts still remain for the draw; returns true otherwise.
-    bool Assemble(uint32_t slot, simdvector* verts)
-    {
-        // process any outstanding verts
-        ProcessVerts();
-
-        // return false if we don't have enough prims assembled
-        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
-        {
-            return false;
-        }
-
-        // cache off gather offsets given the current SIMD set of indices the first time we get an
-        // assemble
-        if (this->needOffsets)
-        {
-            ComputeOffsets();
-            this->needOffsets = false;
-        }
-
-        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
-        {
-            SIMDSCALARI offsets = this->vOffsets[v];
-
-            // step to attribute
-#if USE_SIMD16_FRONTEND
-            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
-#else
-            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
-#endif
-
-            // gather the 4 components of this vertex; the offsets stay fixed while the
-            // base pointer steps one SIMD of floats per component
-            float* pBase = (float*)this->pStreamBase;
-            for (uint32_t c = 0; c < 4; ++c)
-            {
-#if USE_SIMD16_FRONTEND
-                simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
-
-                // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
-                simdscalar t =
-                    useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
-                verts[v].v[c] = t;
-#else
-                verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
-#endif
-
-                // move base to next component
-                pBase += SIMD_WIDTH;
-            }
-        }
-
-        // compute the implied 4th vertex, v3
-        if (this->binTopology == TOP_RECT_LIST)
-        {
-            for (uint32_t c = 0; c < 4; ++c)
-            {
-                // v1, v3 = v1 + v2 - v0, v2
-                // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
-                // NOTE(review): this path uses simd16 types and intrinsics on simdvector
-                // data without a USE_SIMD16_FRONTEND guard - confirm this is intended.
-                simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
-                temp              = _simd16_sub_ps(temp, verts[1].v[c]);
-                temp = _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
-                verts[1].v[c] = _simd16_extract_ps(temp, 0);
-            }
-        }
-
-        return true;
-    }
-
-#if ENABLE_AVX512_SIMD16
-    // simd16 variant: assembles attribute 'slot' for the current SIMD of primitives
-    // into verts[], one simd16vector per vertex of the primitive.
-    //
-    // Returns false, leaving verts[] untouched, when fewer than SIMD_WIDTH prims are
-    // assembled and verts still remain for the draw; returns true otherwise.
-    bool Assemble(uint32_t slot, simd16vector verts[])
-    {
-        // process any outstanding verts
-        ProcessVerts();
-
-        // return false if we don't have enough prims assembled
-        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
-        {
-            return false;
-        }
-
-        // cache off gather offsets given the current SIMD set of indices the first time we get an
-        // assemble
-        if (this->needOffsets)
-        {
-            ComputeOffsets();
-            this->needOffsets = false;
-        }
-
-        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
-        {
-            SIMDSCALARI offsets = this->vOffsets[v];
-
-            // step to attribute
-#if USE_SIMD16_FRONTEND
-            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
-#else
-            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
-#endif
-
-            // gather the 4 components of this vertex; the offsets stay fixed while the
-            // base pointer steps one SIMD of floats per component
-            float* pBase = (float*)this->pStreamBase;
-            for (uint32_t c = 0; c < 4; ++c)
-            {
-#if USE_SIMD16_FRONTEND
-                verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
-#else
-                // narrow front end: the gathered simd result is inserted at position 0
-                // of a zeroed simd16 value
-                verts[v].v[c] = _simd16_insert_ps(
-                    _simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
-#endif
-
-                // move base to next component
-                pBase += SIMD_WIDTH;
-            }
-        }
-
-        // compute the implied 4th vertex, v3
-        if (this->binTopology == TOP_RECT_LIST)
-        {
-            for (uint32_t c = 0; c < 4; ++c)
-            {
-                // v1, v3 = v1 + v2 - v0, v2
-                // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
-                simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
-                temp              = _simd16_sub_ps(temp, verts[1].v[c]);
-                verts[1].v[c] =
-                    _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
-            }
-        }
-
-        return true;
-    }
-
-#endif
-    void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
-    {
-       // move to slot
-        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
-        {
-            uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
-#if USE_SIMD16_FRONTEND
-            uint32_t offset =
-                useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
-#else
-            uint32_t offset = pOffset[triIndex];
-#endif
-            offset += sizeof(SIMDVECTOR) * slot;
-            float* pVert = (float*)&tri[v];
-            for (uint32_t c = 0; c < 4; ++c)
-            {
-                float* pComponent = (float*)(this->pStreamBase + offset);
-                pVert[c]          = *pComponent;
-                offset += SIMD_WIDTH * sizeof(float);
-            }
-        }
-
-        // compute the implied 4th vertex, v3
-        if ((this->binTopology == TOP_RECT_LIST) && (triIndex % 2 == 1))
-        {
-            // v1, v3 = v1 + v2 - v0, v2
-            // v1 stored in tri[0], v0 stored in tri[1], v2 stored in tri[2]
-            float* pVert0 = (float*)&tri[1];
-            float* pVert1 = (float*)&tri[0];
-            float* pVert2 = (float*)&tri[2];
-            float* pVert3 = (float*)&tri[1];
-            for (uint32_t c = 0; c < 4; ++c)
-            {
-                pVert3[c] = pVert1[c] + pVert2[c] - pVert0[c];
-            }
-        }
-    }
-
-    uint32_t NumPrims() { return this->numPrimsAssembled; }
-
-    // Per-topology functions
-    void ProcessVertTriStrip(uint32_t index, bool finish)
-    {
-        this->vert[this->curIndex] = index;
-        this->curIndex++;
-        if (this->curIndex == 3)
-        {
-            // assembled enough verts for prim, add to gather indices
-            this->indices[0][this->numPrimsAssembled] = this->vert[0];
-            if (reverseWinding)
-            {
-                this->indices[1][this->numPrimsAssembled] = this->vert[2];
-                this->indices[2][this->numPrimsAssembled] = this->vert[1];
-            }
-            else
-            {
-                this->indices[1][this->numPrimsAssembled] = this->vert[1];
-                this->indices[2][this->numPrimsAssembled] = this->vert[2];
-            }
-
-            // increment numPrimsAssembled
-            this->numPrimsAssembled++;
-
-            // set up next prim state
-            this->vert[0]  = this->vert[1];
-            this->vert[1]  = this->vert[2];
-            this->curIndex = 2;
-            this->reverseWinding ^= 1;
-        }
-    }
-
-    template <bool gsEnabled>
-    void AssembleTriStripAdj()
-    {
-        if (!gsEnabled)
-        {
-            this->vert[1] = this->vert[2];
-            this->vert[2] = this->vert[4];
-
-            this->indices[0][this->numPrimsAssembled] = this->vert[0];
-            this->indices[1][this->numPrimsAssembled] = this->vert[1];
-            this->indices[2][this->numPrimsAssembled] = this->vert[2];
-
-            this->vert[4] = this->vert[2];
-            this->vert[2] = this->vert[1];
-        }
-        else
-        {
-            this->indices[0][this->numPrimsAssembled] = this->vert[0];
-            this->indices[1][this->numPrimsAssembled] = this->vert[1];
-            this->indices[2][this->numPrimsAssembled] = this->vert[2];
-            this->indices[3][this->numPrimsAssembled] = this->vert[3];
-            this->indices[4][this->numPrimsAssembled] = this->vert[4];
-            this->indices[5][this->numPrimsAssembled] = this->vert[5];
-        }
-        this->numPrimsAssembled++;
-    }
-
-    template <bool gsEnabled>
-    void ProcessVertTriStripAdj(uint32_t index, bool finish)
-    {
-        // handle last primitive of tristrip
-        if (finish && this->adjExtraVert != -1)
-        {
-            this->vert[3] = this->adjExtraVert;
-            AssembleTriStripAdj<gsEnabled>();
-            this->adjExtraVert = -1;
-            return;
-        }
-
-        switch (this->curIndex)
-        {
-        case 0:
-        case 1:
-        case 2:
-        case 4:
-            this->vert[this->curIndex] = index;
-            this->curIndex++;
-            break;
-        case 3:
-            this->vert[5] = index;
-            this->curIndex++;
-            break;
-        case 5:
-            if (this->adjExtraVert == -1)
-            {
-                this->adjExtraVert = index;
-            }
-            else
-            {
-                this->vert[3] = index;
-                if (!gsEnabled)
-                {
-                    AssembleTriStripAdj<gsEnabled>();
-
-                    uint32_t nextTri[6];
-                    if (this->reverseWinding)
-                    {
-                        nextTri[0] = this->vert[4];
-                        nextTri[1] = this->vert[0];
-                        nextTri[2] = this->vert[2];
-                        nextTri[4] = this->vert[3];
-                        nextTri[5] = this->adjExtraVert;
-                    }
-                    else
-                    {
-                        nextTri[0] = this->vert[2];
-                        nextTri[1] = this->adjExtraVert;
-                        nextTri[2] = this->vert[3];
-                        nextTri[4] = this->vert[4];
-                        nextTri[5] = this->vert[0];
-                    }
-                    for (uint32_t i = 0; i < 6; ++i)
-                    {
-                        this->vert[i] = nextTri[i];
-                    }
-
-                    this->adjExtraVert = -1;
-                    this->reverseWinding ^= 1;
-                }
-                else
-                {
-                    this->curIndex++;
-                }
-            }
-            break;
-        case 6:
-            SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
-            AssembleTriStripAdj<gsEnabled>();
-
-            uint32_t nextTri[6];
-            if (this->reverseWinding)
-            {
-                nextTri[0] = this->vert[4];
-                nextTri[1] = this->vert[0];
-                nextTri[2] = this->vert[2];
-                nextTri[4] = this->vert[3];
-                nextTri[5] = this->adjExtraVert;
-            }
-            else
-            {
-                nextTri[0] = this->vert[2];
-                nextTri[1] = this->adjExtraVert;
-                nextTri[2] = this->vert[3];
-                nextTri[4] = this->vert[4];
-                nextTri[5] = this->vert[0];
-            }
-            for (uint32_t i = 0; i < 6; ++i)
-            {
-                this->vert[i] = nextTri[i];
-            }
-            this->reverseWinding ^= 1;
-            this->adjExtraVert = index;
-            this->curIndex--;
-            break;
-        }
-    }
-
-    void ProcessVertTriList(uint32_t index, bool finish)
-    {
-        this->vert[this->curIndex] = index;
-        this->curIndex++;
-        if (this->curIndex == 3)
-        {
-            // assembled enough verts for prim, add to gather indices
-            this->indices[0][this->numPrimsAssembled] = this->vert[0];
-            this->indices[1][this->numPrimsAssembled] = this->vert[1];
-            this->indices[2][this->numPrimsAssembled] = this->vert[2];
-
-            // increment numPrimsAssembled
-            this->numPrimsAssembled++;
-
-            // set up next prim state
-            this->curIndex = 0;
-        }
-    }
-
-    void ProcessVertTriListAdj(uint32_t index, bool finish)
-    {
-        this->vert[this->curIndex] = index;
-        this->curIndex++;
-        if (this->curIndex == 6)
-        {
-            // assembled enough verts for prim, add to gather indices
-            this->indices[0][this->numPrimsAssembled] = this->vert[0];
-            this->indices[1][this->numPrimsAssembled] = this->vert[1];
-            this->indices[2][this->numPrimsAssembled] = this->vert[2];
-            this->indices[3][this->numPrimsAssembled] = this->vert[3];
-            this->indices[4][this->numPrimsAssembled] = this->vert[4];
-            this->indices[5][this->numPrimsAssembled] = this->vert[5];
-
-            // increment numPrimsAssembled
-            this->numPrimsAssembled++;
-
-            // set up next prim state
-            this->curIndex = 0;
-        }
-    }
-
-    void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
-    {
-        this->vert[this->curIndex] = index;
-        this->curIndex++;
-        if (this->curIndex == 6)
-        {
-            // assembled enough verts for prim, add to gather indices
-            this->indices[0][this->numPrimsAssembled] = this->vert[0];
-            this->indices[1][this->numPrimsAssembled] = this->vert[2];
-            this->indices[2][this->numPrimsAssembled] = this->vert[4];
-
-            // increment numPrimsAssembled
-            this->numPrimsAssembled++;
-
-            // set up next prim state
-            this->curIndex = 0;
-        }
-    }
-
-    void ProcessVertLineList(uint32_t index, bool finish)
-    {
-        this->vert[this->curIndex] = index;
-        this->curIndex++;
-        if (this->curIndex == 2)
-        {
-            this->indices[0][this->numPrimsAssembled] = this->vert[0];
-            this->indices[1][this->numPrimsAssembled] = this->vert[1];
-
-            this->numPrimsAssembled++;
-            this->curIndex = 0;
-        }
-    }
-
-    void ProcessVertLineStrip(uint32_t index, bool finish)
-    {
-        this->vert[this->curIndex] = index;
-        this->curIndex++;
-        if (this->curIndex == 2)
-        {
-            // assembled enough verts for prim, add to gather indices
-            this->indices[0][this->numPrimsAssembled] = this->vert[0];
-            this->indices[1][this->numPrimsAssembled] = this->vert[1];
-
-            // increment numPrimsAssembled
-            this->numPrimsAssembled++;
-
-            // set up next prim state
-            this->vert[0]  = this->vert[1];
-            this->curIndex = 1;
-        }
-    }
-
-    void ProcessVertLineStripAdj(uint32_t index, bool finish)
-    {
-        this->vert[this->curIndex] = index;
-        this->curIndex++;
-        if (this->curIndex == 4)
-        {
-            // assembled enough verts for prim, add to gather indices
-            this->indices[0][this->numPrimsAssembled] = this->vert[0];
-            this->indices[1][this->numPrimsAssembled] = this->vert[1];
-            this->indices[2][this->numPrimsAssembled] = this->vert[2];
-            this->indices[3][this->numPrimsAssembled] = this->vert[3];
-
-            // increment numPrimsAssembled
-            this->numPrimsAssembled++;
-
-            // set up next prim state
-            this->vert[0]  = this->vert[1];
-            this->vert[1]  = this->vert[2];
-            this->vert[2]  = this->vert[3];
-            this->curIndex = 3;
-        }
-    }
-
-    void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
-    {
-        this->vert[this->curIndex] = index;
-        this->curIndex++;
-        if (this->curIndex == 4)
-        {
-            // assembled enough verts for prim, add to gather indices
-            this->indices[0][this->numPrimsAssembled] = this->vert[1];
-            this->indices[1][this->numPrimsAssembled] = this->vert[2];
-
-            // increment numPrimsAssembled
-            this->numPrimsAssembled++;
-
-            // set up next prim state
-            this->vert[0]  = this->vert[1];
-            this->vert[1]  = this->vert[2];
-            this->vert[2]  = this->vert[3];
-            this->curIndex = 3;
-        }
-    }
-
-    void ProcessVertLineListAdj(uint32_t index, bool finish)
-    {
-        this->vert[this->curIndex] = index;
-        this->curIndex++;
-        if (this->curIndex == 4)
-        {
-            this->indices[0][this->numPrimsAssembled] = this->vert[0];
-            this->indices[1][this->numPrimsAssembled] = this->vert[1];
-            this->indices[2][this->numPrimsAssembled] = this->vert[2];
-            this->indices[3][this->numPrimsAssembled] = this->vert[3];
-
-            this->numPrimsAssembled++;
-            this->curIndex = 0;
-        }
-    }
-
-    void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
-    {
-        this->vert[this->curIndex] = index;
-        this->curIndex++;
-        if (this->curIndex == 4)
-        {
-            this->indices[0][this->numPrimsAssembled] = this->vert[1];
-            this->indices[1][this->numPrimsAssembled] = this->vert[2];
-
-            this->numPrimsAssembled++;
-            this->curIndex = 0;
-        }
-    }
-
-    void ProcessVertPointList(uint32_t index, bool finish)
-    {
-        this->vert[this->curIndex] = index;
-        this->curIndex++;
-        if (this->curIndex == 1)
-        {
-            this->indices[0][this->numPrimsAssembled] = this->vert[0];
-            this->numPrimsAssembled++;
-            this->curIndex = 0;
-        }
-    }
-
-    void ProcessVertRectList(uint32_t index, bool finish)
-    {
-        this->vert[this->curIndex] = index;
-        this->curIndex++;
-        if (this->curIndex == 3)
-        {
-            // assembled enough verts for prim, add to gather indices
-            this->indices[0][this->numPrimsAssembled] = this->vert[0];
-            this->indices[1][this->numPrimsAssembled] = this->vert[1];
-            this->indices[2][this->numPrimsAssembled] = this->vert[2];
-
-            // second triangle in the rectangle
-            // v1, v3 = v1 + v2 - v0, v2
-            this->indices[0][this->numPrimsAssembled + 1] = this->vert[1];
-            this->indices[1][this->numPrimsAssembled + 1] = this->vert[0];
-            this->indices[2][this->numPrimsAssembled + 1] = this->vert[2];
-
-            // increment numPrimsAssembled
-            this->numPrimsAssembled += 2;
-
-            // set up next prim state
-            this->curIndex = 0;
-        }
-    }
-};
-
-// Primitive Assembly for data output from the DomainShader.
-struct PA_TESS : PA_STATE
-{
-    PA_TESS(DRAW_CONTEXT*     in_pDC,
-            const SIMDSCALAR* in_pVertData,
-            uint32_t          in_attributeStrideInVectors,
-            uint32_t          in_vertexStride,
-            uint32_t          in_numAttributes,
-            uint32_t* (&in_ppIndices)[3],
-            uint32_t           in_numPrims,
-            PRIMITIVE_TOPOLOGY in_binTopology,
-            uint32_t           numVertsPerPrim,
-            bool               SOA = true) :
-
-        PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
-        m_pVertexData(in_pVertData), m_attributeStrideInVectors(in_attributeStrideInVectors),
-        m_numAttributes(in_numAttributes), m_numPrims(in_numPrims), m_SOA(SOA)
-    {
-#if USE_SIMD16_FRONTEND
-        m_vPrimId = _simd16_setzero_si();
-#else
-        m_vPrimId = _simd_setzero_si();
-#endif
-        binTopology    = in_binTopology;
-        m_ppIndices[0] = in_ppIndices[0];
-        m_ppIndices[1] = in_ppIndices[1];
-        m_ppIndices[2] = in_ppIndices[2];
-
-        switch (binTopology)
-        {
-        case TOP_POINT_LIST:
-            m_numVertsPerPrim = 1;
-            break;
-
-        case TOP_LINE_LIST:
-            m_numVertsPerPrim = 2;
-            break;
-
-        case TOP_TRIANGLE_LIST:
-            m_numVertsPerPrim = 3;
-            break;
-
-        default:
-            SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
-            break;
-        }
-    }
-
-    bool HasWork() { return m_numPrims != 0; }
-
-    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
-    {
-        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
-        return junkVector;
-    }
-
-#if ENABLE_AVX512_SIMD16
-    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
-    {
-        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
-        return junkVector_simd16;
-    }
-
-#endif
-    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
-    {
-        SWR_ASSERT(numPrims <= SIMD_WIDTH);
-#if USE_SIMD16_FRONTEND
-        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = {
-            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0};
-
-        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
-#else
-        static const OSALIGNLINE(int32_t)
-            maskGen[SIMD_WIDTH * 2] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
-
-        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
-#endif
-    }
-
-    bool Assemble(uint32_t slot, simdvector verts[])
-    {
-        SWR_ASSERT(slot < m_numAttributes);
-
-        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
-        if (0 == numPrimsToAssemble)
-        {
-            return false;
-        }
-
-        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
-
-        const float* pBaseAttrib;
-        if (m_SOA)
-        {
-            pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
-        }
-        else
-        {
-            const float* pVertData = (const float*)m_pVertexData;
-            pBaseAttrib            = pVertData + slot * 4;
-        }
-
-        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
-        {
-#if USE_SIMD16_FRONTEND
-            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
-#else
-            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
-#endif
-
-            const float* pBase = pBaseAttrib;
-            for (uint32_t c = 0; c < 4; ++c)
-            {
-#if USE_SIMD16_FRONTEND
-                simd16scalar temp =
-                    _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
-                                              pBase,
-                                              indices,
-                                              _simd16_castsi_ps(mask),
-                                              4 /* gcc doesn't like sizeof(float) */);
-
-                verts[i].v[c] =
-                    useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
-#else
-                verts[i].v[c] = _simd_mask_i32gather_ps(_simd_setzero_ps(),
-                                                        pBase,
-                                                        indices,
-                                                        _simd_castsi_ps(mask),
-                                                        4); // gcc doesn't like sizeof(float)
-#endif
-                if (m_SOA)
-                {
-                    pBase += m_attributeStrideInVectors * SIMD_WIDTH;
-                }
-                else
-                {
-                    pBase += sizeof(float);
-                }
-            }
-        }
-
-        return true;
-    }
-
-#if ENABLE_AVX512_SIMD16
-    bool Assemble(uint32_t slot, simd16vector verts[])
-    {
-        SWR_ASSERT(slot < m_numAttributes);
-
-        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
-        if (0 == numPrimsToAssemble)
-        {
-            return false;
-        }
-
-        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
-
-        const float* pBaseAttrib;
-        if (m_SOA)
-        {
-            pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
-        }
-        else
-        {
-            const float* pVertData = (const float*)m_pVertexData;
-            pBaseAttrib            = pVertData + slot * 4;
-        }
-
-        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
-        {
-#if USE_SIMD16_FRONTEND
-            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
-            if (!m_SOA)
-            {
-                indices = _simd16_mullo_epi32(indices, _simd16_set1_epi32(vertexStride / 4));
-            }
-#else
-            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
-#endif
-
-            const float* pBase = pBaseAttrib;
-            for (uint32_t c = 0; c < 4; ++c)
-            {
-#if USE_SIMD16_FRONTEND
-                verts[i].v[c] = _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
-                                                          pBase,
-                                                          indices,
-                                                          _simd16_castsi_ps(mask),
-                                                          4 /* gcc doesn't like sizeof(float) */);
-#else
-                simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(),
-                                                          pBase,
-                                                          indices,
-                                                          _simd_castsi_ps(mask),
-                                                          4 /* gcc doesn't like sizeof(float) */);
-                verts[i].v[c]   = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
-#endif
-                if (m_SOA)
-                {
-                    pBase += m_attributeStrideInVectors * SIMD_WIDTH;
-                }
-                else
-                {
-                    pBase++;
-                }
-            }
-        }
-
-        return true;
-    }
-
-#endif
-    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
-    {
-        SWR_ASSERT(slot < m_numAttributes);
-
-
-        SWR_ASSERT(primIndex < PA_TESS::NumPrims());
-
-        const float* pVertDataBase;
-        if (m_SOA)
-        {
-            pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
-        }
-        else
-        {
-            const float* pVertData = (const float*)m_pVertexData;
-            pVertDataBase          = pVertData + slot * 4;
-        };
-        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
-        {
-#if USE_SIMD16_FRONTEND
-            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2]
-                                                : m_ppIndices[i][primIndex];
-            if (!m_SOA)
-            {
-                index *= (vertexStride / 4);
-            }
-#else
-            uint32_t index = m_ppIndices[i][primIndex];
-#endif
-            const float* pVertData = pVertDataBase;
-            float*       pVert     = (float*)&verts[i];
-
-            for (uint32_t c = 0; c < 4; ++c)
-            {
-                pVert[c] = pVertData[index];
-                if (m_SOA)
-                {
-                    pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
-                }
-                else
-                {
-                    pVertData++;
-                }
-            }
-
-        }
-    }
-
-    bool NextPrim()
-    {
-        uint32_t numPrims = PA_TESS::NumPrims();
-        m_numPrims -= numPrims;
-        m_ppIndices[0] += numPrims;
-        m_ppIndices[1] += numPrims;
-        m_ppIndices[2] += numPrims;
-
-        return HasWork();
-    }
-
-    SIMDVERTEX& GetNextVsOutput()
-    {
-        SWR_NOT_IMPL;
-        return junkVertex;
-    }
-
-    bool GetNextStreamOutput()
-    {
-        SWR_NOT_IMPL;
-        return false;
-    }
-
-    SIMDMASK& GetNextVsIndices()
-    {
-        SWR_NOT_IMPL;
-        return junkIndices;
-    }
-
-    uint32_t NumPrims() { return std::min<uint32_t>(m_numPrims, SIMD_WIDTH); }
-
-    void Reset() { SWR_NOT_IMPL; }
-
-    SIMDSCALARI GetPrimID(uint32_t startID)
-    {
-#if USE_SIMD16_FRONTEND
-        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
-#else
-        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
-#endif
-    }
-
-private:
-    const SIMDSCALAR* m_pVertexData              = nullptr;
-    uint32_t          m_attributeStrideInVectors = 0;
-    uint32_t          m_numAttributes            = 0;
-    uint32_t          m_numPrims                 = 0;
-    uint32_t*         m_ppIndices[3];
-
-    uint32_t m_numVertsPerPrim = 0;
-
-    SIMDSCALARI m_vPrimId;
-
-    simdvector junkVector; // junk simdvector for unimplemented API
-#if ENABLE_AVX512_SIMD16
-    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
-#endif
-    SIMDVERTEX junkVertex;  // junk SIMDVERTEX for unimplemented API
-    SIMDMASK   junkIndices; // temporary index store for unused virtual function
-
-    bool m_SOA;
-};
-
-// Primitive Assembler factory class, responsible for creating and initializing the correct
-// assembler based on state.
-template <typename IsIndexedT, typename IsCutIndexEnabledT>
-struct PA_FACTORY
-{
-    PA_FACTORY(DRAW_CONTEXT*         pDC,
-               PRIMITIVE_TOPOLOGY    in_topo,
-               uint32_t              numVerts,
-               PA_STATE::SIMDVERTEX* pVertexStore,
-               uint32_t              vertexStoreSize,
-               uint32_t              vertexStride,
-               uint32_t              numVertsPerPrim) :
-        topo(in_topo)
-    {
-#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
-        const API_STATE& state = GetApiState(pDC);
-        if ((IsIndexedT::value && IsCutIndexEnabledT::value &&
-             (topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || topo == TOP_LINE_LIST ||
-              topo == TOP_LINE_STRIP || topo == TOP_TRIANGLE_LIST)) ||
-
-            // non-indexed draws with adjacency topologies must use cut-aware PA until we add
-            // support for them in the optimized PA
-            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ ||
-             topo == TOP_TRI_STRIP_ADJ))
-        {
-            memset(&indexStore, 0, sizeof(indexStore));
-            uint32_t numAttribs = state.feNumAttributes;
-
-            new (&this->paCut) PA_STATE_CUT(pDC,
-                                            reinterpret_cast<uint8_t*>(pVertexStore),
-                                            vertexStoreSize * PA_STATE::SIMD_WIDTH,
-                                            vertexStride,
-                                            &this->indexStore[0],
-                                            numVerts,
-                                            numAttribs,
-                                            state.topology,
-                                            false,
-                                            numVertsPerPrim);
-            cutPA = true;
-        }
-        else
-#endif
-        {
-            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
-            new (&this->paOpt) PA_STATE_OPT(pDC,
-                                            numPrims,
-                                            reinterpret_cast<uint8_t*>(pVertexStore),
-                                            vertexStoreSize * PA_STATE::SIMD_WIDTH,
-                                            vertexStride,
-                                            false,
-                                            numVertsPerPrim);
-            cutPA = false;
-        }
-    }
-
-    PA_STATE& GetPA()
-    {
-#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
-        if (cutPA)
-        {
-            return this->paCut;
-        }
-        else
-#endif
-        {
-            return this->paOpt;
-        }
-    }
-
-    PA_STATE_OPT paOpt;
-    PA_STATE_CUT paCut;
-
-    bool cutPA{false};
-
-    PRIMITIVE_TOPOLOGY topo{TOP_UNKNOWN};
-
-    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
deleted file mode 100644
index 25d7156ac63..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
+++ /dev/null
@@ -1,3141 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file pa_avx.cpp
- *
- * @brief AVX implementation for primitive assembly.
- *        N primitives are assembled at a time, where N is the SIMD width.
- *        A state machine, that is specific for a given topology, drives the
- *        assembly of vertices into triangles.
- *
- ******************************************************************************/
-#include "context.h"
-#include "pa.h"
-#include "frontend.h"
-
-#if (KNOB_SIMD_WIDTH == 8)
-
-INLINE simd4scalar swizzleLane0(const simdscalar& x,
-                                const simdscalar& y,
-                                const simdscalar& z,
-                                const simdscalar& w)
-{
-    simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
-    simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
-    return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
-}
-
-INLINE simd4scalar swizzleLane1(const simdscalar& x,
-                                const simdscalar& y,
-                                const simdscalar& z,
-                                const simdscalar& w)
-{
-    simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
-    simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
-    return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
-}
-
-INLINE simd4scalar swizzleLane2(const simdscalar& x,
-                                const simdscalar& y,
-                                const simdscalar& z,
-                                const simdscalar& w)
-{
-    simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
-    simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
-    return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
-}
-
-INLINE simd4scalar swizzleLane3(const simdscalar& x,
-                                const simdscalar& y,
-                                const simdscalar& z,
-                                const simdscalar& w)
-{
-    simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
-    simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
-    return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
-}
-
-INLINE simd4scalar swizzleLane4(const simdscalar& x,
-                                const simdscalar& y,
-                                const simdscalar& z,
-                                const simdscalar& w)
-{
-    simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
-    simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
-    return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
-}
-
-INLINE simd4scalar swizzleLane5(const simdscalar& x,
-                                const simdscalar& y,
-                                const simdscalar& z,
-                                const simdscalar& w)
-{
-    simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
-    simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
-    return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
-}
-
-INLINE simd4scalar swizzleLane6(const simdscalar& x,
-                                const simdscalar& y,
-                                const simdscalar& z,
-                                const simdscalar& w)
-{
-    simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
-    simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
-    return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
-}
-
-INLINE simd4scalar swizzleLane7(const simdscalar& x,
-                                const simdscalar& y,
-                                const simdscalar& z,
-                                const simdscalar& w)
-{
-    simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
-    simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
-    return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
-}
-
-INLINE simd4scalar swizzleLane0(const simdvector& v)
-{
-    return swizzleLane0(v.x, v.y, v.z, v.w);
-}
-
-INLINE simd4scalar swizzleLane1(const simdvector& v)
-{
-    return swizzleLane1(v.x, v.y, v.z, v.w);
-}
-
-INLINE simd4scalar swizzleLane2(const simdvector& v)
-{
-    return swizzleLane2(v.x, v.y, v.z, v.w);
-}
-
-INLINE simd4scalar swizzleLane3(const simdvector& v)
-{
-    return swizzleLane3(v.x, v.y, v.z, v.w);
-}
-
-INLINE simd4scalar swizzleLane4(const simdvector& v)
-{
-    return swizzleLane4(v.x, v.y, v.z, v.w);
-}
-
-INLINE simd4scalar swizzleLane5(const simdvector& v)
-{
-    return swizzleLane5(v.x, v.y, v.z, v.w);
-}
-
-INLINE simd4scalar swizzleLane6(const simdvector& v)
-{
-    return swizzleLane6(v.x, v.y, v.z, v.w);
-}
-
-INLINE simd4scalar swizzleLane7(const simdvector& v)
-{
-    return swizzleLane7(v.x, v.y, v.z, v.w);
-}
-
-INLINE simd4scalar swizzleLaneN(const simdvector& v, int lane)
-{
-    switch (lane)
-    {
-    case 0:
-        return swizzleLane0(v);
-    case 1:
-        return swizzleLane1(v);
-    case 2:
-        return swizzleLane2(v);
-    case 3:
-        return swizzleLane3(v);
-    case 4:
-        return swizzleLane4(v);
-    case 5:
-        return swizzleLane5(v);
-    case 6:
-        return swizzleLane6(v);
-    case 7:
-        return swizzleLane7(v);
-    default:
-        return _mm_setzero_ps();
-    }
-}
-
-#if ENABLE_AVX512_SIMD16
-INLINE simd4scalar swizzleLane0(const simd16vector& v)
-{
-    return swizzleLane0(_simd16_extract_ps(v.x, 0),
-                        _simd16_extract_ps(v.y, 0),
-                        _simd16_extract_ps(v.z, 0),
-                        _simd16_extract_ps(v.w, 0));
-}
-
-INLINE simd4scalar swizzleLane1(const simd16vector& v)
-{
-    return swizzleLane1(_simd16_extract_ps(v.x, 0),
-                        _simd16_extract_ps(v.y, 0),
-                        _simd16_extract_ps(v.z, 0),
-                        _simd16_extract_ps(v.w, 0));
-}
-
-INLINE simd4scalar swizzleLane2(const simd16vector& v)
-{
-    return swizzleLane2(_simd16_extract_ps(v.x, 0),
-                        _simd16_extract_ps(v.y, 0),
-                        _simd16_extract_ps(v.z, 0),
-                        _simd16_extract_ps(v.w, 0));
-}
-
-INLINE simd4scalar swizzleLane3(const simd16vector& v)
-{
-    return swizzleLane3(_simd16_extract_ps(v.x, 0),
-                        _simd16_extract_ps(v.y, 0),
-                        _simd16_extract_ps(v.z, 0),
-                        _simd16_extract_ps(v.w, 0));
-}
-
-INLINE simd4scalar swizzleLane4(const simd16vector& v)
-{
-    return swizzleLane4(_simd16_extract_ps(v.x, 0),
-                        _simd16_extract_ps(v.y, 0),
-                        _simd16_extract_ps(v.z, 0),
-                        _simd16_extract_ps(v.w, 0));
-}
-
-INLINE simd4scalar swizzleLane5(const simd16vector& v)
-{
-    return swizzleLane5(_simd16_extract_ps(v.x, 0),
-                        _simd16_extract_ps(v.y, 0),
-                        _simd16_extract_ps(v.z, 0),
-                        _simd16_extract_ps(v.w, 0));
-}
-
-INLINE simd4scalar swizzleLane6(const simd16vector& v)
-{
-    return swizzleLane6(_simd16_extract_ps(v.x, 0),
-                        _simd16_extract_ps(v.y, 0),
-                        _simd16_extract_ps(v.z, 0),
-                        _simd16_extract_ps(v.w, 0));
-}
-
-INLINE simd4scalar swizzleLane7(const simd16vector& v)
-{
-    return swizzleLane7(_simd16_extract_ps(v.x, 0),
-                        _simd16_extract_ps(v.y, 0),
-                        _simd16_extract_ps(v.z, 0),
-                        _simd16_extract_ps(v.w, 0));
-}
-
-INLINE simd4scalar swizzleLane8(const simd16vector& v)
-{
-    return swizzleLane0(_simd16_extract_ps(v.x, 1),
-                        _simd16_extract_ps(v.y, 1),
-                        _simd16_extract_ps(v.z, 1),
-                        _simd16_extract_ps(v.w, 1));
-}
-
-INLINE simd4scalar swizzleLane9(const simd16vector& v)
-{
-    return swizzleLane1(_simd16_extract_ps(v.x, 1),
-                        _simd16_extract_ps(v.y, 1),
-                        _simd16_extract_ps(v.z, 1),
-                        _simd16_extract_ps(v.w, 1));
-}
-
-INLINE simd4scalar swizzleLaneA(const simd16vector& v)
-{
-    return swizzleLane2(_simd16_extract_ps(v.x, 1),
-                        _simd16_extract_ps(v.y, 1),
-                        _simd16_extract_ps(v.z, 1),
-                        _simd16_extract_ps(v.w, 1));
-}
-
-INLINE simd4scalar swizzleLaneB(const simd16vector& v)
-{
-    return swizzleLane3(_simd16_extract_ps(v.x, 1),
-                        _simd16_extract_ps(v.y, 1),
-                        _simd16_extract_ps(v.z, 1),
-                        _simd16_extract_ps(v.w, 1));
-}
-
-INLINE simd4scalar swizzleLaneC(const simd16vector& v)
-{
-    return swizzleLane4(_simd16_extract_ps(v.x, 1),
-                        _simd16_extract_ps(v.y, 1),
-                        _simd16_extract_ps(v.z, 1),
-                        _simd16_extract_ps(v.w, 1));
-}
-
-INLINE simd4scalar swizzleLaneD(const simd16vector& v)
-{
-    return swizzleLane5(_simd16_extract_ps(v.x, 1),
-                        _simd16_extract_ps(v.y, 1),
-                        _simd16_extract_ps(v.z, 1),
-                        _simd16_extract_ps(v.w, 1));
-}
-
-INLINE simd4scalar swizzleLaneE(const simd16vector& v)
-{
-    return swizzleLane6(_simd16_extract_ps(v.x, 1),
-                        _simd16_extract_ps(v.y, 1),
-                        _simd16_extract_ps(v.z, 1),
-                        _simd16_extract_ps(v.w, 1));
-}
-
-INLINE simd4scalar swizzleLaneF(const simd16vector& v)
-{
-    return swizzleLane7(_simd16_extract_ps(v.x, 1),
-                        _simd16_extract_ps(v.y, 1),
-                        _simd16_extract_ps(v.z, 1),
-                        _simd16_extract_ps(v.w, 1));
-}
-
-INLINE simd4scalar swizzleLaneN(const simd16vector& v, int lane)
-{
-    switch (lane)
-    {
-    case 0:
-        return swizzleLane0(v);
-    case 1:
-        return swizzleLane1(v);
-    case 2:
-        return swizzleLane2(v);
-    case 3:
-        return swizzleLane3(v);
-    case 4:
-        return swizzleLane4(v);
-    case 5:
-        return swizzleLane5(v);
-    case 6:
-        return swizzleLane6(v);
-    case 7:
-        return swizzleLane7(v);
-    case 8:
-        return swizzleLane8(v);
-    case 9:
-        return swizzleLane9(v);
-    case 10:
-        return swizzleLaneA(v);
-    case 11:
-        return swizzleLaneB(v);
-    case 12:
-        return swizzleLaneC(v);
-    case 13:
-        return swizzleLaneD(v);
-    case 14:
-        return swizzleLaneE(v);
-    case 15:
-        return swizzleLaneF(v);
-    default:
-        return _mm_setzero_ps();
-    }
-}
-
-#endif
-bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-#if ENABLE_AVX512_SIMD16
-bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-#endif
-void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
-
-bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-#if ENABLE_AVX512_SIMD16
-bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-#endif
-void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
-
-bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-#if ENABLE_AVX512_SIMD16
-bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-#endif
-void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
-
-bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-#if ENABLE_AVX512_SIMD16
-bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-#endif
-void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
-
-bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-#if ENABLE_AVX512_SIMD16
-bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-#endif
-void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
-
-bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-#if ENABLE_AVX512_SIMD16
-bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-#endif
-void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
-
-bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-#if ENABLE_AVX512_SIMD16
-bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-#endif
-void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
-
-bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-#if ENABLE_AVX512_SIMD16
-bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-#endif
-void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
-
-bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-#if ENABLE_AVX512_SIMD16
-bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-#endif
-void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
-
-template <uint32_t TotalControlPoints>
-void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
-{
-    // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
-    // KNOB_SIMD_WIDTH * 1 patch.  This function is called once per attribute.
-    // Each attribute has 4 components.
-
-    /// @todo Optimize this
-
-#if USE_SIMD16_FRONTEND
-    if (pa.useAlternateOffset)
-    {
-        primIndex += KNOB_SIMD_WIDTH;
-    }
-
-#endif
-    float* pOutVec = (float*)verts;
-
-    for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
-    {
-        uint32_t input_cp = primIndex * TotalControlPoints + cp;
-#if USE_SIMD16_FRONTEND
-        uint32_t input_vec  = input_cp / KNOB_SIMD16_WIDTH;
-        uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
-
-#else
-        uint32_t input_vec  = input_cp / KNOB_SIMD_WIDTH;
-        uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
-
-#endif
-        // Loop over all components of the attribute
-        for (uint32_t i = 0; i < 4; ++i)
-        {
-#if USE_SIMD16_FRONTEND
-            const float* pInputVec =
-                (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
-#else
-            const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
-#endif
-            pOutVec[cp * 4 + i] = pInputVec[input_lane];
-        }
-    }
-}
-
-template <uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
-static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-    SetNextPaState(pa,
-                   PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
-                   PaPatchListSingle<TotalControlPoints>);
-
-    return false;
-}
-
-template <uint32_t TotalControlPoints>
-static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-    // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
-    // KNOB_SIMD_WIDTH * 1 patch.  This function is called once per attribute.
-    // Each attribute has 4 components.
-
-    /// @todo Optimize this
-
-#if USE_SIMD16_FRONTEND
-    uint32_t lane_offset = 0;
-
-    if (pa.useAlternateOffset)
-    {
-        lane_offset = KNOB_SIMD_WIDTH;
-    }
-
-#endif
-    // Loop over all components of the attribute
-    for (uint32_t i = 0; i < 4; ++i)
-    {
-        for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
-        {
-            float vec[KNOB_SIMD_WIDTH];
-            for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
-            {
-#if USE_SIMD16_FRONTEND
-                uint32_t input_cp   = (lane + lane_offset) * TotalControlPoints + cp;
-                uint32_t input_vec  = input_cp / KNOB_SIMD16_WIDTH;
-                uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
-
-                const float* pInputVec =
-                    (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
-#else
-                uint32_t input_cp   = lane * TotalControlPoints + cp;
-                uint32_t input_vec  = input_cp / KNOB_SIMD_WIDTH;
-                uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
-
-                const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
-#endif
-                vec[lane] = pInputVec[input_lane];
-            }
-            verts[cp][i] = _simd_loadu_ps(vec);
-        }
-    }
-
-    SetNextPaState(pa,
-                   PaPatchList<TotalControlPoints>,
-                   PaPatchListSingle<TotalControlPoints>,
-                   0,
-                   PA_STATE_OPT::SIMD_WIDTH,
-                   true);
-
-    return true;
-}
-
-#if ENABLE_AVX512_SIMD16
-template <uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
-static bool PaPatchList_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    SetNextPaState_simd16(pa,
-                          PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>,
-                          PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
-                          PaPatchListSingle<TotalControlPoints>);
-
-    return false;
-}
-
-template <uint32_t TotalControlPoints>
-static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
-    // KNOB_SIMD16_WIDTH * 1 patch.  This function is called once per attribute.
-    // Each attribute has 4 components.
-
-    /// @todo Optimize this
-
-    // Loop over all components of the attribute
-    for (uint32_t i = 0; i < 4; ++i)
-    {
-        for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
-        {
-            float vec[KNOB_SIMD16_WIDTH];
-            for (uint32_t lane = 0; lane < KNOB_SIMD16_WIDTH; ++lane)
-            {
-                uint32_t input_cp   = lane * TotalControlPoints + cp;
-                uint32_t input_vec  = input_cp / KNOB_SIMD16_WIDTH;
-                uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
-
-                const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
-                vec[lane]              = pInputVec[input_lane];
-            }
-            verts[cp][i] = _simd16_loadu_ps(vec);
-        }
-    }
-
-    SetNextPaState_simd16(pa,
-                          PaPatchList_simd16<TotalControlPoints>,
-                          PaPatchList<TotalControlPoints>,
-                          PaPatchListSingle<TotalControlPoints>,
-                          0,
-                          PA_STATE_OPT::SIMD_WIDTH,
-                          true);
-
-    return true;
-}
-
-#endif
-#define PA_PATCH_LIST_TERMINATOR(N)                                              \
-    template <>                                                                  \
-    bool PaPatchList<N, N>(PA_STATE_OPT & pa, uint32_t slot, simdvector verts[]) \
-    {                                                                            \
-        return PaPatchListTerm<N>(pa, slot, verts);                              \
-    }
-PA_PATCH_LIST_TERMINATOR(1)
-PA_PATCH_LIST_TERMINATOR(2)
-PA_PATCH_LIST_TERMINATOR(3)
-PA_PATCH_LIST_TERMINATOR(4)
-PA_PATCH_LIST_TERMINATOR(5)
-PA_PATCH_LIST_TERMINATOR(6)
-PA_PATCH_LIST_TERMINATOR(7)
-PA_PATCH_LIST_TERMINATOR(8)
-PA_PATCH_LIST_TERMINATOR(9)
-PA_PATCH_LIST_TERMINATOR(10)
-PA_PATCH_LIST_TERMINATOR(11)
-PA_PATCH_LIST_TERMINATOR(12)
-PA_PATCH_LIST_TERMINATOR(13)
-PA_PATCH_LIST_TERMINATOR(14)
-PA_PATCH_LIST_TERMINATOR(15)
-PA_PATCH_LIST_TERMINATOR(16)
-PA_PATCH_LIST_TERMINATOR(17)
-PA_PATCH_LIST_TERMINATOR(18)
-PA_PATCH_LIST_TERMINATOR(19)
-PA_PATCH_LIST_TERMINATOR(20)
-PA_PATCH_LIST_TERMINATOR(21)
-PA_PATCH_LIST_TERMINATOR(22)
-PA_PATCH_LIST_TERMINATOR(23)
-PA_PATCH_LIST_TERMINATOR(24)
-PA_PATCH_LIST_TERMINATOR(25)
-PA_PATCH_LIST_TERMINATOR(26)
-PA_PATCH_LIST_TERMINATOR(27)
-PA_PATCH_LIST_TERMINATOR(28)
-PA_PATCH_LIST_TERMINATOR(29)
-PA_PATCH_LIST_TERMINATOR(30)
-PA_PATCH_LIST_TERMINATOR(31)
-PA_PATCH_LIST_TERMINATOR(32)
-#undef PA_PATCH_LIST_TERMINATOR
-
-#if ENABLE_AVX512_SIMD16
-#define PA_PATCH_LIST_TERMINATOR_SIMD16(N)                                                \
-    template <>                                                                           \
-    bool PaPatchList_simd16<N, N>(PA_STATE_OPT & pa, uint32_t slot, simd16vector verts[]) \
-    {                                                                                     \
-        return PaPatchListTerm_simd16<N>(pa, slot, verts);                                \
-    }
-PA_PATCH_LIST_TERMINATOR_SIMD16(1)
-PA_PATCH_LIST_TERMINATOR_SIMD16(2)
-PA_PATCH_LIST_TERMINATOR_SIMD16(3)
-PA_PATCH_LIST_TERMINATOR_SIMD16(4)
-PA_PATCH_LIST_TERMINATOR_SIMD16(5)
-PA_PATCH_LIST_TERMINATOR_SIMD16(6)
-PA_PATCH_LIST_TERMINATOR_SIMD16(7)
-PA_PATCH_LIST_TERMINATOR_SIMD16(8)
-PA_PATCH_LIST_TERMINATOR_SIMD16(9)
-PA_PATCH_LIST_TERMINATOR_SIMD16(10)
-PA_PATCH_LIST_TERMINATOR_SIMD16(11)
-PA_PATCH_LIST_TERMINATOR_SIMD16(12)
-PA_PATCH_LIST_TERMINATOR_SIMD16(13)
-PA_PATCH_LIST_TERMINATOR_SIMD16(14)
-PA_PATCH_LIST_TERMINATOR_SIMD16(15)
-PA_PATCH_LIST_TERMINATOR_SIMD16(16)
-PA_PATCH_LIST_TERMINATOR_SIMD16(17)
-PA_PATCH_LIST_TERMINATOR_SIMD16(18)
-PA_PATCH_LIST_TERMINATOR_SIMD16(19)
-PA_PATCH_LIST_TERMINATOR_SIMD16(20)
-PA_PATCH_LIST_TERMINATOR_SIMD16(21)
-PA_PATCH_LIST_TERMINATOR_SIMD16(22)
-PA_PATCH_LIST_TERMINATOR_SIMD16(23)
-PA_PATCH_LIST_TERMINATOR_SIMD16(24)
-PA_PATCH_LIST_TERMINATOR_SIMD16(25)
-PA_PATCH_LIST_TERMINATOR_SIMD16(26)
-PA_PATCH_LIST_TERMINATOR_SIMD16(27)
-PA_PATCH_LIST_TERMINATOR_SIMD16(28)
-PA_PATCH_LIST_TERMINATOR_SIMD16(29)
-PA_PATCH_LIST_TERMINATOR_SIMD16(30)
-PA_PATCH_LIST_TERMINATOR_SIMD16(31)
-PA_PATCH_LIST_TERMINATOR_SIMD16(32)
-#undef PA_PATCH_LIST_TERMINATOR_SIMD16
-
-#endif
-bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-    SetNextPaState(pa, PaTriList1, PaTriListSingle0);
-    return false; // Not enough vertices to assemble 4 or 8 triangles.
-}
-
-bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-    SetNextPaState(pa, PaTriList2, PaTriListSingle0);
-    return false; // Not enough vertices to assemble 8 triangles.
-}
-
-bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-#if KNOB_ARCH == KNOB_ARCH_AVX
-#if USE_SIMD16_FRONTEND
-    simdvector a;
-    simdvector b;
-    simdvector c;
-
-    if (!pa.useAlternateOffset)
-    {
-        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
-        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(a_16[i], 0);
-            b[i] = _simd16_extract_ps(a_16[i], 1);
-            c[i] = _simd16_extract_ps(b_16[i], 0);
-        }
-    }
-    else
-    {
-        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
-        const simd16vector& c_16 = PaGetSimdVector_simd16(pa, 2, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(b_16[i], 1);
-            b[i] = _simd16_extract_ps(c_16[i], 0);
-            c[i] = _simd16_extract_ps(c_16[i], 1);
-        }
-    }
-
-#else
-    simdvector& a = PaGetSimdVector(pa, 0, slot);
-    simdvector& b = PaGetSimdVector(pa, 1, slot);
-    simdvector& c = PaGetSimdVector(pa, 2, slot);
-
-#endif
-    simdscalar s;
-
-    // Tri Pattern - provoking vertex is always v0
-    //  v0 -> 0 3 6 9  12 15 18 21
-    //  v1 -> 1 4 7 10 13 16 19 22
-    //  v2 -> 2 5 8 11 14 17 20 23
-
-    for (int i = 0; i < 4; ++i)
-    {
-        simdvector& v0 = verts[0];
-        v0[i]          = _simd_blend_ps(a[i], b[i], 0x92);
-        v0[i]          = _simd_blend_ps(v0[i], c[i], 0x24);
-        v0[i]          = _simd_permute_ps_i(v0[i], 0x6C);
-        s              = _simd_permute2f128_ps(v0[i], v0[i], 0x21);
-        v0[i]          = _simd_blend_ps(v0[i], s, 0x44);
-
-        simdvector& v1 = verts[1];
-        v1[i]          = _simd_blend_ps(a[i], b[i], 0x24);
-        v1[i]          = _simd_blend_ps(v1[i], c[i], 0x49);
-        v1[i]          = _simd_permute_ps_i(v1[i], 0xB1);
-        s              = _simd_permute2f128_ps(v1[i], v1[i], 0x21);
-        v1[i]          = _simd_blend_ps(v1[i], s, 0x66);
-
-        simdvector& v2 = verts[2];
-        v2[i]          = _simd_blend_ps(a[i], b[i], 0x49);
-        v2[i]          = _simd_blend_ps(v2[i], c[i], 0x92);
-        v2[i]          = _simd_permute_ps_i(v2[i], 0xC6);
-        s              = _simd_permute2f128_ps(v2[i], v2[i], 0x21);
-        v2[i]          = _simd_blend_ps(v2[i], s, 0x22);
-    }
-
-#elif KNOB_ARCH >= KNOB_ARCH_AVX2
-    const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
-    const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
-    const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
-
-#if USE_SIMD16_FRONTEND
-    simdvector a;
-    simdvector b;
-    simdvector c;
-
-    if (!pa.useAlternateOffset)
-    {
-        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
-        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(a_16[i], 0);
-            b[i] = _simd16_extract_ps(a_16[i], 1);
-            c[i] = _simd16_extract_ps(b_16[i], 0);
-        }
-    }
-    else
-    {
-        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
-        const simd16vector& c_16 = PaGetSimdVector_simd16(pa, 2, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(b_16[i], 1);
-            b[i] = _simd16_extract_ps(c_16[i], 0);
-            c[i] = _simd16_extract_ps(c_16[i], 1);
-        }
-    }
-
-#else
-    const simdvector& a = PaGetSimdVector(pa, 0, slot);
-    const simdvector& b = PaGetSimdVector(pa, 1, slot);
-    const simdvector& c = PaGetSimdVector(pa, 2, slot);
-
-#endif
-    //  v0 -> a0 a3 a6 b1 b4 b7 c2 c5
-    //  v1 -> a1 a4 a7 b2 b5 c0 c3 c6
-    //  v2 -> a2 a5 b0 b3 b6 c1 c4 c7
-
-    simdvector& v0 = verts[0];
-    simdvector& v1 = verts[1];
-    simdvector& v2 = verts[2];
-
-    // for simd x, y, z, and w
-    for (int i = 0; i < 4; ++i)
-    {
-        simdscalar temp0 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24);
-        simdscalar temp1 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49);
-        simdscalar temp2 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92);
-
-        v0[i] = _simd_permute_ps(temp0, perm0);
-        v1[i] = _simd_permute_ps(temp1, perm1);
-        v2[i] = _simd_permute_ps(temp2, perm2);
-    }
-
-#endif
-    SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
-    return true;
-}
-
-#if ENABLE_AVX512_SIMD16
-bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriList1, PaTriListSingle0);
-    return false; // Not enough vertices to assemble 16 triangles
-}
-
-bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriList2, PaTriListSingle0);
-    return false; // Not enough vertices to assemble 16 triangles
-}
-
-bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    // clang-format off
-
-#if KNOB_ARCH >= KNOB_ARCH_AVX2
-    const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11,  8, 5, 2, 15, 12,  9, 6, 3, 0);
-    const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12,  9, 6, 3,  0, 13, 10, 7, 4, 1);
-    const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3,  0, 13, 10, 7, 4,  1, 14, 11, 8, 5, 2);
-#else // KNOB_ARCH == KNOB_ARCH_AVX
-    simd16scalar perm0 = _simd16_setzero_ps();
-    simd16scalar perm1 = _simd16_setzero_ps();
-    simd16scalar perm2 = _simd16_setzero_ps();
-#endif
-
-    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
-    const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
-    const simd16vector& c = PaGetSimdVector_simd16(pa, 2, slot);
-
-    const simd16mask mask0 = 0x4924;
-    const simd16mask mask1 = 0x2492;
-    const simd16mask mask2 = 0x9249;
-
-    //  v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
-    //  v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
-    //  v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
-
-    simd16vector& v0 = verts[0];
-    simd16vector& v1 = verts[1];
-    simd16vector& v2 = verts[2];
-
-    // for simd16 x, y, z, and w
-    for (int i = 0; i < 4; i += 1)
-    {
-        simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
-        simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
-        simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float*>(&c[i]));
-
-        simd16scalar temp0 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask0), tempc, mask1);
-        simd16scalar temp1 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask2), tempc, mask0);
-        simd16scalar temp2 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask1), tempc, mask2);
-
-#if KNOB_ARCH >= KNOB_ARCH_AVX2
-        v0[i] = _simd16_permute_ps(temp0, perm0);
-        v1[i] = _simd16_permute_ps(temp1, perm1);
-        v2[i] = _simd16_permute_ps(temp2, perm2);
-#else // #if KNOB_ARCH == KNOB_ARCH_AVX
-
-        // the general permutes (above) are prohibitively slow to emulate on AVX (its scalar code)
-
-        temp0 = _simd16_permute_ps_i(temp0, 0x6C);           // (0, 3, 2, 1) => 00 11 01 10 => 0x6C
-        perm0 = _simd16_permute2f128_ps(temp0, temp0, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
-        temp0 = _simd16_blend_ps(temp0, perm0, 0x4444);      // 0010 0010 0010 0010
-        perm0 = _simd16_permute2f128_ps(temp0, temp0, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E
-        v0[i] = _simd16_blend_ps(temp0, perm0, 0x3838);      // 0001 1100 0001 1100
-
-        temp1 = _simd16_permute_ps_i(temp1, 0xB1);           // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
-        perm1 = _simd16_permute2f128_ps(temp1, temp1, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
-        temp1 = _simd16_blend_ps(temp1, perm1, 0x6666);      // 0010 0010 0010 0010
-        perm1 = _simd16_permute2f128_ps(temp1, temp1, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E
-        v1[i] = _simd16_blend_ps(temp1, perm1, 0x1818);      // 0001 1000 0001 1000
-
-        temp2 = _simd16_permute_ps_i(temp2, 0xC6);           // (2, 1, 0, 3) => 01 10 00 11 => 0xC6
-        perm2 = _simd16_permute2f128_ps(temp2, temp2, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
-        temp2 = _simd16_blend_ps(temp2, perm2, 0x2222);      // 0100 0100 0100 0100
-        perm2 = _simd16_permute2f128_ps(temp2, temp2, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E
-        v2[i] = _simd16_blend_ps(temp2, perm2, 0x1C1C);      // 0011 1000 0011 1000
-#endif
-    }
-
-    SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
-    return true;
-
-    // clang-format on
-}
-
-#endif
-void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
-{
-#if USE_SIMD16_FRONTEND
-    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
-    const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
-    const simd16vector& c = PaGetSimdVector_simd16(pa, 2, slot);
-
-    if (pa.useAlternateOffset)
-    {
-        primIndex += KNOB_SIMD_WIDTH;
-    }
-
-    //  v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
-    //  v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
-    //  v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
-
-    switch (primIndex)
-    {
-    case 0:
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane1(a);
-        verts[2] = swizzleLane2(a);
-        break;
-    case 1:
-        verts[0] = swizzleLane3(a);
-        verts[1] = swizzleLane4(a);
-        verts[2] = swizzleLane5(a);
-        break;
-    case 2:
-        verts[0] = swizzleLane6(a);
-        verts[1] = swizzleLane7(a);
-        verts[2] = swizzleLane8(a);
-        break;
-    case 3:
-        verts[0] = swizzleLane9(a);
-        verts[1] = swizzleLaneA(a);
-        verts[2] = swizzleLaneB(a);
-        break;
-    case 4:
-        verts[0] = swizzleLaneC(a);
-        verts[1] = swizzleLaneD(a);
-        verts[2] = swizzleLaneE(a);
-        break;
-    case 5:
-        verts[0] = swizzleLaneF(a);
-        verts[1] = swizzleLane0(b);
-        verts[2] = swizzleLane1(b);
-        break;
-    case 6:
-        verts[0] = swizzleLane2(b);
-        verts[1] = swizzleLane3(b);
-        verts[2] = swizzleLane4(b);
-        break;
-    case 7:
-        verts[0] = swizzleLane5(b);
-        verts[1] = swizzleLane6(b);
-        verts[2] = swizzleLane7(b);
-        break;
-    case 8:
-        verts[0] = swizzleLane8(b);
-        verts[1] = swizzleLane9(b);
-        verts[2] = swizzleLaneA(b);
-        break;
-    case 9:
-        verts[0] = swizzleLaneB(b);
-        verts[1] = swizzleLaneC(b);
-        verts[2] = swizzleLaneD(b);
-        break;
-    case 10:
-        verts[0] = swizzleLaneE(b);
-        verts[1] = swizzleLaneF(b);
-        verts[2] = swizzleLane0(c);
-        break;
-    case 11:
-        verts[0] = swizzleLane1(c);
-        verts[1] = swizzleLane2(c);
-        verts[2] = swizzleLane3(c);
-        break;
-    case 12:
-        verts[0] = swizzleLane4(c);
-        verts[1] = swizzleLane5(c);
-        verts[2] = swizzleLane6(c);
-        break;
-    case 13:
-        verts[0] = swizzleLane7(c);
-        verts[1] = swizzleLane8(c);
-        verts[2] = swizzleLane9(c);
-        break;
-    case 14:
-        verts[0] = swizzleLaneA(c);
-        verts[1] = swizzleLaneB(c);
-        verts[2] = swizzleLaneC(c);
-        break;
-    case 15:
-        verts[0] = swizzleLaneD(c);
-        verts[1] = swizzleLaneE(c);
-        verts[2] = swizzleLaneF(c);
-        break;
-    };
-#else
-    // We have 12 simdscalars contained within 3 simdvectors which
-    // hold at least 8 triangles worth of data. We want to assemble a single
-    // triangle with data in horizontal form.
-
-    const simdvector& a = PaGetSimdVector(pa, 0, slot);
-    const simdvector& b = PaGetSimdVector(pa, 1, slot);
-    const simdvector& c = PaGetSimdVector(pa, 2, slot);
-
-    // Convert from vertical to horizontal.
-    // Tri Pattern - provoking vertex is always v0
-    //  v0 -> 0 3 6 9  12 15 18 21
-    //  v1 -> 1 4 7 10 13 16 19 22
-    //  v2 -> 2 5 8 11 14 17 20 23
-
-    switch (primIndex)
-    {
-    case 0:
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane1(a);
-        verts[2] = swizzleLane2(a);
-        break;
-    case 1:
-        verts[0] = swizzleLane3(a);
-        verts[1] = swizzleLane4(a);
-        verts[2] = swizzleLane5(a);
-        break;
-    case 2:
-        verts[0] = swizzleLane6(a);
-        verts[1] = swizzleLane7(a);
-        verts[2] = swizzleLane0(b);
-        break;
-    case 3:
-        verts[0] = swizzleLane1(b);
-        verts[1] = swizzleLane2(b);
-        verts[2] = swizzleLane3(b);
-        break;
-    case 4:
-        verts[0] = swizzleLane4(b);
-        verts[1] = swizzleLane5(b);
-        verts[2] = swizzleLane6(b);
-        break;
-    case 5:
-        verts[0] = swizzleLane7(b);
-        verts[1] = swizzleLane0(c);
-        verts[2] = swizzleLane1(c);
-        break;
-    case 6:
-        verts[0] = swizzleLane2(c);
-        verts[1] = swizzleLane3(c);
-        verts[2] = swizzleLane4(c);
-        break;
-    case 7:
-        verts[0] = swizzleLane5(c);
-        verts[1] = swizzleLane6(c);
-        verts[2] = swizzleLane7(c);
-        break;
-    };
-#endif
-}
-
-bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-    SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
-    return false; // Not enough vertices to assemble 8 triangles.
-}
-
-bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-#if USE_SIMD16_FRONTEND
-    simdvector a;
-    simdvector b;
-
-    if (!pa.useAlternateOffset)
-    {
-        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(a_16[i], 0);
-            b[i] = _simd16_extract_ps(a_16[i], 1);
-        }
-    }
-    else
-    {
-        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(b_16[i], 0);
-            b[i] = _simd16_extract_ps(b_16[i], 1);
-        }
-    }
-
-#else
-    simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
-    simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
-
-#endif
-    simdscalar s;
-
-    for (int i = 0; i < 4; ++i)
-    {
-        simdscalar a0 = a[i];
-        simdscalar b0 = b[i];
-
-        // Tri Pattern - provoking vertex is always v0
-        //  v0 -> 01234567
-        //  v1 -> 13355779
-        //  v2 -> 22446688
-        simdvector& v0 = verts[0];
-        v0[i]          = a0;
-
-        //  s -> 4567891011
-        s = _simd_permute2f128_ps(a0, b0, 0x21);
-        //  s -> 23456789
-        s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
-
-        simdvector& v1 = verts[1];
-        //  v1 -> 13355779
-        v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1));
-
-        simdvector& v2 = verts[2];
-        //  v2 -> 22446688
-        v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2));
-    }
-
-    SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
-    return true;
-}
-
-#if ENABLE_AVX512_SIMD16
-bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0);
-    return false; // Not enough vertices to assemble 16 triangles.
-}
-
-bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    // clang-format off
-
-    const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
-    const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
-
-    const simd16mask mask0 = 0xF000;
-
-    //  v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
-    //  v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
-    //  v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
-
-    simd16vector& v0 = verts[0];
-    simd16vector& v1 = verts[1];
-    simd16vector& v2 = verts[2];
-
-    // for simd16 x, y, z, and w
-    for (int i = 0; i < 4; i += 1)
-    {
-        simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
-        simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
-
-        simd16scalar perm0 = _simd16_permute2f128_ps(tempa, tempa, 0x39); // (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3
-        simd16scalar perm1 = _simd16_permute2f128_ps(tempb, tempb, 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
-
-        simd16scalar blend = _simd16_blend_ps(perm0, perm1, mask0);                                  // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3
-        simd16scalar shuff = _simd16_shuffle_ps(tempa, blend, _MM_SHUFFLE(1, 0, 3, 2));              // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1
-
-        v0[i] = tempa;                                                                               // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
-        v1[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(3, 1, 3, 1));                           // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
-        v2[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(2, 2, 2, 2));                           // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
-    }
-
-    SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
-    return true;
-
-    // clang-format on
-}
-
-#endif
-void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
-{
-#if USE_SIMD16_FRONTEND
-    const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
-    const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
-
-    if (pa.useAlternateOffset)
-    {
-        primIndex += KNOB_SIMD_WIDTH;
-    }
-
-    //  v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
-    //  v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
-    //  v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
-
-    switch (primIndex)
-    {
-    case 0:
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane1(a);
-        verts[2] = swizzleLane2(a);
-        break;
-    case 1:
-        verts[0] = swizzleLane1(a);
-        verts[1] = swizzleLane3(a);
-        verts[2] = swizzleLane2(a);
-        break;
-    case 2:
-        verts[0] = swizzleLane2(a);
-        verts[1] = swizzleLane3(a);
-        verts[2] = swizzleLane4(a);
-        break;
-    case 3:
-        verts[0] = swizzleLane3(a);
-        verts[1] = swizzleLane5(a);
-        verts[2] = swizzleLane4(a);
-        break;
-    case 4:
-        verts[0] = swizzleLane4(a);
-        verts[1] = swizzleLane5(a);
-        verts[2] = swizzleLane6(a);
-        break;
-    case 5:
-        verts[0] = swizzleLane5(a);
-        verts[1] = swizzleLane7(a);
-        verts[2] = swizzleLane6(a);
-        break;
-    case 6:
-        verts[0] = swizzleLane6(a);
-        verts[1] = swizzleLane7(a);
-        verts[2] = swizzleLane8(a);
-        break;
-    case 7:
-        verts[0] = swizzleLane7(a);
-        verts[1] = swizzleLane9(a);
-        verts[2] = swizzleLane8(a);
-        break;
-    case 8:
-        verts[0] = swizzleLane8(a);
-        verts[1] = swizzleLane9(a);
-        verts[2] = swizzleLaneA(a);
-        break;
-    case 9:
-        verts[0] = swizzleLane9(a);
-        verts[1] = swizzleLaneB(a);
-        verts[2] = swizzleLaneA(a);
-        break;
-    case 10:
-        verts[0] = swizzleLaneA(a);
-        verts[1] = swizzleLaneB(a);
-        verts[2] = swizzleLaneC(a);
-        break;
-    case 11:
-        verts[0] = swizzleLaneB(a);
-        verts[1] = swizzleLaneD(a);
-        verts[2] = swizzleLaneC(a);
-        break;
-    case 12:
-        verts[0] = swizzleLaneC(a);
-        verts[1] = swizzleLaneD(a);
-        verts[2] = swizzleLaneE(a);
-        break;
-    case 13:
-        verts[0] = swizzleLaneD(a);
-        verts[1] = swizzleLaneF(a);
-        verts[2] = swizzleLaneE(a);
-        break;
-    case 14:
-        verts[0] = swizzleLaneE(a);
-        verts[1] = swizzleLaneF(a);
-        verts[2] = swizzleLane0(b);
-        break;
-    case 15:
-        verts[0] = swizzleLaneF(a);
-        verts[1] = swizzleLane1(b);
-        verts[2] = swizzleLane0(b);
-        break;
-    };
-#else
-    const simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
-    const simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
-
-    // Convert from vertical to horizontal.
-    // Tri Pattern - provoking vertex is always v0
-    //  v0 -> 01234567
-    //  v1 -> 13355779
-    //  v2 -> 22446688
-
-    switch (primIndex)
-    {
-    case 0:
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane1(a);
-        verts[2] = swizzleLane2(a);
-        break;
-    case 1:
-        verts[0] = swizzleLane1(a);
-        verts[1] = swizzleLane3(a);
-        verts[2] = swizzleLane2(a);
-        break;
-    case 2:
-        verts[0] = swizzleLane2(a);
-        verts[1] = swizzleLane3(a);
-        verts[2] = swizzleLane4(a);
-        break;
-    case 3:
-        verts[0] = swizzleLane3(a);
-        verts[1] = swizzleLane5(a);
-        verts[2] = swizzleLane4(a);
-        break;
-    case 4:
-        verts[0] = swizzleLane4(a);
-        verts[1] = swizzleLane5(a);
-        verts[2] = swizzleLane6(a);
-        break;
-    case 5:
-        verts[0] = swizzleLane5(a);
-        verts[1] = swizzleLane7(a);
-        verts[2] = swizzleLane6(a);
-        break;
-    case 6:
-        verts[0] = swizzleLane6(a);
-        verts[1] = swizzleLane7(a);
-        verts[2] = swizzleLane0(b);
-        break;
-    case 7:
-        verts[0] = swizzleLane7(a);
-        verts[1] = swizzleLane1(b);
-        verts[2] = swizzleLane0(b);
-        break;
-    };
-#endif
-}
-
-bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-    SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
-    return false; // Not enough vertices to assemble 8 triangles.
-}
-
-bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-#if USE_SIMD16_FRONTEND
-    simdvector leadVert;
-    simdvector a;
-    simdvector b;
-
-    const simd16vector& leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
-
-    if (!pa.useAlternateOffset)
-    {
-        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0);
-
-            a[i] = _simd16_extract_ps(a_16[i], 0);
-            b[i] = _simd16_extract_ps(a_16[i], 1);
-        }
-    }
-    else
-    {
-        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0);
-
-            a[i] = _simd16_extract_ps(b_16[i], 0);
-            b[i] = _simd16_extract_ps(b_16[i], 1);
-        }
-    }
-
-#else
-    const simdvector& leadVert = PaGetSimdVector(pa, pa.first, slot);
-    const simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
-    const simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
-
-#endif
-    simdscalar s;
-
-    // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
-    for (int i = 0; i < 4; ++i)
-    {
-        simdscalar a0 = a[i];
-        simdscalar b0 = b[i];
-
-        simdscalar comp = leadVert[i];
-
-        simdvector& v0 = verts[0];
-        v0[i]          = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
-        v0[i]          = _simd_permute2f128_ps(v0[i], comp, 0x00);
-
-        simdvector& v2 = verts[2];
-        s              = _simd_permute2f128_ps(a0, b0, 0x21);
-        v2[i]          = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
-
-        simdvector& v1 = verts[1];
-        v1[i]          = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
-    }
-
-    SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
-    return true;
-}
-
-#if ENABLE_AVX512_SIMD16
-bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0);
-    return false; // Not enough vertices to assemble 16 triangles.
-}
-
-bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    // clang-format off
-
-    const simd16vector& a = PaGetSimdVector_simd16(pa, pa.first, slot);
-    const simd16vector& b = PaGetSimdVector_simd16(pa, pa.prev, slot);
-    const simd16vector& c = PaGetSimdVector_simd16(pa, pa.cur, slot);
-
-    const simd16mask mask0 = 0xF000;
-
-    //  v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
-    //  v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
-    //  v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
-
-    simd16vector& v0 = verts[0];
-    simd16vector& v1 = verts[1];
-    simd16vector& v2 = verts[2];
-
-    // for simd16 x, y, z, and w
-    for (uint32_t i = 0; i < 4; i += 1)
-    {
-        simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
-        simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
-        simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float*>(&c[i]));
-
-        simd16scalar shuff = _simd16_shuffle_ps(tempa, tempa, _MM_SHUFFLE(0, 0, 0, 0));              // a0 a0 a0 a0 a4 a4 a4 a4 a0 a0 a0 a0 a4 a4 a4 a4
-
-        v0[i] = _simd16_permute2f128_ps(shuff, shuff, 0x00);                                         // a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
-
-        simd16scalar temp0 = _simd16_permute2f128_ps(tempb, tempb, 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
-        simd16scalar temp1 = _simd16_permute2f128_ps(tempc, tempc, 0x39); // (0 3 2 1) = 00 11 10 01 // c4 c5 c6 c7 c8 c9 cA cB cC cD cE cF c0 c1 c2 c3
-
-        simd16scalar blend = _simd16_blend_ps(temp0, temp1, mask0);                                  // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 c2 c3
-
-        simd16scalar temp2 = _simd16_shuffle_ps(tempb, blend, _MM_SHUFFLE(1, 0, 3, 2));              // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
-
-        v1[i] = _simd16_shuffle_ps(tempb, temp2, _MM_SHUFFLE(2, 1, 2, 1));                           // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
-        v2[i] = temp2;                                                                               // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
-    }
-
-    SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
-    return true;
-
-    // clang-format on
-}
-
-#endif
-void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
-{
-#if USE_SIMD16_FRONTEND
-    const simd16vector& a = PaGetSimdVector_simd16(pa, pa.first, slot);
-    const simd16vector& b = PaGetSimdVector_simd16(pa, pa.prev, slot);
-    const simd16vector& c = PaGetSimdVector_simd16(pa, pa.cur, slot);
-
-    if (pa.useAlternateOffset)
-    {
-        primIndex += KNOB_SIMD_WIDTH;
-    }
-
-    //  v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
-    //  v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
-    //  v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
-
-    // vert 0 from leading vertex
-    verts[0] = swizzleLane0(a);
-
-    // vert 1
-    if (primIndex < 15)
-    {
-        verts[1] = swizzleLaneN(b, primIndex + 1);
-    }
-    else
-    {
-        verts[1] = swizzleLane0(c);
-    }
-
-    // vert 2
-    if (primIndex < 14)
-    {
-        verts[2] = swizzleLaneN(b, primIndex + 2);
-    }
-    else
-    {
-        verts[2] = swizzleLaneN(c, primIndex - 14);
-    }
-#else
-    const simdvector& a = PaGetSimdVector(pa, pa.first, slot);
-    const simdvector& b = PaGetSimdVector(pa, pa.prev, slot);
-    const simdvector& c = PaGetSimdVector(pa, pa.cur, slot);
-
-    // vert 0 from leading vertex
-    verts[0] = swizzleLane0(a);
-
-    // vert 1
-    if (primIndex < 7)
-    {
-        verts[1] = swizzleLaneN(b, primIndex + 1);
-    }
-    else
-    {
-        verts[1] = swizzleLane0(c);
-    }
-
-    // vert 2
-    if (primIndex < 6)
-    {
-        verts[2] = swizzleLaneN(b, primIndex + 2);
-    }
-    else
-    {
-        verts[2] = swizzleLaneN(c, primIndex - 6);
-    }
-#endif
-}
-
-bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-    SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
-    return false; // Not enough vertices to assemble 8 triangles.
-}
-
-bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-#if USE_SIMD16_FRONTEND
-    simdvector a;
-    simdvector b;
-
-    if (!pa.useAlternateOffset)
-    {
-        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(a_16[i], 0);
-            b[i] = _simd16_extract_ps(a_16[i], 1);
-        }
-    }
-    else
-    {
-        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(b_16[i], 0);
-            b[i] = _simd16_extract_ps(b_16[i], 1);
-        }
-    }
-
-#else
-    simdvector& a = PaGetSimdVector(pa, 0, slot);
-    simdvector& b = PaGetSimdVector(pa, 1, slot);
-
-#endif
-    simdscalar s1, s2;
-
-    for (int i = 0; i < 4; ++i)
-    {
-        simdscalar a0 = a[i];
-        simdscalar b0 = b[i];
-
-        s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
-        s2 = _mm256_permute2f128_ps(a0, b0, 0x31);
-
-        simdvector& v0 = verts[0];
-        v0[i]          = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
-
-        simdvector& v1 = verts[1];
-        v1[i]          = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
-
-        simdvector& v2 = verts[2];
-        v2[i]          = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
-    }
-
-    SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
-    return true;
-}
-
-#if ENABLE_AVX512_SIMD16
-bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    SetNextPaState_simd16(pa, PaQuadList1_simd16, PaQuadList1, PaQuadListSingle0);
-    return false; // Not enough vertices to assemble 16 triangles.
-}
-
-bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    // clang-format off
-
-    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
-    const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
-
-    //  v0 -> a0 a0 a4 a4 a8 a8 aC aC b0 b0 b0 b0 b0 b0 bC bC
-    //  v1 -> a1 a2 a5 a6 a9 aA aD aE b1 b2 b5 b6 b9 bA bD bE
-    //  v2 -> a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
-
-    simd16vector& v0 = verts[0];
-    simd16vector& v1 = verts[1];
-    simd16vector& v2 = verts[2];
-
-    // for simd16 x, y, z, and w
-    for (uint32_t i = 0; i < 4; i += 1)
-    {
-        simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
-        simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
-
-        simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88); // (2 0 2 0) = 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB
-        simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD); // (3 1 3 1) = 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
-
-        v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 0, 0));                           // a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC
-        v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 1, 2, 1));                           // a1 a2 a5 a6 a9 aA aD aE b1 b2 b6 b6 b9 bA bD bE
-        v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2));                           // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
-    }
-
-    SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
-    return true;
-
-    // clang-format on
-}
-
-#endif
-void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
-{
-#if USE_SIMD16_FRONTEND
-    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
-    const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
-
-    if (pa.useAlternateOffset)
-    {
-        primIndex += KNOB_SIMD_WIDTH;
-    }
-
-    switch (primIndex)
-    {
-    case 0:
-        // triangle 0 - 0 1 2
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane1(a);
-        verts[2] = swizzleLane2(a);
-        break;
-    case 1:
-        // triangle 1 - 0 2 3
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane2(a);
-        verts[2] = swizzleLane3(a);
-        break;
-    case 2:
-        // triangle 2 - 4 5 6
-        verts[0] = swizzleLane4(a);
-        verts[1] = swizzleLane5(a);
-        verts[2] = swizzleLane6(a);
-        break;
-    case 3:
-        // triangle 3 - 4 6 7
-        verts[0] = swizzleLane4(a);
-        verts[1] = swizzleLane6(a);
-        verts[2] = swizzleLane7(a);
-        break;
-    case 4:
-        // triangle 4 - 8 9 A
-        verts[0] = swizzleLane8(a);
-        verts[1] = swizzleLane9(a);
-        verts[2] = swizzleLaneA(a);
-        break;
-    case 5:
-        // triangle 5 - 8 A B
-        verts[0] = swizzleLane8(a);
-        verts[1] = swizzleLaneA(a);
-        verts[2] = swizzleLaneB(a);
-        break;
-    case 6:
-        // triangle 6 - C D E
-        verts[0] = swizzleLaneC(a);
-        verts[1] = swizzleLaneD(a);
-        verts[2] = swizzleLaneE(a);
-        break;
-    case 7:
-        // triangle 7 - C E F
-        verts[0] = swizzleLaneC(a);
-        verts[1] = swizzleLaneE(a);
-        verts[2] = swizzleLaneF(a);
-        break;
-    case 8:
-        // triangle 0 - 0 1 2
-        verts[0] = swizzleLane0(b);
-        verts[1] = swizzleLane1(b);
-        verts[2] = swizzleLane2(b);
-        break;
-    case 9:
-        // triangle 1 - 0 2 3
-        verts[0] = swizzleLane0(b);
-        verts[1] = swizzleLane2(b);
-        verts[2] = swizzleLane3(b);
-        break;
-    case 10:
-        // triangle 2 - 4 5 6
-        verts[0] = swizzleLane4(b);
-        verts[1] = swizzleLane5(b);
-        verts[2] = swizzleLane6(b);
-        break;
-    case 11:
-        // triangle 3 - 4 6 7
-        verts[0] = swizzleLane4(b);
-        verts[1] = swizzleLane6(b);
-        verts[2] = swizzleLane7(b);
-        break;
-    case 12:
-        // triangle 4 - 8 9 A
-        verts[0] = swizzleLane8(b);
-        verts[1] = swizzleLane9(b);
-        verts[2] = swizzleLaneA(b);
-        break;
-    case 13:
-        // triangle 5 - 8 A B
-        verts[0] = swizzleLane8(b);
-        verts[1] = swizzleLaneA(b);
-        verts[2] = swizzleLaneB(b);
-        break;
-    case 14:
-        // triangle 6 - C D E
-        verts[0] = swizzleLaneC(b);
-        verts[1] = swizzleLaneD(b);
-        verts[2] = swizzleLaneE(b);
-        break;
-    case 15:
-        // triangle 7 - C E F
-        verts[0] = swizzleLaneC(b);
-        verts[1] = swizzleLaneE(b);
-        verts[2] = swizzleLaneF(b);
-        break;
-    }
-#else
-    const simdvector& a = PaGetSimdVector(pa, 0, slot);
-    const simdvector& b = PaGetSimdVector(pa, 1, slot);
-
-    switch (primIndex)
-    {
-    case 0:
-        // triangle 0 - 0 1 2
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane1(a);
-        verts[2] = swizzleLane2(a);
-        break;
-    case 1:
-        // triangle 1 - 0 2 3
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane2(a);
-        verts[2] = swizzleLane3(a);
-        break;
-    case 2:
-        // triangle 2 - 4 5 6
-        verts[0] = swizzleLane4(a);
-        verts[1] = swizzleLane5(a);
-        verts[2] = swizzleLane6(a);
-        break;
-    case 3:
-        // triangle 3 - 4 6 7
-        verts[0] = swizzleLane4(a);
-        verts[1] = swizzleLane6(a);
-        verts[2] = swizzleLane7(a);
-        break;
-    case 4:
-        // triangle 4 - 8 9 10 (0 1 2)
-        verts[0] = swizzleLane0(b);
-        verts[1] = swizzleLane1(b);
-        verts[2] = swizzleLane2(b);
-        break;
-    case 5:
-        // triangle 1 - 0 2 3
-        verts[0] = swizzleLane0(b);
-        verts[1] = swizzleLane2(b);
-        verts[2] = swizzleLane3(b);
-        break;
-    case 6:
-        // triangle 2 - 4 5 6
-        verts[0] = swizzleLane4(b);
-        verts[1] = swizzleLane5(b);
-        verts[2] = swizzleLane6(b);
-        break;
-    case 7:
-        // triangle 3 - 4 6 7
-        verts[0] = swizzleLane4(b);
-        verts[1] = swizzleLane6(b);
-        verts[2] = swizzleLane7(b);
-        break;
-    }
-#endif
-}
-
-bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-    SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0);
-    return false;
-}
-
-bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-    PaLineStrip1(pa, slot, verts);
-
-    if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1)
-    {
-        // loop reconnect now
-        const int lane = pa.numPrims - pa.numPrimsComplete - 1;
-
-#if USE_SIMD16_FRONTEND
-        simdvector first;
-
-        const simd16vector& first_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
-
-        if (!pa.useAlternateOffset)
-        {
-            for (uint32_t i = 0; i < 4; i += 1)
-            {
-                first[i] = _simd16_extract_ps(first_16[i], 0);
-            }
-        }
-        else
-        {
-            for (uint32_t i = 0; i < 4; i += 1)
-            {
-                first[i] = _simd16_extract_ps(first_16[i], 1);
-            }
-        }
-
-#else
-        simdvector& first = PaGetSimdVector(pa, pa.first, slot);
-
-#endif
-        for (int i = 0; i < 4; i++)
-        {
-            float* firstVtx  = (float*)&(first[i]);
-            float* targetVtx = (float*)&(verts[1][i]);
-            targetVtx[lane]  = firstVtx[0];
-        }
-    }
-
-    SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
-    return true;
-}
-
-#if ENABLE_AVX512_SIMD16
-bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0);
-    return false;
-}
-
-bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{ // simd16 LINE_LOOP: strip assembly plus patching of the closing segment.
-    PaLineStrip1_simd16(pa, slot, verts); // build the strip segments first
-
-    if (pa.numPrimsComplete + KNOB_SIMD16_WIDTH > pa.numPrims - 1) // last prim is in this batch
-    {
-        // loop reconnect now
-        const int lane = pa.numPrims - pa.numPrimsComplete - 1; // lane of the closing segment in this batch
-
-        const simd16vector& first = PaGetSimdVector_simd16(pa, pa.first, slot);
-
-        for (int i = 0; i < 4; i++) // each of the 4 components of the slot
-        {
-            float* firstVtx  = (float*)&(first[i]);
-            float* targetVtx = (float*)&(verts[1][i]);
-            targetVtx[lane]  = firstVtx[0]; // second endpoint of the closing segment := lane 0 of `first`
-        }
-    }
-
-    SetNextPaState_simd16(
-        pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
-    return true; // verts[0] / verts[1] were written
-}
-
-#endif
-void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
-{ // Single-primitive LINE_LOOP assembly: a strip segment, re-targeted to the first vertex if it is the last.
-    PaLineStripSingle0(pa, slot, primIndex, verts); // assemble as an ordinary strip segment first
-
-    if (pa.numPrimsComplete + primIndex == pa.numPrims - 1) // this is the final primitive of the loop
-    {
-#if USE_SIMD16_FRONTEND
-        const simd16vector& first = PaGetSimdVector_simd16(pa, pa.first, slot);
-
-        verts[1] = swizzleLane0(first); // close the loop on lane 0 of the pa.first vector
-#else
-        const simdvector& first = PaGetSimdVector(pa, pa.first, slot);
-
-        verts[1] = swizzleLane0(first); // close the loop on lane 0 of the pa.first vector
-#endif
-    }
-}
-
-bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{ // Initial LINE_LIST state: only selects the follow-up handlers.
-    SetNextPaState(pa, PaLineList1, PaLineListSingle0); // next: PaLineList1 (per-prim: PaLineListSingle0)
-    return false; // Not enough vertices to assemble 8 lines
-}
-
-bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{ // De-interleaves two 8-wide vectors into 8 lines: verts[0] = even vertices, verts[1] = odd vertices.
-#if USE_SIMD16_FRONTEND
-    simdvector a;
-    simdvector b;
-
-    if (!pa.useAlternateOffset)
-    {
-        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(a_16[i], 0); // a / b := parts 0 / 1 of simd16 vector 0
-            b[i] = _simd16_extract_ps(a_16[i], 1);
-        }
-    }
-    else
-    {
-        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(b_16[i], 0); // alternate offset: same split, from simd16 vector 1
-            b[i] = _simd16_extract_ps(b_16[i], 1);
-        }
-    }
-
-#else
-    simdvector& a = PaGetSimdVector(pa, 0, slot);
-    simdvector& b = PaGetSimdVector(pa, 1, slot);
-
-#endif
-    /// @todo: verify provoking vertex is correct
-    // Line list 0  1  2  3  4  5  6  7
-    //           8  9 10 11 12 13 14 15
-
-    // shuffle:
-    //           0 2 4 6 8 10 12 14
-    //           1 3 5 7 9 11 13 15
-
-    for (uint32_t i = 0; i < 4; ++i) // each of the 4 components of the slot
-    {
-        // 0 1 2 3 8 9 10 11
-        __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20);
-        // 4 5 6 7 12 13 14 15
-        __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31);
-
-        // 0 2 4 6 8 10 12 14
-        verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0));
-        // 1 3 5 7 9 11 13 15
-        verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1));
-    }
-
-    SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); // back to state 0
-    return true; // verts[0] / verts[1] were written
-}
-
-#if ENABLE_AVX512_SIMD16
-bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{ // simd16 counterpart of PaLineList0: only selects the follow-up handlers.
-    SetNextPaState_simd16(pa, PaLineList1_simd16, PaLineList1, PaLineListSingle0); // next: PaLineList1*
-    return false; // Not enough vertices to assemble 16 lines
-}
-
-bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    // clang-format off
-
-    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
-    const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
-
-    // v0 -> a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
-    // v1 -> a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
-
-    simd16vector& v0 = verts[0];
-    simd16vector& v1 = verts[1];
-
-    // for simd16 x, y, z, and w
-    for (int i = 0; i < 4; i += 1)
-    {
-        simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
-        simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
-
-        simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88); // (2 0 2 0) 10 00 10 00   // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB
-        simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD); // (3 1 3 1) 11 01 11 01   // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
-
-        v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0));                           // a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
-        v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1));                           // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
-    }
-
-    SetNextPaState_simd16(pa, PaLineList0_simd16, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
-    return true;
-
-    // clang-format on
-}
-
-#endif
-void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
-{ // Single-primitive LINE_LIST assembly: line N is built from vertices 2N and 2N + 1.
-#if USE_SIMD16_FRONTEND
-    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
-    const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
-
-    if (pa.useAlternateOffset)
-    {
-        primIndex += KNOB_SIMD_WIDTH; // alternate offset: address the primitives KNOB_SIMD_WIDTH further on
-    }
-
-    switch (primIndex) // 0-7 read lane pairs of a, 8-15 lane pairs of b; other values leave verts untouched
-    {
-    case 0:
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane1(a);
-        break;
-    case 1:
-        verts[0] = swizzleLane2(a);
-        verts[1] = swizzleLane3(a);
-        break;
-    case 2:
-        verts[0] = swizzleLane4(a);
-        verts[1] = swizzleLane5(a);
-        break;
-    case 3:
-        verts[0] = swizzleLane6(a);
-        verts[1] = swizzleLane7(a);
-        break;
-    case 4:
-        verts[0] = swizzleLane8(a);
-        verts[1] = swizzleLane9(a);
-        break;
-    case 5:
-        verts[0] = swizzleLaneA(a);
-        verts[1] = swizzleLaneB(a);
-        break;
-    case 6:
-        verts[0] = swizzleLaneC(a);
-        verts[1] = swizzleLaneD(a);
-        break;
-    case 7:
-        verts[0] = swizzleLaneE(a);
-        verts[1] = swizzleLaneF(a);
-        break;
-    case 8:
-        verts[0] = swizzleLane0(b);
-        verts[1] = swizzleLane1(b);
-        break;
-    case 9:
-        verts[0] = swizzleLane2(b);
-        verts[1] = swizzleLane3(b);
-        break;
-    case 10:
-        verts[0] = swizzleLane4(b);
-        verts[1] = swizzleLane5(b);
-        break;
-    case 11:
-        verts[0] = swizzleLane6(b);
-        verts[1] = swizzleLane7(b);
-        break;
-    case 12:
-        verts[0] = swizzleLane8(b);
-        verts[1] = swizzleLane9(b);
-        break;
-    case 13:
-        verts[0] = swizzleLaneA(b);
-        verts[1] = swizzleLaneB(b);
-        break;
-    case 14:
-        verts[0] = swizzleLaneC(b);
-        verts[1] = swizzleLaneD(b);
-        break;
-    case 15:
-        verts[0] = swizzleLaneE(b);
-        verts[1] = swizzleLaneF(b);
-        break;
-    }
-#else
-    const simdvector& a = PaGetSimdVector(pa, 0, slot);
-    const simdvector& b = PaGetSimdVector(pa, 1, slot);
-
-    switch (primIndex) // 0-3 read lane pairs of a, 4-7 lane pairs of b; other values leave verts untouched
-    {
-    case 0:
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane1(a);
-        break;
-    case 1:
-        verts[0] = swizzleLane2(a);
-        verts[1] = swizzleLane3(a);
-        break;
-    case 2:
-        verts[0] = swizzleLane4(a);
-        verts[1] = swizzleLane5(a);
-        break;
-    case 3:
-        verts[0] = swizzleLane6(a);
-        verts[1] = swizzleLane7(a);
-        break;
-    case 4:
-        verts[0] = swizzleLane0(b);
-        verts[1] = swizzleLane1(b);
-        break;
-    case 5:
-        verts[0] = swizzleLane2(b);
-        verts[1] = swizzleLane3(b);
-        break;
-    case 6:
-        verts[0] = swizzleLane4(b);
-        verts[1] = swizzleLane5(b);
-        break;
-    case 7:
-        verts[0] = swizzleLane6(b);
-        verts[1] = swizzleLane7(b);
-        break;
-    }
-#endif
-}
-
-bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{ // Initial LINE_STRIP state: only selects the follow-up handlers.
-    SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0); // next: PaLineStrip1 (per-prim: PaLineStripSingle0)
-    return false; // Not enough vertices to assemble 8 lines
-}
-
-bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{ // Builds 8 strip segments: verts[0] = vertices 0-7, verts[1] = vertices 1-8 (the 8th comes from b).
-#if USE_SIMD16_FRONTEND
-    simdvector a;
-    simdvector b;
-
-    if (!pa.useAlternateOffset)
-    {
-        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(a_16[i], 0); // a / b := parts 0 / 1 of the pa.prev vector
-            b[i] = _simd16_extract_ps(a_16[i], 1);
-        }
-    }
-    else
-    {
-        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(b_16[i], 0); // alternate offset: same split, from the pa.cur vector
-            b[i] = _simd16_extract_ps(b_16[i], 1);
-        }
-    }
-
-#else
-    simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
-    simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
-
-#endif
-    /// @todo: verify provoking vertex is correct
-    // Line strip 0  1  2  3  4  5  6  7
-    //            8  9 10 11 12 13 14 15
-
-    // shuffle:
-    //           0  1  2  3  4  5  6  7
-    //           1  2  3  4  5  6  7  8
-
-    verts[0] = a; // first endpoints are vertices 0-7 unchanged
-
-    for (uint32_t i = 0; i < 4; ++i) // each of the 4 components of the slot
-    {
-        // 1 2 3 x 5 6 7 x
-        __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
-        // 4 5 6 7 8 9 10 11
-        __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21);
-
-        // x x x 4 x x x 8
-        __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low  (0 0 0 0)
-
-        verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88); // 1 2 3 4 5 6 7 8
-    }
-
-    SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); // stay in this state
-    return true; // verts[0] / verts[1] were written
-}
-
-#if ENABLE_AVX512_SIMD16
-bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{ // simd16 counterpart of PaLineStrip0: only selects the follow-up handlers.
-    SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0); // next: PaLineStrip1*
-    return false; // Not enough vertices to assemble 16 lines
-}
-
-bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    // clang-format off
-
-    const simd16scalari perm = _simd16_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); // lane n takes lane n + 1; lane 15 takes lane 0
-
-    const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
-    const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
-
-    const simd16mask mask0 = 0x0001; // bit 0 only: the blend below takes lane 0 from b, the rest from a
-
-    // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
-    // v1 -> a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
-
-    simd16vector& v0 = verts[0];
-    simd16vector& v1 = verts[1];
-
-    v0 = a; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
-
-    // for simd16 x, y, z, and w
-    for (int i = 0; i < 4; i += 1)
-    {
-        simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
-        simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
-
-        simd16scalar temp = _simd16_blend_ps(tempa, tempb, mask0); // b0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
-
-        v1[i] = _simd16_permute_ps(temp, perm);                    // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
-    }
-
-    SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
-    return true;
-
-    // clang-format on
-}
-
-#endif
-void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
-{ // Single-primitive LINE_STRIP assembly: segment N is built from vertices N and N + 1.
-#if USE_SIMD16_FRONTEND
-    const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
-    const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
-
-    if (pa.useAlternateOffset)
-    {
-        primIndex += KNOB_SIMD_WIDTH; // alternate offset: address the primitives KNOB_SIMD_WIDTH further on
-    }
-
-    switch (primIndex) // all lanes come from a, except the end of segment 15 (lane 0 of b)
-    {
-    case 0:
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane1(a);
-        break;
-    case 1:
-        verts[0] = swizzleLane1(a);
-        verts[1] = swizzleLane2(a);
-        break;
-    case 2:
-        verts[0] = swizzleLane2(a);
-        verts[1] = swizzleLane3(a);
-        break;
-    case 3:
-        verts[0] = swizzleLane3(a);
-        verts[1] = swizzleLane4(a);
-        break;
-    case 4:
-        verts[0] = swizzleLane4(a);
-        verts[1] = swizzleLane5(a);
-        break;
-    case 5:
-        verts[0] = swizzleLane5(a);
-        verts[1] = swizzleLane6(a);
-        break;
-    case 6:
-        verts[0] = swizzleLane6(a);
-        verts[1] = swizzleLane7(a);
-        break;
-    case 7:
-        verts[0] = swizzleLane7(a);
-        verts[1] = swizzleLane8(a);
-        break;
-    case 8:
-        verts[0] = swizzleLane8(a);
-        verts[1] = swizzleLane9(a);
-        break;
-    case 9:
-        verts[0] = swizzleLane9(a);
-        verts[1] = swizzleLaneA(a);
-        break;
-    case 10:
-        verts[0] = swizzleLaneA(a);
-        verts[1] = swizzleLaneB(a);
-        break;
-    case 11:
-        verts[0] = swizzleLaneB(a);
-        verts[1] = swizzleLaneC(a);
-        break;
-    case 12:
-        verts[0] = swizzleLaneC(a);
-        verts[1] = swizzleLaneD(a);
-        break;
-    case 13:
-        verts[0] = swizzleLaneD(a);
-        verts[1] = swizzleLaneE(a);
-        break;
-    case 14:
-        verts[0] = swizzleLaneE(a);
-        verts[1] = swizzleLaneF(a);
-        break;
-    case 15:
-        verts[0] = swizzleLaneF(a);
-        verts[1] = swizzleLane0(b);
-        break;
-    }
-#else
-    const simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
-    const simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
-
-    switch (primIndex) // all lanes come from a, except the end of segment 7 (lane 0 of b)
-    {
-    case 0:
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane1(a);
-        break;
-    case 1:
-        verts[0] = swizzleLane1(a);
-        verts[1] = swizzleLane2(a);
-        break;
-    case 2:
-        verts[0] = swizzleLane2(a);
-        verts[1] = swizzleLane3(a);
-        break;
-    case 3:
-        verts[0] = swizzleLane3(a);
-        verts[1] = swizzleLane4(a);
-        break;
-    case 4:
-        verts[0] = swizzleLane4(a);
-        verts[1] = swizzleLane5(a);
-        break;
-    case 5:
-        verts[0] = swizzleLane5(a);
-        verts[1] = swizzleLane6(a);
-        break;
-    case 6:
-        verts[0] = swizzleLane6(a);
-        verts[1] = swizzleLane7(a);
-        break;
-    case 7:
-        verts[0] = swizzleLane7(a);
-        verts[1] = swizzleLane0(b);
-        break;
-    }
-#endif
-}
-
-bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{ // POINT_LIST assembly: the 8-wide input vector is passed through as verts[0].
-#if USE_SIMD16_FRONTEND
-    simdvector a;
-
-    const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
-
-    if (!pa.useAlternateOffset)
-    {
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(a_16[i], 0); // part 0 of simd16 vector 0
-        }
-    }
-    else
-    {
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(a_16[i], 1); // alternate offset: part 1 of simd16 vector 0
-        }
-    }
-
-#else
-    simdvector& a = PaGetSimdVector(pa, 0, slot);
-
-#endif
-    verts[0] = a; // points only have 1 vertex.
-
-    SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); // single-state topology
-    return true; // verts[0] was written
-}
-
-#if ENABLE_AVX512_SIMD16
-bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{ // simd16 POINT_LIST assembly: the 16-wide input vector is passed through as verts[0].
-    simd16vector& a = PaGetSimdVector_simd16(pa, pa.cur, slot); // NOTE(review): PaPoints0 / PaPointsSingle0 read vector 0, not pa.cur - confirm equivalent
-
-    verts[0] = a; // points only have 1 vertex.
-
-    SetNextPaState_simd16(
-        pa, PaPoints0_simd16, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
-    return true; // verts[0] was written
-}
-
-#endif
-void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
-{ // Single-primitive POINT_LIST assembly: point N is lane N of input vector 0.
-#if USE_SIMD16_FRONTEND
-    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
-
-    if (pa.useAlternateOffset)
-    {
-        primIndex += KNOB_SIMD_WIDTH; // alternate offset: address the lanes KNOB_SIMD_WIDTH further on
-    }
-
-    verts[0] = swizzleLaneN(a, primIndex); // the point's only vertex
-#else
-    const simdvector& a = PaGetSimdVector(pa, 0, slot);
-
-    verts[0] = swizzleLaneN(a, primIndex); // the point's only vertex
-#endif
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief State 0 for RECT_LIST topology.
-///        There are not yet enough vertices to assemble 8 triangles.
-bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-    SetNextPaState(pa, PaRectList1, PaRectListSingle0); // next: PaRectList1 (per-prim: PaRectListSingle0)
-    return false; // verts[] is not written in this state
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief State 1 for RECT_LIST topology.
-///   Rect lists have the following format.
-///             w          x          y           z
-///      v2 o---o   v5 o---o   v8 o---o   v11 o---o
-///         | \ |      | \ |      | \ |       | \ |
-///      v1 o---o   v4 o---o   v7 o---o   v10 o---o
-///            v0         v3         v6          v9
-///
-///   Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
-///
-///   tri0 = { v0, v1, v2 }  tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
-///   tri2 = { v3, v4, v5 }  tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
-///   etc.
-///
-///   PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
-///   where v0 contains all the first vertices for 8 triangles.
-///
-///     Result:
-///      verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
-///      verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
-///      verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
-///
-/// @param pa - State for PA state machine.
-/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
-/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
-/// etc.
-bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-// SIMD vectors a and b are the last two vertical outputs from the vertex shader.
-#if USE_SIMD16_FRONTEND
-    simdvector a;
-    simdvector b;
-
-    if (!pa.useAlternateOffset)
-    {
-        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(a_16[i], 0);
-            b[i] = _simd16_extract_ps(a_16[i], 1);
-        }
-    }
-    else
-    {
-        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(b_16[i], 0);
-            b[i] = _simd16_extract_ps(b_16[i], 1);
-            ;
-        }
-    }
-
-#else
-    simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1,  v2,  v3,  v4,  v5,  v6,  v7 }
-    simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
-
-#endif
-    __m256 tmp0, tmp1, tmp2;
-
-    // Loop over each component in the simdvector.
-    for (int i = 0; i < 4; ++i)
-    {
-        simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
-        tmp0           = _mm256_permute2f128_ps(
-            b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
-        v0[i] = _mm256_blend_ps(
-            a[i],
-            tmp0,
-            0x20); //   v0 = {  v0,   *,   *,  v3,  *, v9,  v6, * } where * is don't care.
-        tmp1  = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = {  v0,  v0,  v3,  v3,  *,  *,  *, * }
-        v0[i] = _mm256_permute_ps(v0[i], 0x5A); //   v0 = {   *,   *,   *,   *,  v6, v6, v9, v9 }
-        v0[i] =
-            _mm256_blend_ps(tmp1, v0[i], 0xF0); //   v0 = {  v0,  v0,  v3,  v3,  v6, v6, v9, v9 }
-
-        /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
-        ///      AVX2 should make this much cheaper.
-        simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
-        v1[i]          = _mm256_permute_ps(a[i], 0x09);  //   v1 = { v1, v2,  *,  *,  *, *,  *, * }
-        tmp1           = _mm256_permute_ps(a[i], 0x43);  // tmp1 = {  *,  *,  *,  *, v7, *, v4, v5 }
-        tmp2  = _mm256_blend_ps(v1[i], tmp1, 0xF0);      // tmp2 = { v1, v2,  *,  *, v7, *, v4, v5 }
-        tmp1  = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7,  *, v4,  v5, *, *,  *,  * }
-        v1[i] = _mm256_permute_ps(tmp0, 0xE0);      //   v1 = {  *,  *,  *,  *,  *, v8, v10, v11 }
-        v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); //   v1 = { v1, v2,  *,  *, v7, v8, v10, v11 }
-        v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
-
-        // v2 and v5 are taken from the finished v1[i]; tmp2's low half only holds v1 and v2.
-        simdvector& v2 = verts[2]; // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
-        v2[i]          = _mm256_permute_ps(tmp0, 0x30); //   v2 = { *, *, *, *, v8, *, v11, * }
-        tmp1           = _mm256_permute_ps(v1[i], 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
-        v2[i]          = _mm256_blend_ps(tmp1, v2[i], 0xF0); // v2 = { v2, *, v5, *, v8, *, v11, * }
-
-        // Need to compute 4th implied vertex for the rectangle.
-        tmp2  = _mm256_sub_ps(v0[i], v1[i]);
-        tmp2  = _mm256_add_ps(tmp2, v2[i]);         // tmp2 = {  w,  *,  x, *, y,  *,  z,  * }
-        tmp2  = _mm256_permute_ps(tmp2, 0xA0);      // tmp2 = {  *,  w,  *, x, *,   y,  *,  z }
-        v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); //   v2 = { v2,  w, v5, x, v8,  y, v11, z }
-    }
-
-    SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
-    return true;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief State 2 for RECT_LIST topology.
-///        Not implemented unless there is a use case for more than 8 rects.
-/// @param pa - State for PA state machine.
-/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
-/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
-/// etc.
-bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
-{
-    SWR_INVALID("Is rect list used for anything other then clears?"); // placeholder: flags any use of this state
-    SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); // back to state 0
-    return true; // NOTE(review): returns true although verts[] is never written here
-}
-
-#if ENABLE_AVX512_SIMD16
-//////////////////////////////////////////////////////////////////////////
-/// @brief State 0 for RECT_LIST topology.
-///        There are not yet enough vertices to assemble 8 triangles.
-bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0); // next: PaRectList1*
-    return false; // verts[] is not written in this state
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief State 1 for RECT_LIST topology.
-///   Rect lists have the following format.
-///             w          x          y           z
-///      v2 o---o   v5 o---o   v8 o---o   v11 o---o
-///         | \ |      | \ |      | \ |       | \ |
-///      v1 o---o   v4 o---o   v7 o---o   v10 o---o
-///            v0         v3         v6          v9
-///
-///   Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
-///
-///   tri0 = { v0, v1, v2 }  tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
-///   tri2 = { v3, v4, v5 }  tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
-///   etc.
-///
-///   PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
-///   where v0 contains all the first vertices for 8 triangles.
-///
-///     Result:
-///      verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
-///      verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
-///      verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
-///
-/// @param pa - State for PA state machine.
-/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
-/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
-/// etc.
-bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    // clang-format off
-
-    simdvector a;
-    simdvector b;
-
-    if (!pa.useAlternateOffset)
-    {
-        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); // a[] = { v0, v1,  v2,  v3,  v4,  v5,  v6,  v7,
-                                                                        //         v8, v9, v10, v11, v12, v13, v14, v15 }
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(a_16[i], 0);
-            b[i] = _simd16_extract_ps(a_16[i], 1);
-        }
-    }
-    else
-    {
-        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); // b[] = { v16...but not used by this implementation.. }
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(b_16[i], 0);
-            b[i] = _simd16_extract_ps(b_16[i], 1);
-        }
-    }
-
-    simd16vector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6,  v9,  v9 }
-    simd16vector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
-    simd16vector& v2 = verts[2]; // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11,   z }
-
-    // Loop over each component in the simdvector.
-    for (int i = 0; i < 4; i += 1)
-    {
-        simdscalar v0_lo; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
-        simdscalar v1_lo; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
-        simdscalar v2_lo; // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
-
-        __m256 tmp0, tmp1, tmp2;
-
-        tmp0  = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
-        v0_lo = _mm256_blend_ps(a[i], tmp0, 0x20);        //   v0 = {  v0,   *,   *,  v3,  *, v9,  v6,   * } where * is don't care.
-        tmp1  = _mm256_permute_ps(v0_lo, 0xF0);           // tmp1 = {  v0,  v0,  v3,  v3,  *,  *,   *,   * }
-        v0_lo = _mm256_permute_ps(v0_lo, 0x5A);           //   v0 = {   *,   *,   *,   *,  v6, v6, v9,  v9 }
-        v0_lo = _mm256_blend_ps(tmp1, v0_lo, 0xF0);       //   v0 = {  v0,  v0,  v3,  v3,  v6, v6, v9,  v9 }
-
-        /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
-        ///      AVX2 should make this much cheaper.
-        v1_lo = _mm256_permute_ps(a[i], 0x09);            //   v1 = { v1, v2,  *,  *,  *,  *,   *,   * }
-        tmp1  = _mm256_permute_ps(a[i], 0x43);            // tmp1 = {  *,  *,  *,  *, v7,  *,  v4,  v5 }
-        tmp2  = _mm256_blend_ps(v1_lo, tmp1, 0xF0);       // tmp2 = { v1, v2,  *,  *, v7,  *,  v4,  v5 }
-        tmp1  = _mm256_permute2f128_ps(tmp2, tmp2, 0x1);  // tmp1 = { v7,  *, v4,  v5, *,  *,   *,   * }
-        v1_lo = _mm256_permute_ps(tmp0, 0xE0);            //   v1 = {  *,  *,  *,  *,  *, v8, v10, v11 }
-        v1_lo = _mm256_blend_ps(tmp2, v1_lo, 0xE0);       //   v1 = { v1, v2,  *,  *, v7, v8, v10, v11 }
-        v1_lo = _mm256_blend_ps(v1_lo, tmp1, 0x0C);       //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
-
-        // v2 and v5 are taken from the finished v1_lo; tmp2's low half only holds v1 and v2.
-        v2_lo = _mm256_permute_ps(tmp0, 0x30);            //   v2 = { *,  *,  *, *, v8, *, v11, * }
-        tmp1  = _mm256_permute_ps(v1_lo, 0x31);           // tmp1 = { v2, *, v5, *,  *, *,   *, * }
-        v2_lo = _mm256_blend_ps(tmp1, v2_lo, 0xF0);       //   v2 = { v2, *, v5, *, v8, *, v11, * }
-
-        // Need to compute 4th implied vertex for the rectangle.
-        tmp2  = _mm256_sub_ps(v0_lo, v1_lo);
-        tmp2  = _mm256_add_ps(tmp2, v2_lo);               // tmp2 = {  w,  *,  x, *, y,  *,  z,  * }
-        tmp2  = _mm256_permute_ps(tmp2, 0xA0);            // tmp2 = {  *,  w,  *, x, *,  y,  *,  z }
-        v2_lo = _mm256_blend_ps(v2_lo, tmp2, 0xAA);       //   v2 = { v2,  w, v5, x, v8, y, v11, z }
-
-        v0[i] = _simd16_insert_ps(_simd16_setzero_ps(), v0_lo, 0); // 8-wide result goes into part 0 of a zeroed simd16
-        v1[i] = _simd16_insert_ps(_simd16_setzero_ps(), v1_lo, 0);
-        v2[i] = _simd16_insert_ps(_simd16_setzero_ps(), v2_lo, 0);
-    }
-
-    SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
-    return true;
-
-    // clang-format on
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief State 2 for RECT_LIST topology.
-///        Not implemented unless there is a use case for more than 8 rects.
-/// @param pa - State for PA state machine.
-/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
-/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
-/// etc.
-bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
-{
-    SWR_INVALID("Is rect list used for anything other then clears?"); // placeholder: flags any use of this state
-    SetNextPaState_simd16(
-        pa, PaRectList0_simd16, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
-    return true; // NOTE(review): returns true although verts[] is never written here
-}
-
-#endif
-//////////////////////////////////////////////////////////////////////////
-/// @brief This procedure is called by the Binner to assemble the attributes.
-///        Unlike position, which is stored vertically, the attributes are
-///        stored horizontally. The outputs from the VS, labeled as 'a' and
-///        'b' are vertical. This function needs to transpose the lanes
-///        containing the vertical attribute data into horizontal form.
-///        Only primIndex 0 and 1 (the two triangles of the first rect) are handled.
-/// @param pa - State for PA state machine.
-/// @param slot - Index into VS output for a given attribute.
-/// @param primIndex - Binner processes each triangle individually.
-/// @param verts - triangle output for binner; verts[0..2] receive the triangle's three vertices.
-void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
-{
-// We have 12 simdscalars contained within 3 simdvectors which
-// hold at least 8 triangles worth of data. We want to assemble a single
-// triangle with data in horizontal form.
-#if USE_SIMD16_FRONTEND
-    simdvector a;
-    simdvector b; // NOTE(review): filled below but never read in this function
-
-    if (!pa.useAlternateOffset)
-    {
-        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(a_16[i], 0);
-            b[i] = _simd16_extract_ps(a_16[i], 1);
-        }
-    }
-    else
-    {
-        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
-
-        for (uint32_t i = 0; i < 4; i += 1)
-        {
-            a[i] = _simd16_extract_ps(b_16[i], 0);
-            b[i] = _simd16_extract_ps(b_16[i], 1);
-            ;
-        }
-    }
-
-#else
-    simdvector& a = PaGetSimdVector(pa, 0, slot);
-
-#endif
-    // Convert from vertical to horizontal.
-    switch (primIndex)
-    {
-    case 0: // first triangle of the rect: { v0, v1, v2 }
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane1(a);
-        verts[2] = swizzleLane2(a);
-        break;
-    case 1: // second triangle of the rect: { v0, v2, implied 4th vertex }
-        verts[0] = swizzleLane0(a);
-        verts[1] = swizzleLane2(a);
-        verts[2] = _mm_blend_ps(verts[0], verts[1], 0xA); // components 0,2 from v0 and 1,3 from v2
-        break;
-    case 2:
-    case 3:
-    case 4:
-    case 5:
-    case 6:
-    case 7:
-        SWR_INVALID("Invalid primIndex: %d", primIndex); // only one rect (two triangles) is supported here
-        break;
-    };
-}
-
-PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT*      in_pDC,
-                           uint32_t           in_numPrims,
-                           uint8_t*           pStream,
-                           uint32_t           in_streamSizeInVerts,
-                           uint32_t           in_vertexStride,
-                           bool               in_isStreaming,
-                           uint32_t           numVertsPerPrim,
-                           PRIMITIVE_TOPOLOGY topo) :
-    PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride, numVertsPerPrim),
-    numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), cur(0), prev(0), first(0),
-    counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
-{
-    const API_STATE& state = GetApiState(pDC);
-
-    this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo;
-
-#if ENABLE_AVX512_SIMD16
-    pfnPaFunc_simd16 = nullptr;
-
-#endif
-    switch (this->binTopology)
-    {
-    case TOP_TRIANGLE_LIST:
-        this->pfnPaFunc = PaTriList0;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaTriList0_simd16;
-#endif
-        break;
-    case TOP_TRIANGLE_STRIP:
-        this->pfnPaFunc = PaTriStrip0;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
-#endif
-        break;
-    case TOP_TRIANGLE_FAN:
-        this->pfnPaFunc = PaTriFan0;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaTriFan0_simd16;
-#endif
-        break;
-    case TOP_QUAD_LIST:
-        this->pfnPaFunc = PaQuadList0;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaQuadList0_simd16;
-#endif
-        this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
-        break;
-    case TOP_QUAD_STRIP:
-        // quad strip pattern when decomposed into triangles is the same as verts strips
-        this->pfnPaFunc = PaTriStrip0;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
-#endif
-        this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
-        break;
-    case TOP_LINE_LIST:
-        this->pfnPaFunc = PaLineList0;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaLineList0_simd16;
-#endif
-        this->numPrims = in_numPrims;
-        break;
-    case TOP_LINE_STRIP:
-        this->pfnPaFunc = PaLineStrip0;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaLineStrip0_simd16;
-#endif
-        this->numPrims = in_numPrims;
-        break;
-    case TOP_LINE_LOOP:
-        this->pfnPaFunc = PaLineLoop0;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaLineLoop0_simd16;
-#endif
-        this->numPrims = in_numPrims;
-        break;
-    case TOP_POINT_LIST:
-        this->pfnPaFunc = PaPoints0;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPoints0_simd16;
-#endif
-        this->numPrims = in_numPrims;
-        break;
-    case TOP_RECT_LIST:
-        this->pfnPaFunc = PaRectList0;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaRectList0_simd16;
-#endif
-        this->numPrims = in_numPrims * 2;
-        break;
-
-    case TOP_PATCHLIST_1:
-        this->pfnPaFunc = PaPatchList<1>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<1>;
-#endif
-        break;
-    case TOP_PATCHLIST_2:
-        this->pfnPaFunc = PaPatchList<2>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<2>;
-#endif
-        break;
-    case TOP_PATCHLIST_3:
-        this->pfnPaFunc = PaPatchList<3>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<3>;
-#endif
-        break;
-    case TOP_PATCHLIST_4:
-        this->pfnPaFunc = PaPatchList<4>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<4>;
-#endif
-        break;
-    case TOP_PATCHLIST_5:
-        this->pfnPaFunc = PaPatchList<5>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<5>;
-#endif
-        break;
-    case TOP_PATCHLIST_6:
-        this->pfnPaFunc = PaPatchList<6>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<6>;
-#endif
-        break;
-    case TOP_PATCHLIST_7:
-        this->pfnPaFunc = PaPatchList<7>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<7>;
-#endif
-        break;
-    case TOP_PATCHLIST_8:
-        this->pfnPaFunc = PaPatchList<8>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<8>;
-#endif
-        break;
-    case TOP_PATCHLIST_9:
-        this->pfnPaFunc = PaPatchList<9>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<9>;
-#endif
-        break;
-    case TOP_PATCHLIST_10:
-        this->pfnPaFunc = PaPatchList<10>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<10>;
-#endif
-        break;
-    case TOP_PATCHLIST_11:
-        this->pfnPaFunc = PaPatchList<11>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<11>;
-#endif
-        break;
-    case TOP_PATCHLIST_12:
-        this->pfnPaFunc = PaPatchList<12>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<12>;
-#endif
-        break;
-    case TOP_PATCHLIST_13:
-        this->pfnPaFunc = PaPatchList<13>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<13>;
-#endif
-        break;
-    case TOP_PATCHLIST_14:
-        this->pfnPaFunc = PaPatchList<14>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<14>;
-#endif
-        break;
-    case TOP_PATCHLIST_15:
-        this->pfnPaFunc = PaPatchList<15>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<15>;
-#endif
-        break;
-    case TOP_PATCHLIST_16:
-        this->pfnPaFunc = PaPatchList<16>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<16>;
-#endif
-        break;
-    case TOP_PATCHLIST_17:
-        this->pfnPaFunc = PaPatchList<17>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<17>;
-#endif
-        break;
-    case TOP_PATCHLIST_18:
-        this->pfnPaFunc = PaPatchList<18>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<18>;
-#endif
-        break;
-    case TOP_PATCHLIST_19:
-        this->pfnPaFunc = PaPatchList<19>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<19>;
-#endif
-        break;
-    case TOP_PATCHLIST_20:
-        this->pfnPaFunc = PaPatchList<20>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<20>;
-#endif
-        break;
-    case TOP_PATCHLIST_21:
-        this->pfnPaFunc = PaPatchList<21>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<21>;
-#endif
-        break;
-    case TOP_PATCHLIST_22:
-        this->pfnPaFunc = PaPatchList<22>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<22>;
-#endif
-        break;
-    case TOP_PATCHLIST_23:
-        this->pfnPaFunc = PaPatchList<23>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<23>;
-#endif
-        break;
-    case TOP_PATCHLIST_24:
-        this->pfnPaFunc = PaPatchList<24>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<24>;
-#endif
-        break;
-    case TOP_PATCHLIST_25:
-        this->pfnPaFunc = PaPatchList<25>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<25>;
-#endif
-        break;
-    case TOP_PATCHLIST_26:
-        this->pfnPaFunc = PaPatchList<26>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<26>;
-#endif
-        break;
-    case TOP_PATCHLIST_27:
-        this->pfnPaFunc = PaPatchList<27>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<27>;
-#endif
-        break;
-    case TOP_PATCHLIST_28:
-        this->pfnPaFunc = PaPatchList<28>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<28>;
-#endif
-        break;
-    case TOP_PATCHLIST_29:
-        this->pfnPaFunc = PaPatchList<29>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<29>;
-#endif
-        break;
-    case TOP_PATCHLIST_30:
-        this->pfnPaFunc = PaPatchList<30>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<30>;
-#endif
-        break;
-    case TOP_PATCHLIST_31:
-        this->pfnPaFunc = PaPatchList<31>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<31>;
-#endif
-        break;
-    case TOP_PATCHLIST_32:
-        this->pfnPaFunc = PaPatchList<32>;
-#if ENABLE_AVX512_SIMD16
-        this->pfnPaFunc_simd16 = PaPatchList_simd16<32>;
-#endif
-        break;
-
-    default:
-        SWR_INVALID("Invalid topology: %d", this->binTopology);
-        break;
-    };
-
-    this->pfnPaFuncReset = this->pfnPaFunc;
-#if ENABLE_AVX512_SIMD16
-    this->pfnPaFuncReset_simd16 = this->pfnPaFunc_simd16;
-#endif
-
-#if USE_SIMD16_FRONTEND
-    simd16scalari id16 = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-    simd16scalari id82 = _simd16_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
-
-#else
-    simdscalari id8 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-    simdscalari id4 = _simd_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
-
-#endif
-    switch (this->binTopology)
-    {
-    case TOP_TRIANGLE_LIST:
-    case TOP_TRIANGLE_STRIP:
-    case TOP_TRIANGLE_FAN:
-    case TOP_LINE_STRIP:
-    case TOP_LINE_LIST:
-    case TOP_LINE_LOOP:
-#if USE_SIMD16_FRONTEND
-        this->primIDIncr = 16;
-        this->primID     = id16;
-#else
-        this->primIDIncr = 8;
-        this->primID = id8;
-#endif
-        break;
-    case TOP_QUAD_LIST:
-    case TOP_QUAD_STRIP:
-    case TOP_RECT_LIST:
-#if USE_SIMD16_FRONTEND
-        this->primIDIncr = 8;
-        this->primID     = id82;
-#else
-        this->primIDIncr = 4;
-        this->primID = id4;
-#endif
-        break;
-    case TOP_POINT_LIST:
-#if USE_SIMD16_FRONTEND
-        this->primIDIncr = 16;
-        this->primID     = id16;
-#else
-        this->primIDIncr = 8;
-        this->primID = id8;
-#endif
-        break;
-    case TOP_PATCHLIST_1:
-    case TOP_PATCHLIST_2:
-    case TOP_PATCHLIST_3:
-    case TOP_PATCHLIST_4:
-    case TOP_PATCHLIST_5:
-    case TOP_PATCHLIST_6:
-    case TOP_PATCHLIST_7:
-    case TOP_PATCHLIST_8:
-    case TOP_PATCHLIST_9:
-    case TOP_PATCHLIST_10:
-    case TOP_PATCHLIST_11:
-    case TOP_PATCHLIST_12:
-    case TOP_PATCHLIST_13:
-    case TOP_PATCHLIST_14:
-    case TOP_PATCHLIST_15:
-    case TOP_PATCHLIST_16:
-    case TOP_PATCHLIST_17:
-    case TOP_PATCHLIST_18:
-    case TOP_PATCHLIST_19:
-    case TOP_PATCHLIST_20:
-    case TOP_PATCHLIST_21:
-    case TOP_PATCHLIST_22:
-    case TOP_PATCHLIST_23:
-    case TOP_PATCHLIST_24:
-    case TOP_PATCHLIST_25:
-    case TOP_PATCHLIST_26:
-    case TOP_PATCHLIST_27:
-    case TOP_PATCHLIST_28:
-    case TOP_PATCHLIST_29:
-    case TOP_PATCHLIST_30:
-    case TOP_PATCHLIST_31:
-    case TOP_PATCHLIST_32:
-        // Always run KNOB_SIMD_WIDTH number of patches at a time.
-#if USE_SIMD16_FRONTEND
-        this->primIDIncr = 16;
-        this->primID     = id16;
-#else
-        this->primIDIncr = 8;
-        this->primID = id8;
-#endif
-        break;
-
-    default:
-        SWR_INVALID("Invalid topology: %d", this->binTopology);
-        break;
-    };
-}
-#endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
deleted file mode 100644
index c14cd56e52e..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
+++ /dev/null
@@ -1,473 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file rasterizer.cpp
- *
- * @brief Implementation for the rasterizer.
- *
- ******************************************************************************/
-
-#include <vector>
-#include <algorithm>
-
-#include "rasterizer.h"
-#include "backends/gen_rasterizer.hpp"
-#include "rdtsc_core.h"
-#include "backend.h"
-#include "utils.h"
-#include "frontend.h"
-#include "tilemgr.h"
-#include "memory/tilingtraits.h"
-#include "rasterizer_impl.h"
-
-PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT]
-                              [STATE_VALID_TRI_EDGE_COUNT][2];
-
-void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
-{
-    const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pData);
-#if KNOB_ENABLE_TOSS_POINTS
-    if (KNOB_TOSS_BIN_TRIS)
-    {
-        return;
-    }
-#endif
-
-    // bloat line to two tris and call the triangle rasterizer twice
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeLine, pDC->drawId);
-
-    const API_STATE&     state     = GetApiState(pDC);
-    const SWR_RASTSTATE& rastState = state.rastState;
-
-    // macrotile dimensioning
-    uint32_t macroX, macroY;
-    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
-    int32_t macroBoxLeft   = macroX * KNOB_MACROTILE_X_DIM_FIXED;
-    int32_t macroBoxRight  = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
-    int32_t macroBoxTop    = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
-    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
-
-    const SWR_RECT& scissorInFixedPoint =
-        state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
-
-    // create a copy of the triangle buffer to write our adjusted vertices to
-    OSALIGNSIMD(float) newTriBuffer[4 * 4];
-    TRIANGLE_WORK_DESC newWorkDesc = workDesc;
-    newWorkDesc.pTriBuffer         = &newTriBuffer[0];
-
-    // create a copy of the attrib buffer to write our adjusted attribs to
-    OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
-    newWorkDesc.pAttribs = &newAttribBuffer[0];
-
-    const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
-    const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
-
-    __m128 vX, vY, vZ, vRecipW;
-
-    vX      = _mm_load_ps(workDesc.pTriBuffer);
-    vY      = _mm_load_ps(workDesc.pTriBuffer + 4);
-    vZ      = _mm_load_ps(workDesc.pTriBuffer + 8);
-    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
-
-    // triangle 0
-    // v0,v1 -> v0,v0,v1
-    __m128 vXa      = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
-    __m128 vYa      = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
-    __m128 vZa      = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
-    __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
-
-    __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
-    __m128 vAdjust    = _mm_mul_ps(vLineWidth, vBloat0);
-    if (workDesc.triFlags.yMajor)
-    {
-        vXa = _mm_add_ps(vAdjust, vXa);
-    }
-    else
-    {
-        vYa = _mm_add_ps(vAdjust, vYa);
-    }
-
-    // Store triangle description for rasterizer
-    _mm_store_ps((float*)&newTriBuffer[0], vXa);
-    _mm_store_ps((float*)&newTriBuffer[4], vYa);
-    _mm_store_ps((float*)&newTriBuffer[8], vZa);
-    _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
-
-    // binner bins 3 edges for lines as v0, v1, v1
-    // tri0 needs v0, v0, v1
-    for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
-    {
-        __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
-        __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
-
-        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0);
-        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0);
-        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1);
-    }
-
-    // Store user clip distances for triangle 0
-    float    newClipBuffer[3 * 8];
-    uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
-    if (numClipDist)
-    {
-        newWorkDesc.pUserClipBuffer = newClipBuffer;
-
-        float* pOldBuffer = workDesc.pUserClipBuffer;
-        float* pNewBuffer = newClipBuffer;
-        for (uint32_t i = 0; i < numClipDist; ++i)
-        {
-            // read barycentric coeffs from binner
-            float a = *(pOldBuffer++);
-            float b = *(pOldBuffer++);
-
-            // reconstruct original clip distance at vertices
-            float c0 = a + b;
-            float c1 = b;
-
-            // construct triangle barycentrics
-            *(pNewBuffer++) = c0 - c1;
-            *(pNewBuffer++) = c0 - c1;
-            *(pNewBuffer++) = c1;
-        }
-    }
-
-    // setup triangle rasterizer function
-    PFN_WORK_FUNC pfnTriRast;
-    // conservative rast not supported for points/lines
-    pfnTriRast = GetRasterizerFunc(rastState.sampleCount,
-                                   rastState.bIsCenterPattern,
-                                   false,
-                                   SWR_INPUT_COVERAGE_NONE,
-                                   EdgeValToEdgeState(ALL_EDGES_VALID),
-                                   (pDC->pState->state.scissorsTileAligned == false));
-
-    // make sure this macrotile intersects the triangle
-    __m128i vXai = fpToFixedPoint(vXa);
-    __m128i vYai = fpToFixedPoint(vYa);
-    OSALIGNSIMD(SWR_RECT) bboxA;
-    calcBoundingBoxInt(vXai, vYai, bboxA);
-
-    if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax ||
-          bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
-          bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax ||
-          bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin))
-    {
-        // rasterize triangle
-        pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
-    }
-
-    // triangle 1
-    // v0,v1 -> v1,v1,v0
-    vXa      = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
-    vYa      = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
-    vZa      = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
-    vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
-
-    vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
-    if (workDesc.triFlags.yMajor)
-    {
-        vXa = _mm_add_ps(vAdjust, vXa);
-    }
-    else
-    {
-        vYa = _mm_add_ps(vAdjust, vYa);
-    }
-
-    // Store triangle description for rasterizer
-    _mm_store_ps((float*)&newTriBuffer[0], vXa);
-    _mm_store_ps((float*)&newTriBuffer[4], vYa);
-    _mm_store_ps((float*)&newTriBuffer[8], vZa);
-    _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
-
-    // binner bins 3 edges for lines as v0, v1, v1
-    // tri1 needs v1, v1, v0
-    for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
-    {
-        __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
-        __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
-
-        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
-        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
-        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
-    }
-
-    // store user clip distance for triangle 1
-    if (numClipDist)
-    {
-        float* pOldBuffer = workDesc.pUserClipBuffer;
-        float* pNewBuffer = newClipBuffer;
-        for (uint32_t i = 0; i < numClipDist; ++i)
-        {
-            // read barycentric coeffs from binner
-            float a = *(pOldBuffer++);
-            float b = *(pOldBuffer++);
-
-            // reconstruct original clip distance at vertices
-            float c0 = a + b;
-            float c1 = b;
-
-            // construct triangle barycentrics
-            *(pNewBuffer++) = c1 - c0;
-            *(pNewBuffer++) = c1 - c0;
-            *(pNewBuffer++) = c0;
-        }
-    }
-
-    vXai = fpToFixedPoint(vXa);
-    vYai = fpToFixedPoint(vYa);
-    calcBoundingBoxInt(vXai, vYai, bboxA);
-
-    if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax ||
-          bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
-          bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax ||
-          bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin))
-    {
-        // rasterize triangle
-        pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
-    }
-
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeLine, 1);
-}
-
-void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
-{
-#if KNOB_ENABLE_TOSS_POINTS
-    if (KNOB_TOSS_BIN_TRIS)
-    {
-        return;
-    }
-#endif
-
-    const TRIANGLE_WORK_DESC& workDesc     = *(const TRIANGLE_WORK_DESC*)pData;
-    const BACKEND_FUNCS&      backendFuncs = pDC->pState->backendFuncs;
-
-    // map x,y relative offsets from start of raster tile to bit position in
-    // coverage mask for the point
-    static const uint32_t coverageMap[8][8] = {{0, 1, 4, 5, 8, 9, 12, 13},
-                                               {2, 3, 6, 7, 10, 11, 14, 15},
-                                               {16, 17, 20, 21, 24, 25, 28, 29},
-                                               {18, 19, 22, 23, 26, 27, 30, 31},
-                                               {32, 33, 36, 37, 40, 41, 44, 45},
-                                               {34, 35, 38, 39, 42, 43, 46, 47},
-                                               {48, 49, 52, 53, 56, 57, 60, 61},
-                                               {50, 51, 54, 55, 58, 59, 62, 63}};
-
-    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc = {};
-
-    // pull point information from triangle buffer
-    // @todo use structs for readability
-    uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
-    uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
-    float    z            = *(workDesc.pTriBuffer + 2);
-
-    // construct triangle descriptor for point
-    // no interpolation, set up i,j for constant interpolation of z and attribs
-    // @todo implement an optimized backend that doesn't require triangle information
-
-    // compute coverage mask from x,y packed into the coverageMask flag
-    // mask indices by the maximum valid index for x/y of coveragemap.
-    uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
-    uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
-    for (uint32_t i = 0; i < _countof(triDesc.coverageMask); ++i)
-    {
-        triDesc.coverageMask[i] = 1ULL << coverageMap[tY][tX];
-    }
-    triDesc.anyCoveredSamples = triDesc.coverageMask[0];
-    triDesc.innerCoverageMask = triDesc.coverageMask[0];
-
-    // no persp divide needed for points
-    triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
-    triDesc.triFlags                         = workDesc.triFlags;
-    triDesc.recipDet                         = 1.0f;
-    triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
-    triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
-    triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
-    triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
-
-    RenderOutputBuffers renderBuffers;
-    GetRenderHotTiles(pDC,
-                      workerId,
-                      macroTile,
-                      tileAlignedX >> KNOB_TILE_X_DIM_SHIFT,
-                      tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
-                      renderBuffers,
-                      triDesc.triFlags.renderTargetArrayIndex);
-
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelBackend, pDC->drawId);
-    backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
-    RDTSC_END(pDC->pContext->pBucketMgr, BEPixelBackend, 0);
-}
-
-void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
-{
-    const TRIANGLE_WORK_DESC& workDesc     = *(const TRIANGLE_WORK_DESC*)pData;
-    const SWR_RASTSTATE&      rastState    = pDC->pState->state.rastState;
-    const SWR_BACKEND_STATE&  backendState = pDC->pState->state.backendState;
-
-    bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
-
-    // load point vertex
-    float x = *workDesc.pTriBuffer;
-    float y = *(workDesc.pTriBuffer + 1);
-    float z = *(workDesc.pTriBuffer + 2);
-
-    // create a copy of the triangle buffer to write our adjusted vertices to
-    OSALIGNSIMD(float) newTriBuffer[4 * 4];
-    TRIANGLE_WORK_DESC newWorkDesc = workDesc;
-    newWorkDesc.pTriBuffer         = &newTriBuffer[0];
-
-    // create a copy of the attrib buffer to write our adjusted attribs to
-    OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
-    newWorkDesc.pAttribs = &newAttribBuffer[0];
-
-    newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
-    newWorkDesc.numAttribs      = workDesc.numAttribs;
-    newWorkDesc.triFlags        = workDesc.triFlags;
-
-    // construct two tris by bloating point by point size
-    float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
-    float lowerX        = x - halfPointSize;
-    float upperX        = x + halfPointSize;
-    float lowerY        = y - halfPointSize;
-    float upperY        = y + halfPointSize;
-
-    // tri 0
-    float* pBuf = &newTriBuffer[0];
-    *pBuf++     = lowerX;
-    *pBuf++     = lowerX;
-    *pBuf++     = upperX;
-    pBuf++;
-    *pBuf++ = lowerY;
-    *pBuf++ = upperY;
-    *pBuf++ = upperY;
-    pBuf++;
-    _mm_store_ps(pBuf, _mm_set1_ps(z));
-    _mm_store_ps(pBuf += 4, _mm_set1_ps(1.0f));
-
-    // setup triangle rasterizer function
-    PFN_WORK_FUNC pfnTriRast;
-    // conservative rast not supported for points/lines
-    pfnTriRast = GetRasterizerFunc(rastState.sampleCount,
-                                   rastState.bIsCenterPattern,
-                                   false,
-                                   SWR_INPUT_COVERAGE_NONE,
-                                   EdgeValToEdgeState(ALL_EDGES_VALID),
-                                   (pDC->pState->state.scissorsTileAligned == false));
-
-    // overwrite texcoords for point sprites
-    if (isPointSpriteTexCoordEnabled)
-    {
-        // copy original attribs
-        memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
-        newWorkDesc.pAttribs = &newAttribBuffer[0];
-
-        // overwrite texcoord for point sprites
-        uint32_t texCoordMask   = backendState.pointSpriteTexCoordMask;
-        unsigned long texCoordAttrib = 0;
-
-        while (_BitScanForward(&texCoordAttrib, texCoordMask))
-        {
-            texCoordMask &= ~(1 << texCoordAttrib);
-            __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
-            if (rastState.pointSpriteTopOrigin)
-            {
-                pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
-                pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
-                pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
-            }
-            else
-            {
-                pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
-                pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
-                pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
-            }
-        }
-    }
-    else
-    {
-        // no texcoord overwrite, can reuse the attrib buffer from frontend
-        newWorkDesc.pAttribs = workDesc.pAttribs;
-    }
-
-    pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
-
-    // tri 1
-    pBuf    = &newTriBuffer[0];
-    *pBuf++ = lowerX;
-    *pBuf++ = upperX;
-    *pBuf++ = upperX;
-    pBuf++;
-    *pBuf++ = lowerY;
-    *pBuf++ = upperY;
-    *pBuf++ = lowerY;
-    // z, w unchanged
-
-    if (isPointSpriteTexCoordEnabled)
-    {
-        uint32_t texCoordMask   = backendState.pointSpriteTexCoordMask;
-        unsigned long texCoordAttrib = 0;
-
-        while (_BitScanForward(&texCoordAttrib, texCoordMask))
-        {
-            texCoordMask &= ~(1 << texCoordAttrib);
-            __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
-            if (rastState.pointSpriteTopOrigin)
-            {
-                pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
-                pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
-                pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
-            }
-            else
-            {
-                pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
-                pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
-                pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
-            }
-        }
-    }
-
-    pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
-}
-
-void InitRasterizerFunctions()
-{
-    InitRasterizerFuncs();
-}
-
-// Selector for correct templated RasterizeTriangle function
-PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,
-                                bool                  IsCenter,
-                                bool                  IsConservative,
-                                SWR_INPUT_COVERAGE    InputCoverage,
-                                uint32_t              EdgeEnable,
-                                bool                  RasterizeScissorEdges)
-{
-    SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT);
-    SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT);
-    SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT);
-
-    PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage]
-                                         [EdgeEnable][RasterizeScissorEdges];
-    SWR_ASSERT(func);
-
-    return func;
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
deleted file mode 100644
index f15cc193129..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
+++ /dev/null
@@ -1,237 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file rasterizer.h
- *
- * @brief Definitions for the rasterizer.
- *
- ******************************************************************************/
-#pragma once
-
-#include "context.h"
-#include <type_traits>
-#include "conservativeRast.h"
-#include "multisample.h"
-
-// Rasterizer entry points for lines and points. Each takes the draw context,
-// the worker id, a macrotile argument and an opaque pointer to the work data.
-void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
-void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
-void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
-// Rasterizer initialization entry point; takes no arguments.
-void InitRasterizerFunctions();
-
-/// @brief Converts four packed floats to fixed point: each lane is multiplied
-/// by FIXED_POINT_SCALE and then converted to a 32-bit integer.
-/// @param vIn - four single-precision input values
-/// @return four 32-bit integers holding the scaled values
-INLINE
-__m128i fpToFixedPoint(const __m128 vIn)
-{
-    __m128 vFixed = _mm_mul_ps(vIn, _mm_set1_ps(FIXED_POINT_SCALE));
-    // _mm_cvtps_epi32 rounds according to the current MXCSR rounding mode
-    return _mm_cvtps_epi32(vFixed);
-}
-
-// Dense (0..4) index for each supported combination of valid triangle edges.
-// STATE_VALID_TRI_EDGE_COUNT is the number of states and is used as a
-// dimension of the gRasterizerFuncs table.
-enum TriEdgesStates
-{
-    STATE_NO_VALID_EDGES = 0,
-    STATE_E0_E1_VALID,
-    STATE_E0_E2_VALID,
-    STATE_E1_E2_VALID,
-    STATE_ALL_EDGES_VALID,
-    STATE_VALID_TRI_EDGE_COUNT,
-};
-
-// Bitmask form of the valid-edge combinations: bit 0 = edge 0, bit 1 = edge 1,
-// bit 2 = edge 2. VALID_TRI_EDGE_COUNT follows ALL_EDGES_VALID and is
-// therefore 8; EdgeValToEdgeState uses it as its lookup table size.
-enum TriEdgesValues
-{
-    NO_VALID_EDGES  = 0,
-    E0_E1_VALID     = 0x3,
-    E0_E2_VALID     = 0x5,
-    E1_E2_VALID     = 0x6,
-    ALL_EDGES_VALID = 0x7,
-    VALID_TRI_EDGE_COUNT,
-};
-
-// Selector for correct templated RasterizeTriangle function.
-// EdgeEnable is an edge *state* index (TriEdgesStates), not an edge bitmask:
-// the definition asserts EdgeEnable < STATE_VALID_TRI_EDGE_COUNT.
-PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,
-                                bool                  IsCenter,
-                                bool                  IsConservative,
-                                SWR_INPUT_COVERAGE    InputCoverage,
-                                uint32_t              EdgeEnable,
-                                bool                  RasterizeScissorEdges);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief ValidTriEdges convenience typedefs used for templated function
-/// specialization.
-/// The first group wraps the edge bitmask values (TriEdgesValues).
-typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> AllEdgesValidT;
-typedef std::integral_constant<uint32_t, E0_E1_VALID>     E0E1ValidT;
-typedef std::integral_constant<uint32_t, E0_E2_VALID>     E0E2ValidT;
-typedef std::integral_constant<uint32_t, E1_E2_VALID>     E1E2ValidT;
-typedef std::integral_constant<uint32_t, NO_VALID_EDGES>  NoEdgesValidT;
-
-/// The second group wraps the dense edge state indices (TriEdgesStates).
-typedef std::integral_constant<uint32_t, STATE_ALL_EDGES_VALID> StateAllEdgesValidT;
-typedef std::integral_constant<uint32_t, STATE_E0_E1_VALID>     StateE0E1ValidT;
-typedef std::integral_constant<uint32_t, STATE_E0_E2_VALID>     StateE0E2ValidT;
-typedef std::integral_constant<uint32_t, STATE_E1_E2_VALID>     StateE1E2ValidT;
-typedef std::integral_constant<uint32_t, STATE_NO_VALID_EDGES>  StateNoEdgesValidT;
-
-// Compile-time conversion from an edge state type (State*ValidT) to the
-// matching edge bitmask type (*ValidT), exposed as the nested typedef T.
-// The primary template defines no T; its static_assert fails when it is
-// instantiated with a value <= STATE_ALL_EDGES_VALID, so only the explicit
-// specializations below are usable.
-template <typename EdgeMask>
-struct EdgeMaskVal
-{
-    static_assert(EdgeMask::value > STATE_ALL_EDGES_VALID,
-                  "Primary EdgeMaskVal shouldn't be instantiated");
-};
-
-// STATE_ALL_EDGES_VALID -> ALL_EDGES_VALID
-template <>
-struct EdgeMaskVal<StateAllEdgesValidT>
-{
-    typedef AllEdgesValidT T;
-};
-
-// STATE_E0_E1_VALID -> E0_E1_VALID
-template <>
-struct EdgeMaskVal<StateE0E1ValidT>
-{
-    typedef E0E1ValidT T;
-};
-
-// STATE_E0_E2_VALID -> E0_E2_VALID
-template <>
-struct EdgeMaskVal<StateE0E2ValidT>
-{
-    typedef E0E2ValidT T;
-};
-
-// STATE_E1_E2_VALID -> E1_E2_VALID
-template <>
-struct EdgeMaskVal<StateE1E2ValidT>
-{
-    typedef E1E2ValidT T;
-};
-
-// STATE_NO_VALID_EDGES -> NO_VALID_EDGES
-template <>
-struct EdgeMaskVal<StateNoEdgesValidT>
-{
-    typedef NoEdgesValidT T;
-};
-
-/// @brief Maps a triangle edge bitmask (TriEdgesValues) to its dense state
-/// index (TriEdgesStates). Masks with fewer than two edges set map to
-/// STATE_NO_VALID_EDGES.
-/// @param val - edge bitmask; must be < VALID_TRI_EDGE_COUNT
-INLINE uint32_t EdgeValToEdgeState(uint32_t val)
-{
-    SWR_ASSERT(val < VALID_TRI_EDGE_COUNT, "Unexpected tri edge mask");
-
-    // indexed by edge bitmask 0x0..0x7
-    static const uint32_t stateForMask[VALID_TRI_EDGE_COUNT] = {
-        STATE_NO_VALID_EDGES,  // 0x0
-        STATE_NO_VALID_EDGES,  // 0x1
-        STATE_NO_VALID_EDGES,  // 0x2
-        STATE_E0_E1_VALID,     // 0x3
-        STATE_NO_VALID_EDGES,  // 0x4
-        STATE_E0_E2_VALID,     // 0x5
-        STATE_E1_E2_VALID,     // 0x6
-        STATE_ALL_EDGES_VALID, // 0x7
-    };
-    return stateForMask[val];
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct RasterEdgeTraits
-/// @brief Primary RasterEdgeTraits templated struct that holds compile
-/// time information about the number of edges needed to be rasterized,
-/// If either the scissor rect or conservative rast is enabled,
-/// the scissor test is enabled and the rasterizer will test
-/// 3 triangle edges + 4 scissor edges for coverage.
-/// @tparam RasterScissorEdgesT: do the scissor edges need to be rasterized?
-/// @tparam ConservativeT: is this a conservative rasterization
-/// @tparam EdgeMaskT: Which edges are valid(not degenerate); an edge *state*
-/// type that is converted to a bitmask type through EdgeMaskVal
-template <typename RasterScissorEdgesT, typename ConservativeT, typename EdgeMaskT>
-struct RasterEdgeTraits
-{
-    typedef std::true_type                      RasterizeScissorEdgesT;
-    typedef std::integral_constant<uint32_t, 7> NumEdgesT;
-    // typedef std::integral_constant<uint32_t, EdgeMaskT::value> ValidEdgeMaskT;
-    typedef typename EdgeMaskVal<EdgeMaskT>::T ValidEdgeMaskT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief specialization of RasterEdgeTraits. If neither scissor rect
-/// nor conservative rast is enabled, only test 3 triangle edges
-/// for coverage. EdgeMaskT is ignored here: ValidEdgeMaskT is always
-/// ALL_EDGES_VALID.
-template <typename EdgeMaskT>
-struct RasterEdgeTraits<std::false_type, std::false_type, EdgeMaskT>
-{
-    typedef std::false_type                     RasterizeScissorEdgesT;
-    typedef std::integral_constant<uint32_t, 3> NumEdgesT;
-    // no need for degenerate edge masking in non-conservative case; rasterize all triangle edges
-    typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> ValidEdgeMaskT;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct _RasterizerTraits
-/// @brief templated struct that holds compile time information used
-/// during rasterization. Inherits RasterEdgeTraits and ConservativeRastBETraits.
-/// @tparam NumSamplesT: number of multisamples
-/// @tparam CenterPatternT: passed as the second argument of MultisampleTraits
-/// @tparam ConservativeT: is this a conservative rasterization
-/// @tparam InputCoverageT: what type of input coverage is the PS expecting?
-/// (only used with conservative rasterization)
-/// @tparam EdgeEnableT: valid-edge state, forwarded to RasterEdgeTraits
-/// @tparam RasterScissorEdgesT: do we need to rasterize with a scissor?
-template <typename NumSamplesT,
-          typename CenterPatternT,
-          typename ConservativeT,
-          typename InputCoverageT,
-          typename EdgeEnableT,
-          typename RasterScissorEdgesT>
-struct _RasterizerTraits : public ConservativeRastBETraits<ConservativeT, InputCoverageT>,
-                           public RasterEdgeTraits<RasterScissorEdgesT, ConservativeT, EdgeEnableT>
-{
-    /// Multisample traits for this sample count / center pattern combination
-    typedef MultisampleTraits<static_cast<SWR_MULTISAMPLE_COUNT>(NumSamplesT::value),
-                              CenterPatternT::value>
-        MT;
-
-    /// Fixed point precision the rasterizer is using
-    typedef FixedPointTraits<Fixed_16_8> PrecisionT;
-    /// Fixed point precision of the edge tests used during rasterization
-    typedef FixedPointTraits<Fixed_X_16> EdgePrecisionT;
-
-    // If conservative rast or MSAA center pattern is enabled, only need a single sample coverage
-    // test, with the result copied to all samples
-    typedef std::integral_constant<int, ConservativeT::value ? 1 : MT::numCoverageSamples>
-        NumCoverageSamplesT;
-
-    // The edge tests must carry at least as many fractional bits as the
-    // conservative rasterization precision inherited from ConservativeRastBETraits.
-    static_assert(
-        EdgePrecisionT::BitsT::value >=
-            ConservativeRastBETraits<ConservativeT,
-                                     InputCoverageT>::ConservativePrecisionT::BitsT::value,
-        "Rasterizer edge fixed point precision < required conservative rast precision");
-
-    /// constants used to offset between different types of raster tiles
-    /// (*TileStep: bytes per raster tile across all samples;
-    ///  *TileRowStep: bytes per row of raster tiles within a macrotile)
-    static const int colorRasterTileStep{
-        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)) *
-        MT::numSamples};
-    static const int depthRasterTileStep{
-        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)) *
-        MT::numSamples};
-    static const int stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM *
-                                            (FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)) *
-                                           MT::numSamples};
-    static const int colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
-                                            colorRasterTileStep};
-    static const int depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
-                                            depthRasterTileStep};
-    static const int stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
-                                              stencilRasterTileStep};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct RasterizerTraits
-/// @brief Convenience wrapper that takes plain uint32_t template arguments
-/// and wraps them in std::integral_constant types for _RasterizerTraits.
-/// CenterPatternT, ConservativeT and RasterScissorEdgesT are treated as
-/// booleans (any non-zero value is true).
-template <uint32_t NumSamplesT,
-          uint32_t CenterPatternT,
-          uint32_t ConservativeT,
-          uint32_t InputCoverageT,
-          uint32_t EdgeEnableT,
-          uint32_t RasterScissorEdgesT>
-struct RasterizerTraits final
-    : public _RasterizerTraits<std::integral_constant<uint32_t, NumSamplesT>,
-                               std::integral_constant<bool, CenterPatternT != 0>,
-                               std::integral_constant<bool, ConservativeT != 0>,
-                               std::integral_constant<uint32_t, InputCoverageT>,
-                               std::integral_constant<uint32_t, EdgeEnableT>,
-                               std::integral_constant<bool, RasterScissorEdgesT != 0>>
-{
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
deleted file mode 100644
index 2153fe653b1..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
+++ /dev/null
@@ -1,1542 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file rasterizer_impl.h
- *
- * @brief Implementation for the rasterizer.
- *
- ******************************************************************************/
-
-#include <algorithm>
-#include <cstring>
-#include <vector>
-
-#include "rasterizer.h"
-#include "rdtsc_core.h"
-#include "backend.h"
-#include "utils.h"
-#include "frontend.h"
-#include "tilemgr.h"
-#include "memory/tilingtraits.h"
-
-// Table of rasterizer work functions. GetRasterizerFunc indexes it as
-// [numSamples][IsCenter][IsConservative][InputCoverage][EdgeEnable][RasterizeScissorEdges].
-extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT]
-                                     [STATE_VALID_TRI_EDGE_COUNT][2];
-
-// Forward declarations of templated helpers; no definition is visible in this
-// part of the file.
-template <uint32_t numSamples = 1>
-void GetRenderHotTiles(DRAW_CONTEXT*        pDC,
-                       uint32_t             workerId,
-                       uint32_t             macroID,
-                       uint32_t             x,
-                       uint32_t             y,
-                       RenderOutputBuffers& renderBuffers,
-                       uint32_t             renderTargetArrayIndex);
-template <typename RT>
-void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers);
-template <typename RT>
-void StepRasterTileY(uint32_t             colorHotTileMask,
-                     RenderOutputBuffers& buffers,
-                     RenderOutputBuffers& startBufferRow);
-
-// Builds a 4-lane initializer from four 0/1 flags; i0 becomes lane 0. Each flag
-// is negated, so a set flag becomes -1 (sign bit set once stored as a double)
-// and a clear flag stays 0.
-#define MASKTOVEC(i3, i2, i1, i0) \
-    {                             \
-        -i0, -i1, -i2, -i3        \
-    }
-// Lookup table indexed by a 4-bit lane mask: entry m has the sign bit set in
-// lane k exactly when bit k of m is set. adjustTopLeftRuleIntFix16 uses it as
-// the selector operand of _mm256_blendv_pd, which selects on the sign bit.
-static const __m256d gMaskToVecpd[] = {
-    MASKTOVEC(0, 0, 0, 0),
-    MASKTOVEC(0, 0, 0, 1),
-    MASKTOVEC(0, 0, 1, 0),
-    MASKTOVEC(0, 0, 1, 1),
-    MASKTOVEC(0, 1, 0, 0),
-    MASKTOVEC(0, 1, 0, 1),
-    MASKTOVEC(0, 1, 1, 0),
-    MASKTOVEC(0, 1, 1, 1),
-    MASKTOVEC(1, 0, 0, 0),
-    MASKTOVEC(1, 0, 0, 1),
-    MASKTOVEC(1, 0, 1, 0),
-    MASKTOVEC(1, 0, 1, 1),
-    MASKTOVEC(1, 1, 0, 0),
-    MASKTOVEC(1, 1, 0, 1),
-    MASKTOVEC(1, 1, 1, 0),
-    MASKTOVEC(1, 1, 1, 1),
-};
-
-// 2D integer position. ComputeEdgeData derives edge coefficients from the
-// difference of two POS values.
-// NOTE(review): presumably x.8 fixed point, matching the "fix8" edge
-// coefficients - confirm against callers.
-struct POS
-{
-    int32_t x, y;
-};
-
-// Per-edge data precomputed by ComputeEdgeData and consumed while sweeping
-// quads and raster tiles. All values are stored as doubles.
-struct EDGE
-{
-    double a, b;            // a, b edge coefficients in fix8
-    double stepQuadX;       // step to adjacent horizontal quad in fix16
-    double stepQuadY;       // step to adjacent vertical quad in fix16
-    double stepRasterTileX; // step to adjacent horizontal raster tile in fix16
-    double stepRasterTileY; // step to adjacent vertical raster tile in fix16
-
-    __m256d vQuadOffsets;       // offsets for 4 samples of a quad
-    __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief rasterize a raster tile partially covered by the triangle
-/// @tparam NumEdges - number of edge equations to evaluate
-/// @tparam EdgeMaskT - valid-edge bitmask type, used by the unrolled 8x8 path
-/// @param pDC - draw context (not referenced in this function body)
-/// @param startEdges - per-edge starting values of the edge equation
-///        (Ax + By + C); vQuadOffsets is added to them to get the values at
-///        the 4 samples of the first quad
-/// @param pRastEdges - per-edge data: quad offsets and the X/Y steps used to
-///        move between quads when sweeping over the raster tile
-/// @return 64-bit coverage mask with 4 bits per 2x2 quad; the quad in column
-///         qx of quad row qy occupies bits starting at 4 * (qy * quadsPerRow + qx).
-///         A bit is set when the sign bit of every evaluated edge is set for
-///         that sample.
-template <uint32_t NumEdges, typename EdgeMaskT>
-INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT* pDC,
-                                     double        startEdges[NumEdges],
-                                     EDGE*         pRastEdges)
-{
-    uint64_t coverageMask = 0;
-
-    __m256d vEdges[NumEdges];
-    __m256d vStepX[NumEdges];
-    __m256d vStepY[NumEdges];
-
-    for (uint32_t e = 0; e < NumEdges; ++e)
-    {
-        // Step to the pixel sample locations of the 1st quad
-        vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
-
-        // compute step to next quad (mul by 2 in x and y direction)
-        vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
-        vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
-    }
-
-    // fast unrolled version for 8x8 tile
-#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
-    int      edgeMask[NumEdges];
-    uint64_t mask;
-
-    // Per-edge operations applied through UnrollerLMask.
-    // NOTE(review): UnrollerLMask presumably invokes the lambda only for the
-    // edges enabled in EdgeMaskT::value - confirm against its definition.
-    auto eval_lambda   = [&](int e) { edgeMask[e] = _mm256_movemask_pd(vEdges[e]); };
-    auto update_lambda = [&](int e) { mask &= edgeMask[e]; };
-    auto incx_lambda   = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); };
-    auto incy_lambda   = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]); };
-    auto decx_lambda   = [&](int e) { vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]); };
-
-// evaluate which pixels in the quad are covered
-#define EVAL UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
-
-    // update coverage mask
-    // if edge 0 is degenerate and will be skipped; init the mask
-#define UPDATE_MASK(bit)                                                  \
-    if (std::is_same<EdgeMaskT, E1E2ValidT>::value ||                     \
-        std::is_same<EdgeMaskT, NoEdgesValidT>::value)                    \
-    {                                                                     \
-        mask = 0xf;                                                       \
-    }                                                                     \
-    else                                                                  \
-    {                                                                     \
-        mask = edgeMask[0];                                               \
-    }                                                                     \
-    UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
-    coverageMask |= (mask << bit);
-
-    // step in the +x direction to the next quad
-#define INCX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
-
-    // step in the +y direction to the next quad
-#define INCY UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
-
-    // step in the -x direction to the next quad
-#define DECX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
-
-    // sweep 2x2 quad back and forth through the raster tile,
-    // computing coverage masks for the entire tile
-
-    // raster tile
-    // 0  1  2  3  4  5  6  7
-    // x  x
-    // x  x ------------------>
-    //                   x  x  |
-    // <-----------------x  x  V
-    // ..
-
-    // The sweep alternates direction per quad row, but the bit offsets passed
-    // to UPDATE_MASK keep the quads of each row in left-to-right bit order.
-
-    // row 0
-    EVAL;
-    UPDATE_MASK(0);
-    INCX;
-    EVAL;
-    UPDATE_MASK(4);
-    INCX;
-    EVAL;
-    UPDATE_MASK(8);
-    INCX;
-    EVAL;
-    UPDATE_MASK(12);
-    INCY;
-
-    // row 1
-    EVAL;
-    UPDATE_MASK(28);
-    DECX;
-    EVAL;
-    UPDATE_MASK(24);
-    DECX;
-    EVAL;
-    UPDATE_MASK(20);
-    DECX;
-    EVAL;
-    UPDATE_MASK(16);
-    INCY;
-
-    // row 2
-    EVAL;
-    UPDATE_MASK(32);
-    INCX;
-    EVAL;
-    UPDATE_MASK(36);
-    INCX;
-    EVAL;
-    UPDATE_MASK(40);
-    INCX;
-    EVAL;
-    UPDATE_MASK(44);
-    INCY;
-
-    // row 3
-    EVAL;
-    UPDATE_MASK(60);
-    DECX;
-    EVAL;
-    UPDATE_MASK(56);
-    DECX;
-    EVAL;
-    UPDATE_MASK(52);
-    DECX;
-    EVAL;
-    UPDATE_MASK(48);
-#else
-    // Generic fallback for other tile sizes. It evaluates all NumEdges edges
-    // and does not consult EdgeMaskT.
-    uint32_t bit = 0;
-    for (uint32_t y = 0; y < KNOB_TILE_Y_DIM / 2; ++y)
-    {
-        __m256d vStartOfRowEdge[NumEdges];
-        for (uint32_t e = 0; e < NumEdges; ++e)
-        {
-            vStartOfRowEdge[e] = vEdges[e];
-        }
-
-        for (uint32_t x = 0; x < KNOB_TILE_X_DIM / 2; ++x)
-        {
-            int edgeMask[NumEdges];
-            for (uint32_t e = 0; e < NumEdges; ++e)
-            {
-                edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
-            }
-
-            uint64_t mask = edgeMask[0];
-            for (uint32_t e = 1; e < NumEdges; ++e)
-            {
-                mask &= edgeMask[e];
-            }
-            coverageMask |= (mask << bit);
-
-            // step to the next quad in the x
-            for (uint32_t e = 0; e < NumEdges; ++e)
-            {
-                vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
-            }
-            bit += 4;
-        }
-
-        // step to the next row
-        for (uint32_t e = 0; e < NumEdges; ++e)
-        {
-            vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
-        }
-    }
-#endif
-    return coverageMask;
-}
-// Top left rule:
-// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top'
-//      edge.
-// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space,
-//       it is a 'left' edge.
-// Top left: a sample that lies exactly on an edge is in only if that edge is a top or left edge.
-//
-// This function implements the rule per lane: where vA < 0, or vA == 0 and vB < 0, the evaluated
-// edge value is decremented by 1.0; the other lanes are left unchanged.
-// vA, vB - integer A and B edge coefficients (the low 4 x 32-bit lanes are used)
-// vEdge  - evaluated edge values, adjusted in place
-INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d& vEdge)
-{
-    // if vA < 0, vC--
-    // if vA == 0 && vB < 0, vC--
-
-    __m256d vEdgeOut    = vEdge;
-    __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
-
-    // if vA < 0 (line is not horizontal and below)
-    int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
-
-    // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
-    __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
-    int     msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
-    msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
-
-    // if either of these are true, select the decremented value for that lane.
-    // NOTE(review): the decrement is applied regardless of the edge value; presumably the edge
-    // values are integer-valued fixed point, so only a value of exactly 0 (a sample on the line)
-    // changes sign and is bumped outside the line - confirm.
-    vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief calculates difference in precision between the result of manh
-/// calculation and the edge precision, based on compile time trait values
-/// @tparam RT: rasterizer traits
-/// @return (PrecisionT bits + ConservativePrecisionT bits) - EdgePrecisionT
-/// bits; the static_assert guarantees the result is non-negative. Callers use
-/// it as the number of fractional bits to drop from the manh product.
-template <typename RT>
-constexpr int64_t ManhToEdgePrecisionAdjust()
-{
-    static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >=
-                      RT::EdgePrecisionT::BitsT::value,
-                  "Inadequate precision of result of manh calculation ");
-    return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) -
-            RT::EdgePrecisionT::BitsT::value);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct adjustEdgeConservative
-/// @brief Primary template definition used for partially specializing
-/// the adjustEdgeConservative function. The adjustment is performed by the
-/// constructor, so it is invoked by constructing a temporary.
-/// @tparam RT: rasterizer traits
-/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
-template <typename RT, typename ConservativeEdgeOffsetT>
-struct adjustEdgeConservative
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs calculations to adjust each edge of a triangle away
-    /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-    /// direction.
-    ///
-    /// Uncertainty regions arise from fixed point rounding, which
-    /// can snap a vertex +/- by min fixed point value.
-    /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
-    /// This allows the rasterizer to test for coverage only at the pixel center,
-    /// instead of having to test individual pixel corners for conservative coverage
-    /// @param vAi - integer A coefficients of the edges
-    /// @param vBi - integer B coefficients of the edges
-    /// @param vEdge - evaluated edge values, adjusted in place
-    INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
-    {
-        // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge
-        // away from the pixel center (in the direction of the edge normal A/B)
-
-        // edge = Ax + By + C - (manh/e)
-        // manh = manhattan distance = abs(A) + abs(B)
-        // e = absolute rounding error from snapping from float to fixed point precision
-
-        // 'fixed point' multiply (in double to be avx1 friendly)
-        // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
-        __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)),
-                vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
-        __m256d manh =
-            _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
-                          _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
-
-        static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >=
-                          RT::EdgePrecisionT::BitsT::value,
-                      "Inadequate precision of result of manh calculation ");
-
-        // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the
-        // same precision. Since we're doing fixed math in double format, a right shift by N bits
-        // is a multiply by 2^-N. (N * 0.5 equals 2^-N only when N == 1; the scalar helpers
-        // adjustScissorEdge/adjustScalarEdge shift right by the same N.)
-        constexpr double manhToEdgeScale =
-            1.0 / static_cast<double>(int64_t{1} << ManhToEdgePrecisionAdjust<RT>());
-        manh = _mm256_mul_pd(manh, _mm256_set1_pd(manhToEdgeScale));
-
-        // move the edge away from the pixel center by the required conservative precision + 1/2
-        // pixel this allows the rasterizer to do a single conservative coverage test to see if the
-        // primitive intersects the pixel at all
-        vEdge = _mm256_sub_pd(vEdge, manh);
-    };
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief adjustEdgeConservative specialization where no edge offset is needed
-/// (offset type is std::integral_constant<int32_t, 0>); the constructor is a
-/// no-op and leaves vEdge untouched.
-template <typename RT>
-struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
-{
-    INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge){};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief calculates the distance a degenerate BBox needs to be adjusted
-/// for conservative rast based on compile time trait values
-/// @tparam RT: rasterizer traits
-/// @return RT::ConservativeEdgeOffsetT::value, reduced by
-/// 1 << (ConservativePrecisionT bits - PrecisionT bits) when
-/// RT::ValidEdgeMaskT is anything other than ALL_EDGES_VALID
-template <typename RT>
-constexpr int64_t ConservativeScissorOffset()
-{
-    static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0,
-                  "Rasterizer precision > conservative precision");
-    // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox
-    // when calculating scissor edges
-    typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1>
-        DegenerateEdgeOffsetT;
-    // 1/2 pixel edge offset + conservative offset - degenerateTriangle
-    return RT::ConservativeEdgeOffsetT::value -
-           (DegenerateEdgeOffsetT::value
-            << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Adjusts a vector of evaluated scissor edge values outward
-/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-/// direction.
-/// @tparam RT: rasterizer traits
-/// @param a, b - edge coefficients; truncated to integers before use
-/// @param vEdge - evaluated edge values; the same offset is subtracted from
-/// every lane
-template <typename RT>
-INLINE void adjustScissorEdge(const double a, const double b, __m256d& vEdge)
-{
-    const int64_t absA = std::abs(static_cast<int64_t>(a));
-    const int64_t absB = std::abs(static_cast<int64_t>(b));
-
-    // manhattan distance scaled by the scissor offset, then shifted down to
-    // the edge precision
-    const int64_t scaled = (absA * ConservativeScissorOffset<RT>()) +
-                           (absB * ConservativeScissorOffset<RT>());
-    const int64_t manh = scaled >> ManhToEdgePrecisionAdjust<RT>();
-
-    vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(static_cast<double>(manh)));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Adjusts a single scalar evaluated edge value outward
-/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-/// direction.
-/// @tparam RT: rasterizer traits
-/// @tparam OffsetT: compile-time offset applied to abs(a) and abs(b)
-/// @param a, b - edge coefficients; truncated to integers before use
-/// @param Edge - evaluated edge value
-/// @return Edge minus the computed offset
-template <typename RT, typename OffsetT>
-INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
-{
-    const int64_t absA = std::abs(static_cast<int64_t>(a));
-    const int64_t absB = std::abs(static_cast<int64_t>(b));
-
-    // manhattan distance scaled by the offset, then shifted down to the edge
-    // precision
-    const int64_t scaled = (absA * OffsetT::value) + (absB * OffsetT::value);
-    const int64_t manh   = scaled >> ManhToEdgePrecisionAdjust<RT>();
-
-    return Edge - manh;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Perform any needed adjustments to evaluated triangle edges:
-/// the conservative edge offset first, then the top-left rule. The work is
-/// done in the constructor, so it is invoked by constructing a temporary.
-/// @tparam RT: rasterizer traits; must have x.16 edge precision and
-/// conservative rasterization enabled (both checked by static_assert)
-/// @tparam EdgeOffsetT: conservative edge offset forwarded to
-/// adjustEdgeConservative
-template <typename RT, typename EdgeOffsetT>
-struct adjustEdgesFix16
-{
-    INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
-    {
-        static_assert(
-            std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
-            "Edge equation expected to be in x.16 fixed point");
-
-        static_assert(RT::IsConservativeT::value,
-                      "Edge offset assumes conservative rasterization is enabled");
-
-        // need to apply any edge offsets before applying the top-left rule
-        adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
-
-        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Perform top left adjustments to evaluated triangle edges.
-/// Specialization for a zero edge offset: only the top-left rule is applied.
-template <typename RT>
-struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
-{
-    INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
-    {
-        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
-    }
-};
-
-// Returns max(abs(dz/dx), abs(dz/dy)) for the triangle, computed from the
-// I/J and Z coefficients and recipDet of pDesc.
-INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
-{
-    // Reference (unoptimized) derivation, kept for documentation only:
-    /*
-    // evaluate i,j at (0,0)
-    float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
-    float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
-
-    // evaluate i,j at (1,0)
-    float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
-    float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
-
-    // compute dz/dx
-    float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
-    float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
-    float dzdx = abs(d10 - d00);
-
-    // evaluate i,j at (0,1)
-    float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
-    float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
-
-    float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
-    float dzdy = abs(d01 - d00);
-    */
-
-    // optimized version of above
-    float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
-    float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
-
-    return std::max(dzdx, dzdy);
-}
-
-/// @brief Computes the factor that scales the constant depth bias for the
-/// current depth format.
-/// @param pState - rasterizer state; only depthFormat is read
-/// @param pDesc - triangle descriptor (not referenced in this function)
-/// @param z - the three vertex depth values; only read for R32_FLOAT
-/// @return 2^-24 for R24_UNORM_X8_TYPELESS, 2^-16 for R16_UNORM, otherwise
-/// 2^(exponent(max(abs(z))) - 23)
-INLINE float
-ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
-{
-    if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
-    {
-        return (1.0f / (1 << 24));
-    }
-    else if (pState->depthFormat == R16_UNORM)
-    {
-        return (1.0f / (1 << 16));
-    }
-    else
-    {
-        SWR_ASSERT(pState->depthFormat == R32_FLOAT);
-
-        // for f32 depth, factor = 2^(exponent(max(abs(z))) - 23)
-        float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
-
-        // Keep only the exponent bits of zMax. The bits are copied with
-        // memcpy because reading a float through a uint32_t pointer violates
-        // strict aliasing.
-        uint32_t zMaxInt;
-        std::memcpy(&zMaxInt, &zMax, sizeof(zMaxInt));
-        zMaxInt &= 0x7f800000;
-        std::memcpy(&zMax, &zMaxInt, sizeof(zMax));
-
-        return zMax * (1.0f / (1 << 23));
-    }
-}
-
-/// @brief Computes the depth bias for a triangle: the constant bias (scaled
-/// by the format-dependent factor unless it is pre-adjusted) plus the
-/// slope-scaled bias, clamped by depthBiasClamp when that is non-zero.
-/// @param pState - rasterizer state holding the bias parameters
-/// @param pTri - triangle descriptor used for the max depth slope
-/// @param z - vertex depth values, forwarded to ComputeBiasFactor
-/// @return the bias, or 0 when both bias terms are disabled
-INLINE float
-ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
-{
-    // nothing to do when both bias terms are zero
-    if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
-    {
-        return 0.0f;
-    }
-
-    // slope-scaled term
-    float slopeTerm = pState->slopeScaledDepthBias;
-    if (slopeTerm != 0.0f)
-    {
-        slopeTerm *= ComputeMaxDepthSlope(pTri);
-    }
-
-    // constant term
-    float constantTerm = pState->depthBias;
-    if (!pState->depthBiasPreAdjusted)
-    {
-        constantTerm *= ComputeBiasFactor(pState, pTri, z);
-    }
-
-    float result = constantTerm + slopeTerm;
-
-    // a positive clamp is an upper bound, a negative clamp a lower bound;
-    // zero disables clamping
-    const float clampValue = pState->depthBiasClamp;
-    if (clampValue > 0.0f)
-    {
-        result = std::min(result, clampValue);
-    }
-    else if (clampValue < 0.0f)
-    {
-        result = std::max(result, clampValue);
-    }
-
-    return result;
-}
-
-// Prevent DCE by writing coverage mask from rasterizer to volatile
-// NOTE(review): this defines a non-static thread-local variable in a header
-// and spells the storage class as __declspec(thread) rather than the THREAD
-// macro used below - confirm this is intended when toss points are enabled.
-#if KNOB_ENABLE_TOSS_POINTS
-__declspec(thread) volatile uint64_t gToss;
-#endif
-
-static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
-// try to avoid _chkstk insertions; make this thread local
-// Scratch buffer sized for 3 vertices * SWR_VTX_NUM_SLOTS attributes * 4 components.
-static THREAD
-OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
-
-/// @brief Fills an EDGE from its integer a/b coefficients: stores a and b and
-/// precomputes the steps between quads and raster tiles plus the per-sample
-/// quad offsets and per-corner raster tile offsets.
-/// @param a - A coefficient of the edge equation
-/// @param b - B coefficient of the edge equation
-/// @param edge - output; every field is written
-INLINE
-void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
-{
-    edge.a = a;
-    edge.b = b;
-
-    // compute constant steps to adjacent quads
-    // (a quad is 2 pixels wide/high; products are formed in 64-bit integers)
-    edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
-    edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
-
-    // compute constant steps to adjacent raster tiles
-    edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
-    edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
-
-    // compute quad offsets
-    // _mm256_set_pd lists lanes high to low, so lanes 0..3 hold the pixel
-    // offsets (0,0), (1,0), (0,1), (1,1) scaled by FIXED_POINT_SCALE
-    const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
-    const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
-
-    __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
-    __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
-    edge.vQuadOffsets       = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
-
-    // compute raster tile offsets
-    // same lane layout as above, with the far corners at (TILE_DIM - 1)
-    const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd(
-        (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0);
-    const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd(
-        (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, 0, 0);
-
-    __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
-    __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
-    edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Builds the edge record for the edge running from p0 to p1, using
-/// A = p0.y - p1.y and B = p1.x - p0.x.
-/// @param p0 - first end point of the edge
-/// @param p1 - second end point of the edge
-/// @param edge - edge record to fill in
-INLINE
-void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
-{
-    const int32_t coefA = p0.y - p1.y;
-    const int32_t coefB = p1.x - p0.x;
-    ComputeEdgeData(coefA, coefB, edge);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary template definition used for specializing the
-/// UpdateEdgeMasks function. Adds the tile sample bounding box offsets to
-/// the evaluated edges and extracts the per-lane sign bits of each sum.
-/// @tparam NumSamplesT: multisample count
-/// @param vEdgeTileBbox - per-edge offsets to the tile multisample bounding box
-/// @param vEdgeFix16 - evaluated edge values; entries 0..2 are read
-/// @param mask0 - receives the sign-bit mask for edge 0
-/// @param mask1 - receives the sign-bit mask for edge 1
-/// @param mask2 - receives the sign-bit mask for edge 2
-template <typename NumSamplesT>
-INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3],
-                            const __m256d* vEdgeFix16,
-                            int32_t&       mask0,
-                            int32_t&       mask1,
-                            int32_t&       mask2)
-{
-    // evaluate each edge equation at the tile multisample bounding box and
-    // collect the sign bit of every lane
-    mask0 = _mm256_movemask_pd(_mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]));
-    mask1 = _mm256_movemask_pd(_mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]));
-    mask2 = _mm256_movemask_pd(_mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
-/// when only rasterizing a single coverage test point. The bounding box
-/// argument is ignored; the masks are the sign bits of the evaluated edges.
-template <>
-INLINE void UpdateEdgeMasks<SingleSampleT>(
-    const __m256d (&)[3], const __m256d* vEdgeFix16, int32_t& mask0, int32_t& mask1, int32_t& mask2)
-{
-    const __m256d& vEdge0 = vEdgeFix16[0];
-    const __m256d& vEdge1 = vEdgeFix16[1];
-    const __m256d& vEdge2 = vEdgeFix16[2];
-
-    mask0 = _mm256_movemask_pd(vEdge0);
-    mask1 = _mm256_movemask_pd(vEdge1);
-    mask2 = _mm256_movemask_pd(vEdge2);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct ComputeScissorEdges
-/// @brief Primary template definition. Allows the constructor to be called
-/// generically. Its body is empty, so instantiations that do not match one
-/// of the partial specializations below (scissor edges not rasterized)
-/// compute nothing and leave all arguments untouched.
-/// @tparam RasterScissorEdgesT: is scissor enabled?
-/// @tparam IsConservativeT: is conservative rast enabled?
-/// @tparam RT: rasterizer traits
-template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
-struct ComputeScissorEdges
-{
-    // Intentionally empty: no scissor edges are produced in the generic case.
-    INLINE ComputeScissorEdges(const SWR_RECT& triBBox,
-                               const SWR_RECT& scissorBBox,
-                               const int32_t   x,
-                               const int32_t   y,
-                               EDGE (&rastEdges)[RT::NumEdgesT::value],
-                               __m256d (&vEdgeFix16)[7]){};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
-/// specialization. Instantiated when conservative rast and scissor are enabled
-template <typename RT>
-struct ComputeScissorEdges<std::true_type, std::true_type, RT>
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
-    /// evaluate edge equations and offset them away from pixel center.
-    /// @param triBBox - bounding box of the triangle
-    /// @param scissorBBox - scissor rectangle
-    /// @param x - x position at which the scissor edges are evaluated
-    /// @param y - y position at which the scissor edges are evaluated
-    /// @param rastEdges - edge records; entries 3..6 are written
-    /// @param vEdgeFix16 - evaluated edge values; entries 3..6 are written
-    // NOTE(review): rastEdges[3..6] are indexed unconditionally, so
-    // RT::NumEdgesT::value is presumably 7 for this specialization - confirm.
-    INLINE ComputeScissorEdges(const SWR_RECT& triBBox,
-                               const SWR_RECT& scissorBBox,
-                               const int32_t   x,
-                               const int32_t   y,
-                               EDGE (&rastEdges)[RT::NumEdgesT::value],
-                               __m256d (&vEdgeFix16)[7])
-    {
-        // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
-        SWR_RECT scissor;
-        scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
-        scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
-        scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
-        scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
-
-        // the four corners of the intersected rectangle
-        POS topLeft{scissor.xmin, scissor.ymin};
-        POS bottomLeft{scissor.xmin, scissor.ymax};
-        POS topRight{scissor.xmax, scissor.ymin};
-        POS bottomRight{scissor.xmax, scissor.ymax};
-
-        // construct 4 scissor edges in ccw direction
-        // (3 = left, 4 = bottom, 5 = right, 6 = top)
-        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
-        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
-        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
-        ComputeEdgeData(topRight, topLeft, rastEdges[6]);
-
-        // evaluate each edge as A * dx + B * dy, with (dx, dy) measured from
-        // the edge's starting corner, and broadcast the value to all lanes
-        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) +
-                                       (rastEdges[3].b * (y - scissor.ymin)));
-        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) +
-                                       (rastEdges[4].b * (y - scissor.ymax)));
-        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) +
-                                       (rastEdges[5].b * (y - scissor.ymax)));
-        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) +
-                                       (rastEdges[6].b * (y - scissor.ymin)));
-
-        // if conservative rasterizing, need to bump the scissor edges out by the conservative
-        // uncertainty distance, else do nothing
-        adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
-        adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
-        adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
-        adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
-
-        // Upper left rule for scissor: only the left (3) and top (6) edges
-        // are lowered by 1.0
-        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
-        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
-/// specialization. Instantiated when scissor is enabled and conservative rast
-/// is disabled.
-template <typename RT>
-struct ComputeScissorEdges<std::true_type, std::false_type, RT>
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Compute scissor edge vectors and evaluate edge equations
-    /// The triangle bounding box argument is unnamed and ignored; the
-    /// scissor rectangle is used as is.
-    /// @param scissorBBox - scissor rectangle
-    /// @param x - x position at which the scissor edges are evaluated
-    /// @param y - y position at which the scissor edges are evaluated
-    /// @param rastEdges - edge records; entries 3..6 are written
-    /// @param vEdgeFix16 - evaluated edge values; entries 3..6 are written
-    // NOTE(review): rastEdges[3..6] are indexed unconditionally, so
-    // RT::NumEdgesT::value is presumably 7 for this specialization - confirm.
-    INLINE ComputeScissorEdges(const SWR_RECT&,
-                               const SWR_RECT& scissorBBox,
-                               const int32_t   x,
-                               const int32_t   y,
-                               EDGE (&rastEdges)[RT::NumEdgesT::value],
-                               __m256d (&vEdgeFix16)[7])
-    {
-        // the four corners of the scissor rectangle
-        const SWR_RECT& scissor = scissorBBox;
-        POS             topLeft{scissor.xmin, scissor.ymin};
-        POS             bottomLeft{scissor.xmin, scissor.ymax};
-        POS             topRight{scissor.xmax, scissor.ymin};
-        POS             bottomRight{scissor.xmax, scissor.ymax};
-
-        // construct 4 scissor edges in ccw direction
-        // (3 = left, 4 = bottom, 5 = right, 6 = top)
-        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
-        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
-        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
-        ComputeEdgeData(topRight, topLeft, rastEdges[6]);
-
-        // evaluate each edge as A * dx + B * dy, with (dx, dy) measured from
-        // the edge's starting corner, and broadcast the value to all lanes
-        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) +
-                                       (rastEdges[3].b * (y - scissor.ymin)));
-        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) +
-                                       (rastEdges[4].b * (y - scissor.ymax)));
-        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) +
-                                       (rastEdges[5].b * (y - scissor.ymax)));
-        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) +
-                                       (rastEdges[6].b * (y - scissor.ymin)));
-
-        // Upper left rule for scissor: only the left (3) and top (6) edges
-        // are lowered by 1.0
-        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
-        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for TrivialRejectTest. Should
-/// never be called, but TemplateUnroller instantiates a few unused values,
-/// so it raises a runtime SWR_INVALID instead of a static_assert.
-/// @tparam ValidEdgeMaskT: tag selecting which triangle edges are valid
-/// @return always false (no reject) after reporting the invalid call
-template <typename ValidEdgeMaskT>
-INLINE bool TrivialRejectTest(const int, const int, const int)
-{
-    SWR_INVALID("Primary templated function should never be called");
-    return false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
-/// and edge 1 for trivial coverage reject
-/// @return true when the mask of edge 0 or of edge 1 is zero
-template <>
-INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
-{
-    const bool bothEdgesHit = mask0 && mask1;
-    return !bothEdgesHit;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
-/// and edge 2 for trivial coverage reject
-/// @return true when the mask of edge 0 or of edge 2 is zero
-template <>
-INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
-{
-    const bool bothEdgesHit = mask0 && mask2;
-    return !bothEdgesHit;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
-/// and edge 2 for trivial coverage reject
-/// @return true when the mask of edge 1 or of edge 2 is zero
-template <>
-INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
-{
-    const bool bothEdgesHit = mask1 && mask2;
-    return !bothEdgesHit;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
-/// primitive edges for trivial coverage reject
-/// @return true when the mask of at least one of the three edges is zero
-template <>
-INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
-{
-    const bool allEdgesHit = mask0 && mask1 && mask2;
-    return !allEdgesHit;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
-/// point, so return false and rasterize against conservative BBox
-/// @return always false; the edge masks are ignored
-template <>
-INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
-{
-    return false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for TrivialAcceptTest. Always returns
-/// false, since it will only be called for degenerate tris, and as such
-/// will never cover the entire raster tile
-/// @tparam ScissorEnableT: are scissor edges being rasterized?
-/// @return always false; the edge masks are ignored
-template <typename ScissorEnableT>
-INLINE bool TrivialAcceptTest(const int, const int, const int)
-{
-    return false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief std::false_type specialization of TrivialAcceptTest. Tests the
-/// three edge masks for a fully covered raster tile
-/// @return true when all of the low 4 bits are set in every mask
-template <>
-INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
-{
-    // a bit survives the AND only if it is set for all three edges
-    const int combinedMask = mask0 & mask1 & mask2;
-    return combinedMask == 0xf;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary template for GenerateSVInnerCoverage. Results
-/// in an empty constructor call if SVInnerCoverage isn't requested
-/// @tparam RT: rasterizer traits
-/// @tparam ValidEdgeMaskT: tag selecting which triangle edges are valid
-/// @tparam InputCoverageT: requested input coverage mode
-template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
-struct GenerateSVInnerCoverage
-{
-    // Intentionally empty: the coverage mask reference is left untouched.
-    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t&){};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Specialization of GenerateSVInnerCoverage where all edges
-/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
-/// edge values from OuterConservative to InnerConservative and rasterizes.
-template <typename RT>
-struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
-{
-    /// @param pDC - draw context, used for the bucket manager and rasterization
-    /// @param workerId - worker thread id (not read here)
-    /// @param pRastEdges - edge records, RT::NumEdgesT::value entries
-    /// @param pStartQuadEdges - edge values evaluated at the starting quad
-    /// @param innerCoverageMask - receives the inner conservative coverage mask
-    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC,
-                                   uint32_t      workerId,
-                                   EDGE*         pRastEdges,
-                                   double*       pStartQuadEdges,
-                                   uint64_t&     innerCoverageMask)
-    {
-        // move every starting edge value to its inner conservative position
-        double innerStartEdges[RT::NumEdgesT::value];
-        for (uint32_t edgeIdx = 0; edgeIdx < RT::NumEdgesT::value; ++edgeIdx)
-        {
-            EDGE& edge = pRastEdges[edgeIdx];
-            innerStartEdges[edgeIdx] =
-                adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(
-                    edge.a, edge.b, pStartQuadEdges[edgeIdx]);
-        }
-
-        // not trivial accept or reject, must rasterize full tile
-        RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizePartial, pDC->drawId);
-        innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
-            pDC, innerStartEdges, pRastEdges);
-        RDTSC_END(pDC->pContext->pBucketMgr, BERasterizePartial, 0);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary template for UpdateEdgeMasksInnerConservative. Results
-/// in an empty constructor call if SVInnerCoverage isn't requested
-/// @tparam RT: rasterizer traits
-/// @tparam ValidEdgeMaskT: tag selecting which triangle edges are valid
-/// @tparam InputCoverageT: requested input coverage mode
-template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
-struct UpdateEdgeMasksInnerConservative
-{
-    // Intentionally empty: the three edge masks are left untouched.
-    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
-                                            const __m256d*,
-                                            const __m128i,
-                                            const __m128i,
-                                            int32_t&,
-                                            int32_t&,
-                                            int32_t&){};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
-/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
-/// evaluated at raster tile corners to inner conservative position and
-/// updates edge masks
-template <typename RT>
-struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
-{
-    /// @param vEdgeTileBbox - per-edge offsets to the tile multisample bounding box
-    /// @param vEdgeFix16 - evaluated edge values; entries 0..2 are read
-    /// @param vAi - integer A coefficients of the triangle edges
-    /// @param vBi - integer B coefficients of the triangle edges
-    /// @param mask0 - receives the updated mask for edge 0
-    /// @param mask1 - receives the updated mask for edge 1
-    /// @param mask2 - receives the updated mask for edge 2
-    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
-                                            const __m256d* vEdgeFix16,
-                                            const __m128i  vAi,
-                                            const __m128i  vBi,
-                                            int32_t&       mask0,
-                                            int32_t&       mask1,
-                                            int32_t&       mask2)
-    {
-        // work on a private copy so the caller's evaluated edges stay intact;
-        // rather than keeping a second set of evaluated edges around, the
-        // adjustment compensates for the outer conservative offset already
-        // applied when moving each edge in for the inner conservative test
-        __m256d vInnerEdge[3];
-        for (uint32_t edgeIdx = 0; edgeIdx < 3; ++edgeIdx)
-        {
-            vInnerEdge[edgeIdx] = vEdgeFix16[edgeIdx];
-            adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
-                vAi, vBi, vInnerEdge[edgeIdx]);
-        }
-
-        UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(
-            vEdgeTileBbox, vInnerEdge, mask0, mask1, mask2);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
-/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
-/// cover an entire raster tile, set mask0 to 0 to force it down the
-/// rasterizePartialTile path. All other arguments are ignored.
-template <typename RT, typename ValidEdgeMaskT>
-struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
-{
-    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3],
-                                            const __m256d*,
-                                            const __m128i,
-                                            const __m128i,
-                                            int32_t& mask0,
-                                            int32_t&,
-                                            int32_t&)
-    {
-        // set one mask to zero to force the triangle down the rasterizePartialTile path
-        mask0 = 0;
-    }
-};
-
-template <typename RT>
-void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
-{
-    const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
-#if KNOB_ENABLE_TOSS_POINTS
-    if (KNOB_TOSS_BIN_TRIS)
-    {
-        return;
-    }
-#endif
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeTriangle, pDC->drawId);
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BETriangleSetup, pDC->drawId);
-
-    const API_STATE&     state        = GetApiState(pDC);
-    const SWR_RASTSTATE& rastState    = state.rastState;
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
-
-    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
-    triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
-
-    __m128 vX, vY, vZ, vRecipW;
-
-    // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
-    // eg: vX = [x0 x1 x2 dc]
-    vX      = _mm_load_ps(workDesc.pTriBuffer);
-    vY      = _mm_load_ps(workDesc.pTriBuffer + 4);
-    vZ      = _mm_load_ps(workDesc.pTriBuffer + 8);
-    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
-
-    // convert to fixed point
-    static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value,
-                  "Rasterizer expects 16.8 fixed point precision");
-    __m128i vXi = fpToFixedPoint(vX);
-    __m128i vYi = fpToFixedPoint(vY);
-
-    // quantize floating point position to fixed point precision
-    // to prevent attribute creep around the triangle vertices
-    vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
-    vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
-
-    // triangle setup - A and B edge equation coefs
-    __m128 vA, vB;
-    triangleSetupAB(vX, vY, vA, vB);
-
-    __m128i vAi, vBi;
-    triangleSetupABInt(vXi, vYi, vAi, vBi);
-
-    // determinant
-    float det = calcDeterminantInt(vAi, vBi);
-
-    // Verts in Pixel Coordinate Space at this point
-    // Det > 0 = CW winding order
-    // Convert CW triangles to CCW
-    if (det > 0.0)
-    {
-        vA  = _mm_mul_ps(vA, _mm_set1_ps(-1));
-        vB  = _mm_mul_ps(vB, _mm_set1_ps(-1));
-        vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
-        vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
-        det = -det;
-    }
-
-    __m128 vC;
-    // Finish triangle setup - C edge coef
-    triangleSetupC(vX, vY, vA, vB, vC);
-
-    if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
-    {
-        // If we have degenerate edge(s) to rasterize, set I and J coefs
-        // to 0 for constant interpolation of attributes
-        triDesc.I[0] = 0.0f;
-        triDesc.I[1] = 0.0f;
-        triDesc.I[2] = 0.0f;
-        triDesc.J[0] = 0.0f;
-        triDesc.J[1] = 0.0f;
-        triDesc.J[2] = 0.0f;
-
-        // Degenerate triangles have no area
-        triDesc.recipDet = 0.0f;
-    }
-    else
-    {
-        // only extract coefs for 2 of the barycentrics; the 3rd can be
-        // determined from the barycentric equation:
-        // i + j + k = 1 <=> k = 1 - j - i
-        _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
-        _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
-        _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
-        _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
-        _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
-        _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
-
-        // compute recipDet, used to calculate barycentric i and j in the backend
-        triDesc.recipDet = 1.0f / det;
-    }
-
-    OSALIGNSIMD(float) oneOverW[4];
-    _mm_store_ps(oneOverW, vRecipW);
-    triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
-    triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
-    triDesc.OneOverW[2] = oneOverW[2];
-
-    // calculate perspective correct coefs per vertex attrib
-    float* pPerspAttribs  = perspAttribsTLS;
-    float* pAttribs       = workDesc.pAttribs;
-    triDesc.pPerspAttribs = pPerspAttribs;
-    triDesc.pAttribs      = pAttribs;
-    float* pRecipW        = workDesc.pTriBuffer + 12;
-    triDesc.pRecipW       = pRecipW;
-    __m128 vOneOverWV0    = _mm_broadcast_ss(pRecipW);
-    __m128 vOneOverWV1    = _mm_broadcast_ss(pRecipW += 1);
-    __m128 vOneOverWV2    = _mm_broadcast_ss(pRecipW += 1);
-    for (uint32_t i = 0; i < workDesc.numAttribs; i++)
-    {
-        __m128 attribA = _mm_load_ps(pAttribs);
-        __m128 attribB = _mm_load_ps(pAttribs += 4);
-        __m128 attribC = _mm_load_ps(pAttribs += 4);
-        pAttribs += 4;
-
-        attribA = _mm_mul_ps(attribA, vOneOverWV0);
-        attribB = _mm_mul_ps(attribB, vOneOverWV1);
-        attribC = _mm_mul_ps(attribC, vOneOverWV2);
-
-        _mm_store_ps(pPerspAttribs, attribA);
-        _mm_store_ps(pPerspAttribs += 4, attribB);
-        _mm_store_ps(pPerspAttribs += 4, attribC);
-        pPerspAttribs += 4;
-    }
-
-    // compute bary Z
-    // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
-    OSALIGNSIMD(float) a[4];
-    _mm_store_ps(a, vZ);
-    triDesc.Z[0] = a[0] - a[2];
-    triDesc.Z[1] = a[1] - a[2];
-    triDesc.Z[2] = a[2];
-
-    // add depth bias
-    triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
-
-    // Calc bounding box of triangle
-    OSALIGNSIMD(SWR_RECT) bbox;
-    calcBoundingBoxInt(vXi, vYi, bbox);
-
-    const SWR_RECT& scissorInFixedPoint =
-        state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
-
-    if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
-    {
-        // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is
-        // valid
-        bbox.xmin--;
-        bbox.xmax++;
-        bbox.ymin--;
-        bbox.ymax++;
-        SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
-                   "Conservative rast degenerate handling requires a valid scissor rect");
-    }
-
-    // Intersect with scissor/viewport
-    OSALIGNSIMD(SWR_RECT) intersect;
-    intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
-    intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
-    intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
-    intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
-
-    triDesc.triFlags = workDesc.triFlags;
-
-    // further constrain backend to intersecting bounding box of macro tile and scissored triangle
-    // bbox
-    uint32_t macroX, macroY;
-    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
-    int32_t macroBoxLeft   = macroX * KNOB_MACROTILE_X_DIM_FIXED;
-    int32_t macroBoxRight  = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
-    int32_t macroBoxTop    = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
-    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
-
-    intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
-    intersect.ymin = std::max(intersect.ymin, macroBoxTop);
-    intersect.xmax = std::min(intersect.xmax, macroBoxRight);
-    intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
-
-    SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax &&
-               intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 &&
-               intersect.ymax >= 0);
-
-    RDTSC_END(pDC->pContext->pBucketMgr, BETriangleSetup, 0);
-
-    // update triangle desc
-    uint32_t minTileX  = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t minTileY  = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t maxTileX  = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t maxTileY  = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t numTilesX = maxTileX - minTileX + 1;
-    uint32_t numTilesY = maxTileY - minTileY + 1;
-
-    if (numTilesX == 0 || numTilesY == 0)
-    {
-        RDTSC_EVENT(pDC->pContext->pBucketMgr, BEEmptyTriangle, 1, 0);
-        RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeTriangle, 1);
-        return;
-    }
-
-    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEStepSetup, pDC->drawId);
-
-    // Step to pixel center of top-left pixel of the triangle bbox
-    // Align intersect bbox (top/left) to raster tile's (top/left).
-    int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
-    int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
-
-    // convenience typedef
-    typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
-
-    // single sample rasterization evaluates edges at pixel center,
-    // multisample evaluates edges UL pixel corner and steps to each sample position
-    if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
-    {
-        // Add 0.5, in fixed point, to offset to pixel center
-        x += (FIXED_POINT_SCALE / 2);
-        y += (FIXED_POINT_SCALE / 2);
-    }
-
-    __m128i vTopLeftX = _mm_set1_epi32(x);
-    __m128i vTopLeftY = _mm_set1_epi32(y);
-
-    // evaluate edge equations at top-left pixel using 64bit math
-    //
-    // line = Ax + By + C
-    // solving for C:
-    // C = -Ax - By
-    // we know x0 and y0 are on the line; plug them in:
-    // C = -Ax0 - By0
-    // plug C back into line equation:
-    // line = Ax + By - Ax0 - By0
-    // line = A(x - x0) + B(y - y0)
-    // dX = (x-x0), dY = (y-y0)
-    // so all this simplifies to
-    // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
-
-    __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
-    __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
-
-    // evaluate A(dx) and B(dY) for all points
-    __m256d vAipd     = _mm256_cvtepi32_pd(vAi);
-    __m256d vBipd     = _mm256_cvtepi32_pd(vBi);
-    __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
-    __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
-
-    __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
-    __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
-    __m256d vEdge          = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
-
-    // apply any edge adjustments(top-left, crast, etc)
-    adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
-
-    // broadcast respective edge results to all lanes
-    double* pEdge = (double*)&vEdge;
-    __m256d vEdgeFix16[7];
-    vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
-    vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
-    vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
-
-    OSALIGNSIMD(int32_t) aAi[4], aBi[4];
-    _mm_store_si128((__m128i*)aAi, vAi);
-    _mm_store_si128((__m128i*)aBi, vBi);
-    EDGE rastEdges[RT::NumEdgesT::value];
-
-    // Compute and store triangle edge data
-    ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
-    ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
-    ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
-
-    // Compute and store scissor edge data if the scissor needs to be rasterized
-    ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>(
-        bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
-
-    // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
-    // used for testing if entire raster tile is inside a triangle
-    for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-    {
-        vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
-    }
-
-    // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
-    // step sample positions to the raster tile bbox of multisample points
-    // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
-    //                             |      |
-    //                             |      |
-    // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
-    __m256d vEdgeTileBbox[3];
-    if (NumCoverageSamplesT::value > 1)
-    {
-        const SWR_MULTISAMPLE_POS& samplePos         = rastState.samplePositions;
-        const __m128i              vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
-        const __m128i              vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
-
-        __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
-        __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
-
-        // step edge equation tests from Tile
-        // used for testing if entire raster tile is inside a triangle
-        for (uint32_t e = 0; e < 3; ++e)
-        {
-            __m256d vResultAxFix16 =
-                _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
-            __m256d vResultByFix16 =
-                _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
-            vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-
-            // adjust for msaa tile bbox edges outward for conservative rast, if enabled
-            adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(
-                vAi, vBi, vEdgeTileBbox[e]);
-        }
-    }
-
-    RDTSC_END(pDC->pContext->pBucketMgr, BEStepSetup, 0);
-
-    uint32_t tY   = minTileY;
-    uint32_t tX   = minTileX;
-    uint32_t maxY = maxTileY;
-    uint32_t maxX = maxTileX;
-
-    RenderOutputBuffers renderBuffers, currentRenderBufferRow;
-    GetRenderHotTiles<RT::MT::numSamples>(pDC,
-                                          workerId,
-                                          macroTile,
-                                          minTileX,
-                                          minTileY,
-                                          renderBuffers,
-                                          triDesc.triFlags.renderTargetArrayIndex);
-    currentRenderBufferRow = renderBuffers;
-
-    // rasterize and generate coverage masks per sample
-    for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
-    {
-        __m256d vStartOfRowEdge[RT::NumEdgesT::value];
-        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-        {
-            vStartOfRowEdge[e] = vEdgeFix16[e];
-        }
-
-        for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
-        {
-            triDesc.anyCoveredSamples = 0;
-
-            // is the corner of the edge outside of the raster tile? (vEdge < 0)
-            int mask0, mask1, mask2;
-            UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
-
-            for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
-            {
-                // trivial reject, at least one edge has all 4 corners of raster tile outside
-                bool trivialReject =
-                    TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
-
-                if (!trivialReject)
-                {
-                    // trivial accept mask
-                    triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
-
-                    // Update the raster tile edge masks based on inner conservative edge offsets,
-                    // if enabled
-                    UpdateEdgeMasksInnerConservative<RT,
-                                                     typename RT::ValidEdgeMaskT,
-                                                     typename RT::InputCoverageT>(
-                        vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
-
-                    // @todo Make this a bit smarter to allow use of trivial accept when:
-                    //   1) scissor/vp intersection rect is raster tile aligned
-                    //   2) raster tile is entirely within scissor/vp intersection rect
-                    if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
-                    {
-                        // trivial accept, all 4 corners of all 3 edges are negative
-                        // i.e. raster tile completely inside triangle
-                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
-                        if (std::is_same<typename RT::InputCoverageT,
-                                         InnerConservativeCoverageT>::value)
-                        {
-                            triDesc.innerCoverageMask = 0xffffffffffffffffULL;
-                        }
-                        RDTSC_EVENT(pDC->pContext->pBucketMgr, BETrivialAccept, 1, 0);
-                    }
-                    else
-                    {
-                        __m256d vEdgeAtSample[RT::NumEdgesT::value];
-                        if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
-                        {
-                            // should get optimized out for single sample case (global value
-                            // numbering or copy propagation)
-                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-                            {
-                                vEdgeAtSample[e] = vEdgeFix16[e];
-                            }
-                        }
-                        else
-                        {
-                            const SWR_MULTISAMPLE_POS& samplePos       = rastState.samplePositions;
-                            __m128i                    vSampleOffsetXh = samplePos.vXi(sampleNum);
-                            __m128i                    vSampleOffsetYh = samplePos.vYi(sampleNum);
-                            __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
-                            __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
-
-                            // step edge equation tests from UL tile corner to pixel sample position
-                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-                            {
-                                __m256d vResultAxFix16 =
-                                    _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
-                                __m256d vResultByFix16 =
-                                    _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
-                                vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-                                vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
-                            }
-                        }
-
-                        double        startQuadEdges[RT::NumEdgesT::value];
-                        const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
-                        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-                        {
-                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
-                        }
-
-                        // not trivial accept or reject, must rasterize full tile
-                        RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizePartial, pDC->drawId);
-                        triDesc.coverageMask[sampleNum] =
-                            rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
-                                pDC, startQuadEdges, rastEdges);
-                        RDTSC_END(pDC->pContext->pBucketMgr, BERasterizePartial, 0);
-
-                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
-
-                        // Output SV InnerCoverage, if needed
-                        GenerateSVInnerCoverage<RT,
-                                                typename RT::ValidEdgeMaskT,
-                                                typename RT::InputCoverageT>(
-                            pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
-                    }
-                }
-                else
-                {
-                    // if we're calculating coverage per sample, need to store it off. otherwise no
-                    // covered samples, don't need to do anything
-                    if (NumCoverageSamplesT::value > 1)
-                    {
-                        triDesc.coverageMask[sampleNum] = 0;
-                    }
-                    RDTSC_EVENT(pDC->pContext->pBucketMgr, BETrivialReject, 1, 0);
-                }
-            }
-
-#if KNOB_ENABLE_TOSS_POINTS
-            if (KNOB_TOSS_RS)
-            {
-                gToss = triDesc.coverageMask[0];
-            }
-            else
-#endif
-                if (triDesc.anyCoveredSamples)
-            {
-                // if conservative rast and MSAA are enabled, conservative coverage for a pixel
-                // means all samples in that pixel are covered copy conservative coverage result to
-                // all samples
-                if (RT::IsConservativeT::value)
-                {
-                    auto copyCoverage = [&](int sample) {
-                        triDesc.coverageMask[sample] = triDesc.coverageMask[0];
-                    };
-                    UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
-                }
-
-                // Track rasterized subspans
-                AR_EVENT(RasterTileCount(pDC->drawId, 1));
-
-                RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelBackend, pDC->drawId);
-                backendFuncs.pfnBackend(pDC,
-                                        workerId,
-                                        tileX << KNOB_TILE_X_DIM_SHIFT,
-                                        tileY << KNOB_TILE_Y_DIM_SHIFT,
-                                        triDesc,
-                                        renderBuffers);
-                RDTSC_END(pDC->pContext->pBucketMgr, BEPixelBackend, 0);
-            }
-
-            // step to the next tile in X
-            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-            {
-                vEdgeFix16[e] =
-                    _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
-            }
-            StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
-        }
-
-        // step to the next tile in Y
-        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-        {
-            vEdgeFix16[e] =
-                _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
-        }
-        StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
-    }
-
-    RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeTriangle, 1);
-}
-
-// Get pointers to hot tile memory for color RT, depth, stencil
-//
-// For every enabled attachment (each bit set in state.colorHottileEnable, plus depth and
-// stencil when their enables are set) this fetches the HOTTILE from the hot tile manager and
-// stores in renderBuffers both the HOTTILE pointer and a pointer into its pBuffer at the byte
-// offset computed for raster tile (tileX, tileY).
-//
-// tileX / tileY are by-value copies; they are rebased below by subtracting the macrotile
-// indices returned by MacroTileMgr::getTileIndices scaled by the macrotile size in tiles.
-// numSamples multiplies every computed byte offset and is forwarded to GetHotTile.
-template <uint32_t numSamples>
-void GetRenderHotTiles(DRAW_CONTEXT*        pDC,
-                       uint32_t             workerId,
-                       uint32_t             macroID,
-                       uint32_t             tileX,
-                       uint32_t             tileY,
-                       RenderOutputBuffers& renderBuffers,
-                       uint32_t             renderTargetArrayIndex)
-{
-    const API_STATE& state    = GetApiState(pDC);
-    SWR_CONTEXT*     pContext = pDC->pContext;
-    HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
-
-    // Convert the incoming tile coordinates into coordinates within macrotile macroID.
-    uint32_t mx, my;
-    MacroTileMgr::getTileIndices(macroID, mx, my);
-    tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
-    tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
-
-    // compute tile offset for active hottile buffers
-    // (one offset is shared by all color slots: they all use KNOB_COLOR_HOT_TILE_FORMAT)
-    const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
-    uint32_t       offset = ComputeTileOffset2D<
-        TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp>>(
-        pitch, tileX, tileY);
-    offset *= numSamples;
-
-    // Walk the set bits of the color enable mask, lowest first; each processed bit is cleared
-    // from the local copy so the scan advances and the loop ends when the mask is empty.
-    // NOTE(review): unlike the depth and stencil paths below, this path neither sets
-    // HOTTILE_DIRTY nor asserts pBuffer != nullptr - presumably the consumer of pColorHotTile
-    // handles that; confirm.
-    unsigned long rtSlot                 = 0;
-    uint32_t      colorHottileEnableMask = state.colorHottileEnable;
-    while (_BitScanForward(&rtSlot, colorHottileEnableMask))
-    {
-        HOTTILE* pColor = pContext->pHotTileMgr->GetHotTile(
-            pContext,
-            pDC,
-            hWorkerPrivateData,
-            macroID,
-            (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
-            true,
-            numSamples,
-            renderTargetArrayIndex);
-        renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
-        renderBuffers.pColorHotTile[rtSlot] = pColor;
-
-        colorHottileEnableMask &= ~(1 << rtSlot);
-    }
-    if (state.depthHottileEnable)
-    {
-        // Depth has its own format, so pitch/offset are recomputed here; these locals
-        // intentionally shadow the color pitch/offset declared above.
-        const uint32_t pitch =
-            KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
-        uint32_t offset = ComputeTileOffset2D<
-            TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp>>(
-            pitch, tileX, tileY);
-        offset *= numSamples;
-        HOTTILE* pDepth = pContext->pHotTileMgr->GetHotTile(pContext,
-                                                            pDC,
-                                                            hWorkerPrivateData,
-                                                            macroID,
-                                                            SWR_ATTACHMENT_DEPTH,
-                                                            true,
-                                                            numSamples,
-                                                            renderTargetArrayIndex);
-        // The depth tile is marked dirty as soon as it is handed out for rendering.
-        pDepth->state   = HOTTILE_DIRTY;
-        SWR_ASSERT(pDepth->pBuffer != nullptr);
-        renderBuffers.pDepth = pDepth->pBuffer + offset;
-        renderBuffers.pDepthHotTile = pDepth;
-    }
-    if (state.stencilHottileEnable)
-    {
-        // Same pattern as depth, using the stencil hot tile format (locals shadow again).
-        const uint32_t pitch =
-            KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
-        uint32_t offset = ComputeTileOffset2D<
-            TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp>>(
-            pitch, tileX, tileY);
-        offset *= numSamples;
-        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext,
-                                                              pDC,
-                                                              hWorkerPrivateData,
-                                                              macroID,
-                                                              SWR_ATTACHMENT_STENCIL,
-                                                              true,
-                                                              numSamples,
-                                                              renderTargetArrayIndex);
-        pStencil->state   = HOTTILE_DIRTY;
-        SWR_ASSERT(pStencil->pBuffer != nullptr);
-        renderBuffers.pStencil = pStencil->pBuffer + offset;
-        renderBuffers.pStencilHotTile = pStencil;
-    }
-}
-
-// Advance every render output pointer in `buffers` by one raster tile along X.
-// Depth and stencil are always stepped; a color slot is stepped only when its bit is set in
-// colorHotTileMask (a by-value copy that is consumed bit by bit here).
-template <typename RT>
-INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers)
-{
-    buffers.pDepth += RT::depthRasterTileStep;
-    buffers.pStencil += RT::stencilRasterTileStep;
-
-    // Visit the set bits lowest-first, clearing each one after its slot has been stepped.
-    for (unsigned long slot = 0; _BitScanForward(&slot, colorHotTileMask);
-         colorHotTileMask &= ~(1 << slot))
-    {
-        buffers.pColor[slot] += RT::colorRasterTileStep;
-    }
-}
-
-// Move to the first raster tile of the next row: each row-start pointer in startBufferRow is
-// advanced by the per-row step and the working pointer in `buffers` is reset to it.
-// Depth and stencil are always stepped; color slots only when enabled in colorHotTileMask.
-template <typename RT>
-INLINE void StepRasterTileY(uint32_t             colorHotTileMask,
-                            RenderOutputBuffers& buffers,
-                            RenderOutputBuffers& startBufferRow)
-{
-    // Visit the set bits lowest-first, clearing each one after its slot has been stepped.
-    for (unsigned long slot = 0; _BitScanForward(&slot, colorHotTileMask);
-         colorHotTileMask &= ~(1 << slot))
-    {
-        auto& colorRowStart = startBufferRow.pColor[slot];
-        colorRowStart += RT::colorRasterTileRowStep;
-        buffers.pColor[slot] = colorRowStart;
-    }
-
-    auto& depthRowStart = startBufferRow.pDepth;
-    depthRowStart += RT::depthRasterTileRowStep;
-    buffers.pDepth = depthRowStart;
-
-    auto& stencilRowStart = startBufferRow.pStencil;
-    stencilRowStart += RT::stencilRasterTileRowStep;
-    buffers.pStencil = stencilRowStart;
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
deleted file mode 100644
index 6329b2ec98e..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-
-#include "rdtsc_core.h"
-#include "common/rdtsc_buckets.h"
-
-// must match CORE_BUCKETS enum order
-//
-// One BUCKET_DESC per CORE_BUCKETS enumerator, at the same position. Each initializer
-// supplies, in order: the bucket name, an empty string, a boolean flag and a 32-bit hex value
-// (presumably description, a visualization enable and a display color - confirm against the
-// BUCKET_DESC declaration in common/rdtsc_buckets.h).
-// The static_assert after the table only checks the entry COUNT against NumBuckets; it cannot
-// detect entries that are present but in the wrong order.
-BUCKET_DESC gCoreBuckets[] = {
-    {"APIClearRenderTarget", "", true, 0xff0b8bea},
-    {"APIDraw", "", true, 0xff000066},
-    {"APIDrawWakeAllThreads", "", false, 0xffffffff},
-    {"APIDrawIndexed", "", true, 0xff000066},
-    {"APIDispatch", "", true, 0xff660000},
-    {"APIStoreTiles", "", true, 0xff00ffff},
-    {"APIGetDrawContext", "", false, 0xffffffff},
-    {"APISync", "", true, 0xff6666ff},
-    {"APIWaitForIdle", "", true, 0xff0000ff},
-    {"FEProcessDraw", "", true, 0xff009900},
-    {"FEProcessDrawIndexed", "", true, 0xff009900},
-    {"FEFetchShader", "", false, 0xffffffff},
-    {"FEVertexShader", "", false, 0xffffffff},
-    {"FEHullShader", "", false, 0xffffffff},
-    {"FETessellation", "", false, 0xffffffff},
-    {"FEDomainShader", "", false, 0xffffffff},
-    {"FEGeometryShader", "", false, 0xffffffff},
-    {"FEStreamout", "", false, 0xffffffff},
-    {"FEPAAssemble", "", false, 0xffffffff},
-    {"FEBinPoints", "", false, 0xff29b854},
-    {"FEBinLines", "", false, 0xff29b854},
-    {"FEBinTriangles", "", false, 0xff29b854},
-    {"FETriangleSetup", "", false, 0xffffffff},
-    {"FEViewportCull", "", false, 0xffffffff},
-    {"FEGuardbandClip", "", false, 0xffffffff},
-    {"FEClipPoints", "", false, 0xffffffff},
-    {"FEClipLines", "", false, 0xffffffff},
-    {"FEClipTriangles", "", false, 0xffffffff},
-    {"FEClipRectangles", "", false, 0xffffffff},
-    {"FECullZeroAreaAndBackface", "", false, 0xffffffff},
-    {"FECullBetweenCenters", "", false, 0xffffffff},
-    {"FEEarlyRastEnter", "", false, 0xffffffff},
-    {"FEEarlyRastExit", "", false, 0xffffffff},
-    {"FEProcessStoreTiles", "", true, 0xff39c864},
-    {"FEProcessInvalidateTiles", "", true, 0xffffffff},
-    {"WorkerWorkOnFifoBE", "", false, 0xff40261c},
-    {"WorkerFoundWork", "", false, 0xff573326},
-    {"BELoadTiles", "", true, 0xffb0e2ff},
-    {"BEDispatch", "", true, 0xff00a2ff},
-    {"BEClear", "", true, 0xff00ccbb},
-    {"BERasterizeLine", "", true, 0xffb26a4e},
-    {"BERasterizeTriangle", "", true, 0xffb26a4e},
-    {"BETriangleSetup", "", false, 0xffffffff},
-    {"BEStepSetup", "", false, 0xffffffff},
-    {"BECullZeroArea", "", false, 0xffffffff},
-    {"BEEmptyTriangle", "", false, 0xffffffff},
-    {"BETrivialAccept", "", false, 0xffffffff},
-    {"BETrivialReject", "", false, 0xffffffff},
-    {"BERasterizePartial", "", false, 0xffffffff},
-    {"BEPixelBackend", "", false, 0xffffffff},
-    {"BESetup", "", false, 0xffffffff},
-    {"BEBarycentric", "", false, 0xffffffff},
-    {"BEEarlyDepthTest", "", false, 0xffffffff},
-    {"BEPixelShader", "", false, 0xffffffff},
-    {"BESingleSampleBackend", "", false, 0xffffffff},
-    {"BEPixelRateBackend", "", false, 0xffffffff},
-    {"BESampleRateBackend", "", false, 0xffffffff},
-    {"BENullBackend", "", false, 0xffffffff},
-    {"BELateDepthTest", "", false, 0xffffffff},
-    {"BEOutputMerger", "", false, 0xffffffff},
-    {"BEStoreTiles", "", true, 0xff00cccc},
-    {"BEEndTile", "", false, 0xffffffff},
-};
-static_assert(NumBuckets == (sizeof(gCoreBuckets) / sizeof(gCoreBuckets[0])),
-              "RDTSC Bucket enum and description table size mismatched.");
-
diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h
deleted file mode 100644
index 0228275bd47..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-
-#pragma once
-#include "knobs.h"
-
-#include "common/os.h"
-#include "common/rdtsc_buckets.h"
-
-#include <vector>
-
-///////////////////////////////////////////////////////////////////////////////
-// NOTE:  This enum MUST be kept in sync with gCoreBuckets in rdtsc_core.cpp
-//
-// The enumerators rely on default numbering (0, 1, 2, ...) because rdtscInit
-// uses them directly as indices into gCoreBuckets and into the bucket map.
-// When adding a bucket, add the matching gCoreBuckets row at the same position
-// and keep NumBuckets as the final enumerator: it is the entry count, not a
-// bucket.
-///////////////////////////////////////////////////////////////////////////////
-enum CORE_BUCKETS
-{
-    APIClearRenderTarget,
-    APIDraw,
-    APIDrawWakeAllThreads,
-    APIDrawIndexed,
-    APIDispatch,
-    APIStoreTiles,
-    APIGetDrawContext,
-    APISync,
-    APIWaitForIdle,
-    FEProcessDraw,
-    FEProcessDrawIndexed,
-    FEFetchShader,
-    FEVertexShader,
-    FEHullShader,
-    FETessellation,
-    FEDomainShader,
-    FEGeometryShader,
-    FEStreamout,
-    FEPAAssemble,
-    FEBinPoints,
-    FEBinLines,
-    FEBinTriangles,
-    FETriangleSetup,
-    FEViewportCull,
-    FEGuardbandClip,
-    FEClipPoints,
-    FEClipLines,
-    FEClipTriangles,
-    FEClipRectangles,
-    FECullZeroAreaAndBackface,
-    FECullBetweenCenters,
-    FEEarlyRastEnter,
-    FEEarlyRastExit,
-    FEProcessStoreTiles,
-    FEProcessInvalidateTiles,
-    WorkerWorkOnFifoBE,
-    WorkerFoundWork,
-    BELoadTiles,
-    BEDispatch,
-    BEClear,
-    BERasterizeLine,
-    BERasterizeTriangle,
-    BETriangleSetup,
-    BEStepSetup,
-    BECullZeroArea,
-    BEEmptyTriangle,
-    BETrivialAccept,
-    BETrivialReject,
-    BERasterizePartial,
-    BEPixelBackend,
-    BESetup,
-    BEBarycentric,
-    BEEarlyDepthTest,
-    BEPixelShader,
-    BESingleSampleBackend,
-    BEPixelRateBackend,
-    BESampleRateBackend,
-    BENullBackend,
-    BELateDepthTest,
-    BEOutputMerger,
-    BEStoreTiles,
-    BEEndTile,
-
-    // Number of buckets above; must remain the last enumerator.
-    NumBuckets
-};
-
-// Forward declarations of the INLINE helpers defined further down in this header, so the
-// RDTSC_* macros below can name them before their definitions.
-void rdtscReset(BucketManager* pBucketMgr);
-void rdtscInit(BucketManager* pBucketMgr, int threadId);
-void rdtscStart(BucketManager* pBucketMgr, uint32_t bucketId);
-void rdtscStop(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count, uint64_t drawId);
-void rdtscEvent(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count1, uint32_t count2);
-void rdtscEndFrame(BucketManager* pBucketMgr);
-
-// Instrumentation entry points. When KNOB_ENABLE_RDTSC is defined each macro forwards to the
-// matching rdtsc* helper; otherwise each expands to nothing, so its arguments are never
-// evaluated and must not carry required side effects. The test is #ifdef: it checks only that
-// the symbol is defined, not what value it has.
-// NOTE(review): RDTSC_BEGIN / RDTSC_END are not defined in this header - presumably they are
-// provided elsewhere; confirm before treating this list as the complete macro set.
-#ifdef KNOB_ENABLE_RDTSC
-#define RDTSC_RESET(pBucketMgr) rdtscReset(pBucketMgr)
-#define RDTSC_INIT(pBucketMgr, threadId) rdtscInit(pBucketMgr,threadId)
-#define RDTSC_START(pBucketMgr, bucket) rdtscStart(pBucketMgr, bucket)
-#define RDTSC_STOP(pBucketMgr, bucket, count, draw) rdtscStop(pBucketMgr, bucket, count, draw)
-#define RDTSC_EVENT(pBucketMgr, bucket, count1, count2) rdtscEvent(pBucketMgr, bucket, count1, count2)
-#define RDTSC_ENDFRAME(pBucketMgr) rdtscEndFrame(pBucketMgr)
-#else
-#define RDTSC_RESET(pBucketMgr)
-#define RDTSC_INIT(pBucketMgr, threadId)
-#define RDTSC_START(pBucketMgr, bucket)
-#define RDTSC_STOP(pBucketMgr, bucket, count, draw)
-#define RDTSC_EVENT(pBucketMgr, bucket, count1, count2)
-#define RDTSC_ENDFRAME(pBucketMgr)
-#endif
-
-// Bucket description table indexed by CORE_BUCKETS; defined in rdtsc_core.cpp.
-extern BUCKET_DESC           gCoreBuckets[];
-
-// Reset profiling state: rewind the frame counter to zero and drop all registered threads.
-// Bucket registration (mBucketMap / mBucketsInitialized) is not touched here.
-INLINE void rdtscReset(BucketManager *pBucketMgr)
-{
-    pBucketMgr->mCurrentFrame = 0;
-    pBucketMgr->ClearThreads();
-}
-
-// Per-thread profiling setup. Thread 0 additionally registers the whole gCoreBuckets table the
-// first time it gets here; every thread then registers itself under a fixed label.
-INLINE void rdtscInit(BucketManager* pBucketMgr, int threadId)
-{
-    const bool isThreadZero = (threadId == 0);
-
-    // One-time bucket registration, performed only by thread 0.
-    if (!pBucketMgr->mBucketsInitialized && isThreadZero)
-    {
-        pBucketMgr->mBucketMap.resize(NumBuckets);
-
-        // mBucketMap translates a CORE_BUCKETS value into the id handed back by RegisterBucket.
-        uint32_t bucket = 0;
-        while (bucket < NumBuckets)
-        {
-            pBucketMgr->mBucketMap[bucket] = pBucketMgr->RegisterBucket(gCoreBuckets[bucket]);
-            ++bucket;
-        }
-
-        pBucketMgr->mBucketsInitialized = true;
-    }
-
-    // Thread 0 is labelled "API"; every other thread is labelled "WORKER".
-    std::string name(isThreadZero ? "API" : "WORKER");
-    pBucketMgr->RegisterThread(name);
-}
-
-// Open a timing interval for bucketId (a CORE_BUCKETS value), after translating it through
-// mBucketMap into the id the bucket manager registered for it.
-INLINE void rdtscStart(BucketManager* pBucketMgr, uint32_t bucketId)
-{
-    const uint32_t registeredId = pBucketMgr->mBucketMap[bucketId];
-
-    pBucketMgr->StartBucket(registeredId);
-}
-
-// Close the timing interval for bucketId (a CORE_BUCKETS value), after translating it through
-// mBucketMap. The count and drawId arguments are accepted but not used by this implementation.
-INLINE void rdtscStop(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count, uint64_t drawId)
-{
-    const uint32_t registeredId = pBucketMgr->mBucketMap[bucketId];
-
-    pBucketMgr->StopBucket(registeredId);
-}
-
-// Record an event of size count1 against bucketId (a CORE_BUCKETS value), after translating it
-// through mBucketMap. The count2 argument is accepted but not used by this implementation.
-INLINE void rdtscEvent(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count1, uint32_t count2)
-{
-    const uint32_t registeredId = pBucketMgr->mBucketMap[bucketId];
-
-    pBucketMgr->AddEvent(registeredId, count1);
-}
-
-// Frame boundary hook: bumps the frame counter, starts capture when the counter reaches
-// KNOB_BUCKETS_START_FRAME and, when it reaches KNOB_BUCKETS_END_FRAME, stops capture and
-// writes the report to "rdtsc.txt". Both actions require START_FRAME < END_FRAME.
-INLINE void rdtscEndFrame(BucketManager* pBucketMgr)
-{
-    ++pBucketMgr->mCurrentFrame;
-
-    // Capture begins on the start frame, but only for a non-empty frame window.
-    if (pBucketMgr->mCurrentFrame == KNOB_BUCKETS_START_FRAME)
-    {
-        if (KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME)
-        {
-            pBucketMgr->StartCapture();
-        }
-    }
-
-    // Capture ends on the end frame; the report is written right after stopping.
-    if (pBucketMgr->mCurrentFrame == KNOB_BUCKETS_END_FRAME)
-    {
-        if (KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME)
-        {
-            pBucketMgr->StopCapture();
-            pBucketMgr->PrintReport("rdtsc.txt");
-        }
-    }
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
deleted file mode 100644
index 2e758f43753..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file ringbuffer.h
- *
- * @brief RingBuffer
- *        The RingBuffer class manages all aspects of the ring buffer including
- *        the head/tail indices, etc.
- *
- ******************************************************************************/
-#pragma once
-
-// Fixed-capacity ring of T with free-running 32-bit head/tail counters.
-//
-// Enqueue() advances the head counter and Dequeue() advances the tail counter; neither touches
-// the storage. Elements are accessed through operator[], which takes a slot index that is
-// already below the capacity - mapping a counter value to a slot is left to the caller.
-// The ring is empty when head == tail and full when head - tail == capacity.
-template <typename T>
-class RingBuffer
-{
-public:
-    RingBuffer() : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0) {}
-
-    ~RingBuffer() { Destroy(); }
-
-    // Allocate 64-byte aligned storage for numEntries elements and zero-fill it with memset
-    // (no T constructors are run). numEntries must be non-zero and must divide 2^32 evenly, as
-    // asserted below, so that the 32-bit counters wrapping around does not cause wrap errors.
-    // Any buffer from a previous Init is overwritten without being freed; call Destroy first.
-    void Init(uint32_t numEntries)
-    {
-        SWR_ASSERT(numEntries > 0);
-        SWR_ASSERT(((1ULL << 32) % numEntries) == 0,
-                   "%d is not evenly divisible into 2 ^ 32.  Wrap errors will occur!",
-                   numEntries);
-        mNumEntries  = numEntries;
-        mpRingBuffer = (T*)AlignedMalloc(sizeof(T) * numEntries, 64);
-        SWR_ASSERT(mpRingBuffer != nullptr);
-        memset((void*)mpRingBuffer, 0, sizeof(T) * numEntries);
-    }
-
-    // Release the storage. The pointer is nulled; mNumEntries and the head/tail counters are
-    // left as they were.
-    void Destroy()
-    {
-        AlignedFree(mpRingBuffer);
-        mpRingBuffer = nullptr;
-    }
-
-    // Slot access; index must already be in [0, mNumEntries).
-    T& operator[](const uint32_t index)
-    {
-        SWR_ASSERT(index < mNumEntries);
-        return mpRingBuffer[index];
-    }
-
-    // Publish one more entry. Plain (non-interlocked) increment: see the producer note below.
-    INLINE void Enqueue()
-    {
-        mRingHead++; // There's only one producer.
-        // Assert to find wrap-around cases, NEVER ENABLE DURING CHECKIN!!
-        // SWR_REL_ASSERT(mRingHead);
-    }
-
-    // Retire one entry. Uses an interlocked increment because several consumers may retire
-    // entries concurrently.
-    INLINE void Dequeue()
-    {
-        InterlockedIncrement(&mRingTail); // There are multiple consumers.
-    }
-
-    INLINE bool IsEmpty() { return (GetHead() == GetTail()); }
-
-    // Unsigned subtraction keeps the in-flight count correct across counter wrap-around.
-    INLINE bool IsFull()
-    {
-        uint32_t numEnqueued = GetHead() - GetTail();
-        SWR_ASSERT(numEnqueued <= mNumEntries);
-
-        return (numEnqueued == mNumEntries);
-    }
-
-    INLINE uint32_t GetTail() volatile { return mRingTail; }
-    INLINE uint32_t GetHead() volatile { return mRingHead; }
-
-protected:
-    T*       mpRingBuffer;
-    uint32_t mNumEntries;
-
-    OSALIGNLINE(volatile uint32_t) mRingHead; // Producer counter (advanced by Enqueue)
-    OSALIGNLINE(volatile uint32_t) mRingTail; // Consumer counter (advanced by Dequeue)
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
deleted file mode 100644
index 66a23bd9b08..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ /dev/null
@@ -1,1240 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file state.h
- *
- * @brief Definitions for API state.
- *
- ******************************************************************************/
-// Skipping clang-format due to parsing by simplistic python scripts
-// clang-format off
-#pragma once
-
-#include "common/formats.h"
-#include "common/intrin.h"
-#include "common/rdtsc_buckets.h"
-#include <functional>
-#include <algorithm>
-
-using gfxptr_t = unsigned long long;
-
-//////////////////////////////////////////////////////////////////////////
-/// PRIMITIVE_TOPOLOGY.
-//////////////////////////////////////////////////////////////////////////
-enum PRIMITIVE_TOPOLOGY
-{
-    TOP_UNKNOWN                = 0x0,
-    TOP_POINT_LIST             = 0x1,
-    TOP_LINE_LIST              = 0x2,
-    TOP_LINE_STRIP             = 0x3,
-    TOP_TRIANGLE_LIST          = 0x4,
-    TOP_TRIANGLE_STRIP         = 0x5,
-    TOP_TRIANGLE_FAN           = 0x6,
-    TOP_QUAD_LIST              = 0x7,
-    TOP_QUAD_STRIP             = 0x8,
-    TOP_LINE_LIST_ADJ          = 0x9,
-    TOP_LISTSTRIP_ADJ          = 0xA,
-    TOP_TRI_LIST_ADJ           = 0xB,
-    TOP_TRI_STRIP_ADJ          = 0xC,
-    TOP_TRI_STRIP_REVERSE      = 0xD,
-    TOP_POLYGON                = 0xE,
-    TOP_RECT_LIST              = 0xF,
-    TOP_LINE_LOOP              = 0x10,
-    TOP_POINT_LIST_BF          = 0x11,
-    TOP_LINE_STRIP_CONT        = 0x12,
-    TOP_LINE_STRIP_BF          = 0x13,
-    TOP_LINE_STRIP_CONT_BF     = 0x14,
-    TOP_TRIANGLE_FAN_NOSTIPPLE = 0x16,
-    TOP_TRIANGLE_DISC          = 0x17, /// @todo What is this??
-
-    TOP_PATCHLIST_BASE = 0x1F, // Invalid topology, used to calculate num verts for a patchlist.
-    TOP_PATCHLIST_1    = 0x20, // List of 1-vertex patches
-    TOP_PATCHLIST_2    = 0x21,
-    TOP_PATCHLIST_3    = 0x22,
-    TOP_PATCHLIST_4    = 0x23,
-    TOP_PATCHLIST_5    = 0x24,
-    TOP_PATCHLIST_6    = 0x25,
-    TOP_PATCHLIST_7    = 0x26,
-    TOP_PATCHLIST_8    = 0x27,
-    TOP_PATCHLIST_9    = 0x28,
-    TOP_PATCHLIST_10   = 0x29,
-    TOP_PATCHLIST_11   = 0x2A,
-    TOP_PATCHLIST_12   = 0x2B,
-    TOP_PATCHLIST_13   = 0x2C,
-    TOP_PATCHLIST_14   = 0x2D,
-    TOP_PATCHLIST_15   = 0x2E,
-    TOP_PATCHLIST_16   = 0x2F,
-    TOP_PATCHLIST_17   = 0x30,
-    TOP_PATCHLIST_18   = 0x31,
-    TOP_PATCHLIST_19   = 0x32,
-    TOP_PATCHLIST_20   = 0x33,
-    TOP_PATCHLIST_21   = 0x34,
-    TOP_PATCHLIST_22   = 0x35,
-    TOP_PATCHLIST_23   = 0x36,
-    TOP_PATCHLIST_24   = 0x37,
-    TOP_PATCHLIST_25   = 0x38,
-    TOP_PATCHLIST_26   = 0x39,
-    TOP_PATCHLIST_27   = 0x3A,
-    TOP_PATCHLIST_28   = 0x3B,
-    TOP_PATCHLIST_29   = 0x3C,
-    TOP_PATCHLIST_30   = 0x3D,
-    TOP_PATCHLIST_31   = 0x3E,
-    TOP_PATCHLIST_32   = 0x3F, // List of 32-vertex patches
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_SHADER_TYPE
-//////////////////////////////////////////////////////////////////////////
-enum SWR_SHADER_TYPE
-{
-    SHADER_VERTEX,
-    SHADER_GEOMETRY,
-    SHADER_DOMAIN,
-    SHADER_HULL,
-    SHADER_PIXEL,
-    SHADER_COMPUTE,
-
-    NUM_SHADER_TYPES,
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_RENDERTARGET_ATTACHMENT
-/// @todo It's not clear what an "attachment" means. It's not a common term.
-//////////////////////////////////////////////////////////////////////////
-enum SWR_RENDERTARGET_ATTACHMENT
-{
-    SWR_ATTACHMENT_COLOR0,
-    SWR_ATTACHMENT_COLOR1,
-    SWR_ATTACHMENT_COLOR2,
-    SWR_ATTACHMENT_COLOR3,
-    SWR_ATTACHMENT_COLOR4,
-    SWR_ATTACHMENT_COLOR5,
-    SWR_ATTACHMENT_COLOR6,
-    SWR_ATTACHMENT_COLOR7,
-    SWR_ATTACHMENT_DEPTH,
-    SWR_ATTACHMENT_STENCIL,
-
-    SWR_NUM_ATTACHMENTS
-};
-
-#define SWR_NUM_RENDERTARGETS 8
-
-#define SWR_ATTACHMENT_COLOR0_BIT 0x001
-#define SWR_ATTACHMENT_COLOR1_BIT 0x002
-#define SWR_ATTACHMENT_COLOR2_BIT 0x004
-#define SWR_ATTACHMENT_COLOR3_BIT 0x008
-#define SWR_ATTACHMENT_COLOR4_BIT 0x010
-#define SWR_ATTACHMENT_COLOR5_BIT 0x020
-#define SWR_ATTACHMENT_COLOR6_BIT 0x040
-#define SWR_ATTACHMENT_COLOR7_BIT 0x080
-#define SWR_ATTACHMENT_DEPTH_BIT 0x100
-#define SWR_ATTACHMENT_STENCIL_BIT 0x200
-#define SWR_ATTACHMENT_MASK_ALL 0x3ff
-#define SWR_ATTACHMENT_MASK_COLOR 0x0ff
-
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SWR Inner Tessellation factor ID
-/// See above GetTessFactorOutputPosition code for documentation
-enum SWR_INNER_TESSFACTOR_ID
-{
-    SWR_QUAD_U_TRI_INSIDE,
-    SWR_QUAD_V_INSIDE,
-
-    SWR_NUM_INNER_TESS_FACTORS,
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief SWR Outer Tessellation factor ID
-/// See above GetTessFactorOutputPosition code for documentation
-enum SWR_OUTER_TESSFACTOR_ID
-{
-    SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL,
-    SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY,
-    SWR_QUAD_V_EQ0_TRI_W,
-    SWR_QUAD_V_EQ1,
-
-    SWR_NUM_OUTER_TESS_FACTORS,
-};
-
-/////////////////////////////////////////////////////////////////////////
-/// simdvertex
-/// @brief Defines a vertex element that holds all the data for SIMD vertices.
-///        Contains space for position, SGV, and 32 generic attributes
-/////////////////////////////////////////////////////////////////////////
-enum SWR_VTX_SLOTS
-{
-    VERTEX_SGV_SLOT                 = 0,
-    VERTEX_SGV_RTAI_COMP            = 0,
-    VERTEX_SGV_VAI_COMP             = 1,
-    VERTEX_SGV_POINT_SIZE_COMP      = 2,
-    VERTEX_POSITION_SLOT            = 1,
-    VERTEX_POSITION_END_SLOT        = 1,
-    VERTEX_CLIPCULL_DIST_LO_SLOT    = (1 + VERTEX_POSITION_END_SLOT), // VS writes lower 4 clip/cull dist
-    VERTEX_CLIPCULL_DIST_HI_SLOT    = (2 + VERTEX_POSITION_END_SLOT), // VS writes upper 4 clip/cull dist
-    VERTEX_ATTRIB_START_SLOT        = (3 + VERTEX_POSITION_END_SLOT),
-    VERTEX_ATTRIB_END_SLOT          = (34 + VERTEX_POSITION_END_SLOT),
-    SWR_VTX_NUM_SLOTS               = (1 + VERTEX_ATTRIB_END_SLOT)
-};
-
-// SoAoSoA
-struct simdvertex
-{
-    simdvector attrib[SWR_VTX_NUM_SLOTS];
-};
-
-struct simd16vertex
-{
-    simd16vector attrib[SWR_VTX_NUM_SLOTS];
-};
-
-template <typename SIMD_T>
-struct SIMDVERTEX_T
-{
-    typename SIMD_T::Vec4 attrib[SWR_VTX_NUM_SLOTS];
-};
-
-struct SWR_WORKER_DATA
-{
-    HANDLE hArContext;  // handle to the archrast context
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_SHADER_STATS
-/// @brief Structure passed to shader for stats collection.
-/////////////////////////////////////////////////////////////////////////
-struct SWR_SHADER_STATS
-{
-    uint32_t numInstExecuted;      // This is roughly the API instructions executed and not x86.
-    uint32_t numSampleExecuted;
-    uint32_t numSampleLExecuted;
-    uint32_t numSampleBExecuted;
-    uint32_t numSampleCExecuted;
-    uint32_t numSampleCLZExecuted;
-    uint32_t numSampleCDExecuted;
-    uint32_t numGather4Executed;
-    uint32_t numGather4CExecuted;
-    uint32_t numGather4CPOExecuted;
-    uint32_t numGather4CPOCExecuted;
-    uint32_t numLodExecuted;
-};
-
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_VS_CONTEXT
-/// @brief Input to vertex shader
-/////////////////////////////////////////////////////////////////////////
-struct SWR_VS_CONTEXT
-{
-    simdvertex* pVin;  // IN: SIMD input vertex data store
-    simdvertex* pVout; // OUT: SIMD output vertex data store
-
-    uint32_t    InstanceID; // IN: Instance ID, constant across all verts of the SIMD
-    simdscalari VertexID;   // IN: Vertex ID
-    simdscalari mask;       // IN: Active mask for shader
-
-    // SIMD16 Frontend fields.
-    uint32_t AlternateOffset; // IN: amount to offset for interleaving even/odd simd8 in
-                              // simd16vertex output
-    simd16scalari mask16;     // IN: Active mask for shader (16-wide)
-    simd16scalari VertexID16; // IN: Vertex ID (16-wide)
-
-    SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast.
-};
-
-/////////////////////////////////////////////////////////////////////////
-/// ScalarCPoint
-/// @brief defines a control point element as passed from the output
-/// of the hull shader to the input of the domain shader
-/////////////////////////////////////////////////////////////////////////
-struct ScalarAttrib
-{
-    float x;
-    float y;
-    float z;
-    float w;
-};
-
-struct ScalarCPoint
-{
-    ScalarAttrib attrib[SWR_VTX_NUM_SLOTS];
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_TESSELLATION_FACTORS
-/// @brief Tessellation factors structure (non-vector)
-/////////////////////////////////////////////////////////////////////////
-struct SWR_TESSELLATION_FACTORS
-{
-    float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS];
-    float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS];
-    float pad[2];
-};
-
-SWR_STATIC_ASSERT(sizeof(SWR_TESSELLATION_FACTORS) == 32);
-
-#define MAX_NUM_VERTS_PER_PRIM 32 // support up to 32 control point patches
-struct ScalarPatch
-{
-    SWR_TESSELLATION_FACTORS tessFactors;
-    ScalarCPoint             cp[MAX_NUM_VERTS_PER_PRIM];
-    ScalarCPoint             patchData;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_HS_CONTEXT
-/// @brief Input to hull shader
-/////////////////////////////////////////////////////////////////////////
-struct SWR_HS_CONTEXT
-{
-    simdvertex       vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data
-    simdscalari      PrimitiveID;                  // IN: (SIMD) primitive ID generated from the draw call
-    simdscalari      mask;                         // IN: Active mask for shader
-    uint32_t         outputSize;                   // IN: Size of HS output (per lane)
-    ScalarPatch*     pCPout;                       // OUT: Output control point patch SIMD-sized-array of SCALAR patches
-    SWR_SHADER_STATS stats;                        // OUT: shader statistics used for archrast.
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_DS_CONTEXT
-/// @brief Input to domain shader
-/////////////////////////////////////////////////////////////////////////
-struct SWR_DS_CONTEXT
-{
-    uint32_t        PrimitiveID;    // IN: (SCALAR) PrimitiveID for the patch associated with the DS invocation
-    uint32_t        vectorOffset;   // IN: (SCALAR) vector index offset into SIMD data.
-    uint32_t        vectorStride;   // IN: (SCALAR) stride (in vectors) of output data per attribute-component
-    uint32_t        outVertexAttribOffset; // IN: (SCALAR) Offset to the attributes as processed by the next shader stage.
-    ScalarPatch*    pCpIn;          // IN: (SCALAR) Control patch
-    simdscalar*     pDomainU;       // IN: (SIMD) Domain Point U coords
-    simdscalar*     pDomainV;       // IN: (SIMD) Domain Point V coords
-    simdscalari     mask;           // IN: Active mask for shader
-    simdscalar*     pOutputData;    // OUT: (SIMD) Vertex Attributes (2D array of vectors, one row per attribute-component)
-    SWR_SHADER_STATS stats;         // OUT: shader statistics used for archrast.
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_GS_CONTEXT
-/// @brief Input to geometry shader.
-/////////////////////////////////////////////////////////////////////////
-struct SWR_GS_CONTEXT
-{
-    simdvector* pVerts;                    // IN: input primitive data for SIMD prims
-    uint32_t    inputVertStride;           // IN: input vertex stride, in attributes
-    simdscalari PrimitiveID;               // IN: input primitive ID generated from the draw call
-    uint32_t    InstanceID;                // IN: input instance ID
-    simdscalari mask;                      // IN: Active mask for shader
-    uint8_t*    pStreams[KNOB_SIMD_WIDTH]; // OUT: output stream (contains vertices for all output streams)
-    SWR_SHADER_STATS stats;                // OUT: shader statistics used for archrast.
-};
-
-struct PixelPositions
-{
-    simdscalar UL;
-    simdscalar center;
-    simdscalar sample;
-    simdscalar centroid;
-};
-
-#define SWR_MAX_NUM_MULTISAMPLES 16
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_PS_CONTEXT
-/// @brief Input to pixel shader.
-/////////////////////////////////////////////////////////////////////////
-struct SWR_PS_CONTEXT
-{
-    PixelPositions vX;         // IN: x location(s) of pixels
-    PixelPositions vY;         // IN: y location(s) of pixels
-    simdscalar     vZ;         // INOUT: z location of pixels
-    simdscalari    activeMask; // OUT: mask for kill
-    simdscalar     inputMask;  // IN: input coverage mask for all samples
-    simdscalari    oMask;      // OUT: mask for output coverage
-
-    PixelPositions vI; // barycentric coords evaluated at pixel center, sample position, centroid
-    PixelPositions vJ;
-    PixelPositions vOneOverW; // IN: 1/w
-
-    const float* pAttribs;      // IN: pointer to attribute barycentric coefficients
-    const float* pPerspAttribs; // IN: pointer to attribute/w barycentric coefficients
-    const float* pRecipW;       // IN: pointer to 1/w coord for each vertex
-    const float* I;             // IN: Barycentric A, B, and C coefs used to compute I
-    const float* J;             // IN: Barycentric A, B, and C coefs used to compute J
-    float        recipDet;      // IN: 1/Det, used when barycentric interpolating attributes
-    const float* pSamplePosX;   // IN: array of sample positions
-    const float* pSamplePosY;   // IN: array of sample positions
-    simdvector   shaded[SWR_NUM_RENDERTARGETS]; // OUT: result color per rendertarget
-
-    uint32_t frontFace;              // IN: front- 1, back- 0
-    uint32_t sampleIndex;            // IN: sampleIndex
-    uint32_t renderTargetArrayIndex; // IN: render target array index from GS
-    uint32_t viewportIndex;          // IN: viewport index from GS
-    uint32_t rasterizerSampleCount;  // IN: sample count used by the rasterizer
-
-    uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS]; // IN: Pointers to render target hottiles
-
-    SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast.
-
-    BucketManager *pBucketManager; // @llvm_struct - IN: performance buckets.
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_CS_CONTEXT
-/// @brief Input to compute shader.
-/////////////////////////////////////////////////////////////////////////
-struct SWR_CS_CONTEXT
-{
-    // The ThreadGroupId is the current thread group index relative
-    // to all thread groups in the Dispatch call. The ThreadId, ThreadIdInGroup,
-    // and ThreadIdInGroupFlattened can be derived from ThreadGroupId in the shader.
-
-    // Compute shader accepts the following system values.
-    // o ThreadId - Current thread id relative to all other threads in dispatch.
-    // o ThreadGroupId - Current thread group id relative to all other groups in dispatch.
-    // o ThreadIdInGroup - Current thread relative to all threads in the current thread group.
-    // o ThreadIdInGroupFlattened - Flattened linear id derived from ThreadIdInGroup.
-    //
-    // All of these system values can be computed in the shader. They will be
-    // derived from the current tile counter. The tile counter is an atomic counter that
-    // resides in the draw context and is initialized to the product of the dispatch dims.
-    //
-    //  tileCounter = dispatchDims.x * dispatchDims.y * dispatchDims.z
-    //
-    // Each CPU worker thread will atomically decrement this counter and passes the current
-    // count into the shader. When the count reaches 0 then all thread groups in the
-    // dispatch call have been completed.
-
-    uint32_t tileCounter; // The tile counter value for this thread group.
-
-    // Dispatch dimensions used by shader to compute system values from the tile counter.
-    uint32_t dispatchDims[3];
-
-    uint8_t* pTGSM;               // Thread Group Shared Memory pointer.
-    uint8_t* pSpillFillBuffer;    // Spill/fill buffer for barrier support
-    uint8_t* pScratchSpace;       // Pointer to scratch space buffer used by the shader, shader is
-                                  // responsible for subdividing scratch space per instance/simd
-    uint32_t scratchSpacePerWarp; // Scratch space per work item x SIMD_WIDTH
-
-    SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast.
-};
-
-// enums
-enum SWR_TILE_MODE
-{
-    SWR_TILE_NONE = 0x0,     // Linear mode (no tiling)
-    SWR_TILE_MODE_WMAJOR,    // W major tiling
-    SWR_TILE_MODE_XMAJOR,    // X major tiling
-    SWR_TILE_MODE_YMAJOR,    // Y major tiling
-    SWR_TILE_SWRZ,           // SWR-Z tiling
-
-
-    SWR_TILE_MODE_COUNT
-};
-
-enum SWR_SURFACE_TYPE
-{
-    SURFACE_1D                = 0,
-    SURFACE_2D                = 1,
-    SURFACE_3D                = 2,
-    SURFACE_CUBE              = 3,
-    SURFACE_BUFFER            = 4,
-    SURFACE_STRUCTURED_BUFFER = 5,
-    SURFACE_NULL              = 7
-};
-
-enum SWR_ZFUNCTION
-{
-    ZFUNC_ALWAYS,
-    ZFUNC_NEVER,
-    ZFUNC_LT,
-    ZFUNC_EQ,
-    ZFUNC_LE,
-    ZFUNC_GT,
-    ZFUNC_NE,
-    ZFUNC_GE,
-    NUM_ZFUNC
-};
-
-enum SWR_STENCILOP
-{
-    STENCILOP_KEEP,
-    STENCILOP_ZERO,
-    STENCILOP_REPLACE,
-    STENCILOP_INCRSAT,
-    STENCILOP_DECRSAT,
-    STENCILOP_INCR,
-    STENCILOP_DECR,
-    STENCILOP_INVERT
-};
-
-enum SWR_BLEND_FACTOR
-{
-    BLENDFACTOR_ONE,
-    BLENDFACTOR_SRC_COLOR,
-    BLENDFACTOR_SRC_ALPHA,
-    BLENDFACTOR_DST_ALPHA,
-    BLENDFACTOR_DST_COLOR,
-    BLENDFACTOR_SRC_ALPHA_SATURATE,
-    BLENDFACTOR_CONST_COLOR,
-    BLENDFACTOR_CONST_ALPHA,
-    BLENDFACTOR_SRC1_COLOR,
-    BLENDFACTOR_SRC1_ALPHA,
-    BLENDFACTOR_ZERO,
-    BLENDFACTOR_INV_SRC_COLOR,
-    BLENDFACTOR_INV_SRC_ALPHA,
-    BLENDFACTOR_INV_DST_ALPHA,
-    BLENDFACTOR_INV_DST_COLOR,
-    BLENDFACTOR_INV_CONST_COLOR,
-    BLENDFACTOR_INV_CONST_ALPHA,
-    BLENDFACTOR_INV_SRC1_COLOR,
-    BLENDFACTOR_INV_SRC1_ALPHA
-};
-
-enum SWR_BLEND_OP
-{
-    BLENDOP_ADD,
-    BLENDOP_SUBTRACT,
-    BLENDOP_REVSUBTRACT,
-    BLENDOP_MIN,
-    BLENDOP_MAX,
-};
-
-enum SWR_LOGIC_OP
-{
-    LOGICOP_CLEAR,
-    LOGICOP_NOR,
-    LOGICOP_AND_INVERTED,
-    LOGICOP_COPY_INVERTED,
-    LOGICOP_AND_REVERSE,
-    LOGICOP_INVERT,
-    LOGICOP_XOR,
-    LOGICOP_NAND,
-    LOGICOP_AND,
-    LOGICOP_EQUIV,
-    LOGICOP_NOOP,
-    LOGICOP_OR_INVERTED,
-    LOGICOP_COPY,
-    LOGICOP_OR_REVERSE,
-    LOGICOP_OR,
-    LOGICOP_SET,
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_AUX_MODE
-/// @brief Specifies how the auxiliary buffer is used by the driver.
-//////////////////////////////////////////////////////////////////////////
-enum SWR_AUX_MODE
-{
-    AUX_MODE_NONE,
-    AUX_MODE_COLOR,
-    AUX_MODE_UAV,
-    AUX_MODE_DEPTH,
-};
-
-// vertex fetch state
-// WARNING- any changes to this struct need to be reflected
-// in the fetch shader jit
-struct SWR_VERTEX_BUFFER_STATE
-{
-    gfxptr_t xpData;
-    uint32_t index;
-    uint32_t pitch;
-    uint32_t size;
-    uint32_t minVertex; // min vertex (for bounds checking)
-    uint32_t maxVertex; // size / pitch.  precalculated value used by fetch shader for OOB checks
-    uint32_t partialInboundsSize; // size % pitch.  precalculated value used by fetch shader for
-                                  // partially OOB vertices
-};
-
-struct SWR_INDEX_BUFFER_STATE
-{
-    gfxptr_t xpIndices;
-    // Format type for indices (e.g. UINT16, UINT32, etc.)
-    SWR_FORMAT format; // @llvm_enum
-    uint32_t   size;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_FETCH_CONTEXT
-/// @brief Input to fetch shader.
-/// @note WARNING - Changes to this struct need to be reflected in the
-///                 fetch shader jit.
-/////////////////////////////////////////////////////////////////////////
-struct SWR_FETCH_CONTEXT
-{
-    const SWR_VERTEX_BUFFER_STATE* pStreams;  // IN: array of bound vertex buffers
-    gfxptr_t                       xpIndices; // IN: pointer to int32 index buffer for indexed draws
-    gfxptr_t    xpLastIndex;   // IN: pointer to end of index buffer, used for bounds checking
-    uint32_t    CurInstance;   // IN: current instance
-    uint32_t    BaseVertex;    // IN: base vertex
-    uint32_t    StartVertex;   // IN: start vertex
-    uint32_t    StartInstance; // IN: start instance
-    simdscalari VertexID;      // OUT: vector of vertex IDs
-    simdscalari CutMask;       // OUT: vector mask of indices which have the cut index value
-#if USE_SIMD16_SHADERS
-    //    simd16scalari VertexID;                     // OUT: vector of vertex IDs
-    //    simd16scalari CutMask;                      // OUT: vector mask of indices which have the
-    //    cut index value
-    simdscalari VertexID2; // OUT: vector of vertex IDs
-    simdscalari CutMask2;  // OUT: vector mask of indices which have the cut index value
-#endif
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_STATS
-///
-/// @brief All statistics generated by SWR go here. These are public
-///        to driver.
-/////////////////////////////////////////////////////////////////////////
-OSALIGNLINE(struct) SWR_STATS
-{
-    // Occlusion Query
-    uint64_t DepthPassCount; // Number of passing depth tests. Not exact.
-
-    // Pipeline Stats
-    uint64_t PsInvocations; // Number of Pixel Shader invocations
-    uint64_t CsInvocations; // Number of Compute Shader invocations
-
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_STATS_FE
-///
-/// @brief All statistics generated by FE.
-/////////////////////////////////////////////////////////////////////////
-OSALIGNLINE(struct) SWR_STATS_FE
-{
-    uint64_t IaVertices;    // Number of Fetch Shader vertices
-    uint64_t IaPrimitives;  // Number of PA primitives.
-    uint64_t VsInvocations; // Number of Vertex Shader invocations
-    uint64_t HsInvocations; // Number of Hull Shader invocations
-    uint64_t DsInvocations; // Number of Domain Shader invocations
-    uint64_t GsInvocations; // Number of Geometry Shader invocations
-    uint64_t GsPrimitives;  // Number of prims GS outputs.
-    uint64_t CInvocations;  // Number of clipper invocations
-    uint64_t CPrimitives;   // Number of clipper primitives.
-
-    // Streamout Stats
-    uint64_t SoPrimStorageNeeded[4];
-    uint64_t SoNumPrimsWritten[4];
-};
-
-    //////////////////////////////////////////////////////////////////////////
-    /// STREAMOUT_BUFFERS
-    /////////////////////////////////////////////////////////////////////////
-
-#define MAX_SO_STREAMS 4
-#define MAX_SO_BUFFERS 4
-#define MAX_ATTRIBUTES 32
-
-struct SWR_STREAMOUT_BUFFER
-{
-    // Pointers to streamout buffers.
-    gfxptr_t pBuffer;
-
-    // Offset to the SO write offset. If not null then we update offset here.
-    gfxptr_t pWriteOffset;
-
-    bool enable;
-    bool soWriteEnable;
-
-    // Size of buffer in dwords.
-    uint32_t bufferSize;
-
-    // Vertex pitch of buffer in dwords.
-    uint32_t pitch;
-
-    // Offset into buffer in dwords. SOS will increment this offset.
-    uint32_t streamOffset;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// STREAMOUT_STATE
-/////////////////////////////////////////////////////////////////////////
-struct SWR_STREAMOUT_STATE
-{
-    // This disables stream output.
-    bool soEnable;
-
-    // which streams are enabled for streamout
-    bool streamEnable[MAX_SO_STREAMS];
-
-    // If set then do not send any streams to the rasterizer.
-    bool rasterizerDisable;
-
-    // Specifies which stream to send to the rasterizer.
-    uint32_t streamToRasterizer;
-
-    // The stream masks specify which attributes are sent to which streams.
-    // These masks help the FE to set up the pPrimData buffer that is passed
-    // to the Stream Output Shader (SOS) function.
-    uint64_t streamMasks[MAX_SO_STREAMS];
-
-    // Number of attributes, including position, per vertex that are streamed out.
-    // This should match number of bits in stream mask.
-    uint32_t streamNumEntries[MAX_SO_STREAMS];
-
-    // Offset to the start of the attributes of the input vertices, in simdvector units
-    uint32_t vertexAttribOffset[MAX_SO_STREAMS];
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// STREAMOUT_CONTEXT - Passed to SOS
-/////////////////////////////////////////////////////////////////////////
-struct SWR_STREAMOUT_CONTEXT
-{
-    uint32_t*             pPrimData;
-    SWR_STREAMOUT_BUFFER* pBuffer[MAX_SO_STREAMS];
-
-    // Num prims written for this stream
-    uint32_t numPrimsWritten;
-
-    // Num prims that should have been written if there were no overflow.
-    uint32_t numPrimStorageNeeded;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_GS_STATE - Geometry shader state
-/////////////////////////////////////////////////////////////////////////
-struct SWR_GS_STATE
-{
-    bool gsEnable;
-
-    // If true, geometry shader emits a single stream, with separate cut buffer.
-    // If false, geometry shader emits vertices for multiple streams to the stream buffer, with a
-    // separate StreamID buffer to map vertices to streams
-    bool isSingleStream;
-
-    // Number of input attributes per vertex. Used by the frontend to
-    // optimize assembling primitives for GS
-    uint32_t numInputAttribs;
-
-    // Stride of incoming verts in attributes
-    uint32_t inputVertStride;
-
-    // Output topology - can be point, tristrip, linestrip, or rectlist
-    PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum
-
-    // Maximum number of verts that can be emitted by a single instance of the GS
-    uint32_t maxNumVerts;
-
-    // Instance count
-    uint32_t instanceCount;
-
-    // When single stream is enabled, singleStreamID dictates which stream is being output.
-    // field ignored if isSingleStream is false
-    uint32_t singleStreamID;
-
-    // Total amount of memory to allocate for one instance of the shader output in bytes
-    uint32_t allocationSize;
-
-    // Offset to start reading data per input vertex in simdvector units. This can be used to
-    // skip over any vertex data output from the previous stage that is unused in the GS, removing
-    // unnecessary vertex processing.
-    uint32_t vertexAttribOffset;
-
-    // Size of the control data section which contains cut or streamID data, in simdscalar units.
-    // Should be sized to handle the maximum number of verts output by the GS. Can be 0 if there are
-    // no cuts or streamID bits.
-    uint32_t controlDataSize;
-
-    // Offset to the control data section, in bytes
-    uint32_t controlDataOffset;
-
-    // Total size of an output vertex, in simdvector units
-    uint32_t outputVertexSize;
-
-    // Offset to the start of the vertex section, in bytes
-    uint32_t outputVertexOffset;
-
-    // Set this to non-zero to indicate that the shader outputs a static number of verts. If zero,
-    // shader is expected to store the final vertex count in the first dword of the gs output
-    // stream.
-    uint32_t staticVertexCount;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_TS_OUTPUT_TOPOLOGY - Defines data output by the tessellator / DS
-/////////////////////////////////////////////////////////////////////////
-enum SWR_TS_OUTPUT_TOPOLOGY
-{
-    SWR_TS_OUTPUT_POINT,
-    SWR_TS_OUTPUT_LINE,
-    SWR_TS_OUTPUT_TRI_CW,
-    SWR_TS_OUTPUT_TRI_CCW,
-
-    SWR_TS_OUTPUT_TOPOLOGY_COUNT
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_TS_PARTITIONING - Defines tessellation algorithm
-/////////////////////////////////////////////////////////////////////////
-enum SWR_TS_PARTITIONING
-{
-    SWR_TS_INTEGER,
-    SWR_TS_ODD_FRACTIONAL,
-    SWR_TS_EVEN_FRACTIONAL,
-
-    SWR_TS_PARTITIONING_COUNT
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_TS_DOMAIN - Defines Tessellation Domain
-/////////////////////////////////////////////////////////////////////////
-enum SWR_TS_DOMAIN
-{
-    SWR_TS_QUAD,
-    SWR_TS_TRI,
-    SWR_TS_ISOLINE,
-
-    SWR_TS_DOMAIN_COUNT
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_TS_STATE - Tessellation state
-/////////////////////////////////////////////////////////////////////////
-struct SWR_TS_STATE
-{
-    bool tsEnable;
-
-    SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology; // @llvm_enum
-    SWR_TS_PARTITIONING    partitioning;     // @llvm_enum
-    SWR_TS_DOMAIN          domain;           // @llvm_enum
-
-    PRIMITIVE_TOPOLOGY postDSTopology; // @llvm_enum
-
-    uint32_t numHsInputAttribs;
-    uint32_t numHsOutputAttribs;
-    uint32_t hsAllocationSize; // Size of HS output in bytes, per lane
-
-    uint32_t numDsOutputAttribs;
-    uint32_t dsAllocationSize;
-    uint32_t dsOutVtxAttribOffset;
-
-    // Offset to the start of the attributes of the input vertices, in simdvector units
-    uint32_t srcVertexAttribOffset;
-
-    // Offset to the start of the attributes expected by the hull shader
-    uint32_t vertexAttribOffset;
-};
-
-// output merger state
-struct SWR_RENDER_TARGET_BLEND_STATE
-{
-    uint8_t writeDisableRed : 1;
-    uint8_t writeDisableGreen : 1;
-    uint8_t writeDisableBlue : 1;
-    uint8_t writeDisableAlpha : 1;
-};
-static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1,
-              "Invalid SWR_RENDER_TARGET_BLEND_STATE size");
-
-enum SWR_MULTISAMPLE_COUNT
-{
-    SWR_MULTISAMPLE_1X = 0,
-    SWR_MULTISAMPLE_2X,
-    SWR_MULTISAMPLE_4X,
-    SWR_MULTISAMPLE_8X,
-    SWR_MULTISAMPLE_16X,
-    SWR_MULTISAMPLE_TYPE_COUNT
-};
-
-static INLINE uint32_t GetNumSamples(/* SWR_MULTISAMPLE_COUNT */ int sampleCountEnum) // @llvm_func_start
-{
-    return uint32_t(1) << sampleCountEnum;
-} // @llvm_func_end
-
-struct SWR_BLEND_STATE
-{
-    // constant blend factor color in RGBA float
-    float constantColor[4];
-
-    // alpha test reference value in unorm8 or float32
-    uint32_t alphaTestReference;
-    uint32_t sampleMask;
-    // all RT's have the same sample count
-    ///@todo move this to Output Merger state when we refactor
-    SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum
-
-    SWR_RENDER_TARGET_BLEND_STATE renderTarget[SWR_NUM_RENDERTARGETS];
-};
-static_assert(sizeof(SWR_BLEND_STATE) == 36, "Invalid SWR_BLEND_STATE size");
-
-struct SWR_BLEND_CONTEXT
-{
-    const SWR_BLEND_STATE* pBlendState;
-    simdvector*            src;
-    simdvector*            src1;
-    simdvector*            src0alpha;
-    uint32_t               sampleNum;
-    simdvector*            pDst;
-    simdvector*            result;
-    simdscalari*           oMask;
-    simdscalari*           pMask;
-    uint32_t               isAlphaTested;
-    uint32_t               isAlphaBlended;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// FUNCTION POINTERS FOR SHADERS
-
-#if USE_SIMD16_SHADERS
-typedef void(__cdecl *PFN_FETCH_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_FETCH_CONTEXT& fetchInfo, simd16vertex& out);
-#else
-typedef void(__cdecl *PFN_FETCH_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out);
-#endif
-typedef void(__cdecl *PFN_VERTEX_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_VS_CONTEXT* pVsContext);
-typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_HS_CONTEXT* pHsContext);
-typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_DS_CONTEXT* pDsContext);
-typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_GS_CONTEXT* pGsContext);
-typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_CS_CONTEXT* pCsContext);
-typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_STREAMOUT_CONTEXT& soContext);
-typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext);
-typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext);
-typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(SWR_BLEND_CONTEXT*);
-typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar const &);
-
-
-//////////////////////////////////////////////////////////////////////////
-/// FRONTEND_STATE
-/////////////////////////////////////////////////////////////////////////
-struct SWR_FRONTEND_STATE
-{
-    // skip clip test, perspective divide, and viewport transform
-    // intended for verts in screen space
-    bool vpTransformDisable;
-    bool bEnableCutIndex;
-    union
-    {
-        struct
-        {
-            uint32_t triFan : 2;
-            uint32_t lineStripList : 1;
-            uint32_t triStripList : 2;
-        };
-        uint32_t bits;
-    } provokingVertex;
-    uint32_t topologyProvokingVertex; // provoking vertex for the draw topology
-
-    // Size of a vertex in simdvector units. Should be sized to the
-    // maximum of the input/output of the vertex shader.
-    uint32_t vsVertexSize;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// VIEWPORT_MATRIX
-/////////////////////////////////////////////////////////////////////////
-struct SWR_VIEWPORT_MATRIX
-{
-    float m00;
-    float m11;
-    float m22;
-    float m30;
-    float m31;
-    float m32;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// VIEWPORT_MATRIXES
-/////////////////////////////////////////////////////////////////////////
-struct SWR_VIEWPORT_MATRICES
-{
-    float m00[KNOB_NUM_VIEWPORTS_SCISSORS];
-    float m11[KNOB_NUM_VIEWPORTS_SCISSORS];
-    float m22[KNOB_NUM_VIEWPORTS_SCISSORS];
-    float m30[KNOB_NUM_VIEWPORTS_SCISSORS];
-    float m31[KNOB_NUM_VIEWPORTS_SCISSORS];
-    float m32[KNOB_NUM_VIEWPORTS_SCISSORS];
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_VIEWPORT
-/////////////////////////////////////////////////////////////////////////
-struct SWR_VIEWPORT
-{
-    float x;
-    float y;
-    float width;
-    float height;
-    float minZ;
-    float maxZ;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_CULLMODE
-//////////////////////////////////////////////////////////////////////////
-enum SWR_CULLMODE
-{
-    SWR_CULLMODE_BOTH,
-    SWR_CULLMODE_NONE,
-    SWR_CULLMODE_FRONT,
-    SWR_CULLMODE_BACK
-};
-
-enum SWR_FILLMODE
-{
-    SWR_FILLMODE_POINT,
-    SWR_FILLMODE_WIREFRAME,
-    SWR_FILLMODE_SOLID
-};
-
-enum SWR_FRONTWINDING
-{
-    SWR_FRONTWINDING_CW,
-    SWR_FRONTWINDING_CCW
-};
-
-
-enum SWR_PIXEL_LOCATION
-{
-    SWR_PIXEL_LOCATION_CENTER,
-    SWR_PIXEL_LOCATION_UL,
-};
-
-// fixed point screen space sample locations within a pixel
-struct SWR_MULTISAMPLE_POS
-{
-public:
-    INLINE void SetXi(uint32_t sampleNum, uint32_t val) { _xi[sampleNum] = val; };   // @llvm_func
-    INLINE void SetYi(uint32_t sampleNum, uint32_t val) { _yi[sampleNum] = val; };   // @llvm_func
-    INLINE uint32_t Xi(uint32_t sampleNum) const { return _xi[sampleNum]; };         // @llvm_func
-    INLINE uint32_t Yi(uint32_t sampleNum) const { return _yi[sampleNum]; };         // @llvm_func
-    INLINE void     SetX(uint32_t sampleNum, float val) { _x[sampleNum] = val; };    // @llvm_func
-    INLINE void     SetY(uint32_t sampleNum, float val) { _y[sampleNum] = val; };    // @llvm_func
-    INLINE float    X(uint32_t sampleNum) const { return _x[sampleNum]; };           // @llvm_func
-    INLINE float    Y(uint32_t sampleNum) const { return _y[sampleNum]; };           // @llvm_func
-    typedef const float (&sampleArrayT)[SWR_MAX_NUM_MULTISAMPLES];                   //@llvm_typedef
-    INLINE sampleArrayT X() const { return _x; };                                    // @llvm_func
-    INLINE sampleArrayT Y() const { return _y; };                                    // @llvm_func
-    INLINE const __m128i& vXi(uint32_t sampleNum) const { return _vXi[sampleNum]; }; // @llvm_func
-    INLINE const __m128i& vYi(uint32_t sampleNum) const { return _vYi[sampleNum]; }; // @llvm_func
-    INLINE const simdscalar& vX(uint32_t sampleNum) const { return _vX[sampleNum]; }; // @llvm_func
-    INLINE const simdscalar& vY(uint32_t sampleNum) const { return _vY[sampleNum]; }; // @llvm_func
-    INLINE const __m128i& TileSampleOffsetsX() const { return tileSampleOffsetsX; };  // @llvm_func
-    INLINE const __m128i& TileSampleOffsetsY() const { return tileSampleOffsetsY; };  // @llvm_func
-
-    INLINE void PrecalcSampleData(int numSamples); //@llvm_func
-
-private:
-    template <typename MaskT>
-    INLINE __m128i expandThenBlend4(uint32_t* min, uint32_t* max); // @llvm_func
-    INLINE void    CalcTileSampleOffsets(int numSamples);          // @llvm_func
-
-    // scalar sample values
-    uint32_t _xi[SWR_MAX_NUM_MULTISAMPLES];
-    uint32_t _yi[SWR_MAX_NUM_MULTISAMPLES];
-    float    _x[SWR_MAX_NUM_MULTISAMPLES];
-    float    _y[SWR_MAX_NUM_MULTISAMPLES];
-
-    // precalc'd / vectorized samples
-    __m128i    _vXi[SWR_MAX_NUM_MULTISAMPLES];
-    __m128i    _vYi[SWR_MAX_NUM_MULTISAMPLES];
-    simdscalar _vX[SWR_MAX_NUM_MULTISAMPLES];
-    simdscalar _vY[SWR_MAX_NUM_MULTISAMPLES];
-    __m128i    tileSampleOffsetsX;
-    __m128i    tileSampleOffsetsY;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_RASTSTATE
-//////////////////////////////////////////////////////////////////////////
-struct SWR_RASTSTATE
-{
-    uint32_t cullMode : 2;
-    uint32_t fillMode : 2;
-    uint32_t frontWinding : 1;
-    uint32_t scissorEnable : 1;
-    uint32_t depthClipEnable : 1;
-    uint32_t clipEnable : 1;
-    uint32_t clipHalfZ : 1;
-    uint32_t pointParam : 1;
-    uint32_t pointSpriteEnable : 1;
-    uint32_t pointSpriteTopOrigin : 1;
-    uint32_t forcedSampleCount : 1;
-    uint32_t pixelOffset : 1;
-    uint32_t depthBiasPreAdjusted : 1; ///< depth bias constant is in float units, not per-format Z units
-    uint32_t conservativeRast : 1;
-
-    float pointSize;
-    float lineWidth;
-
-    float      depthBias;
-    float      slopeScaledDepthBias;
-    float      depthBiasClamp;
-    SWR_FORMAT depthFormat; // @llvm_enum
-
-    // sample count the rasterizer is running at
-    SWR_MULTISAMPLE_COUNT sampleCount;      // @llvm_enum
-    uint32_t              pixelLocation;    // UL or Center
-    SWR_MULTISAMPLE_POS   samplePositions;  // @llvm_struct
-    bool                  bIsCenterPattern; // @llvm_enum
-};
-
-
-enum SWR_CONSTANT_SOURCE
-{
-    SWR_CONSTANT_SOURCE_CONST_0000,
-    SWR_CONSTANT_SOURCE_CONST_0001_FLOAT,
-    SWR_CONSTANT_SOURCE_CONST_1111_FLOAT,
-    SWR_CONSTANT_SOURCE_PRIM_ID
-};
-
-struct SWR_ATTRIB_SWIZZLE
-{
-    uint16_t sourceAttrib : 5;          // source attribute
-    uint16_t constantSource : 2;        // constant source to apply
-    uint16_t componentOverrideMask : 4; // override component with constant source
-};
-
-// backend state
-struct SWR_BACKEND_STATE
-{
-    uint32_t constantInterpolationMask; // bitmask indicating which attributes have constant
-                                        // interpolation
-    uint32_t pointSpriteTexCoordMask;   // bitmask indicating the attribute(s) which should be
-                                        // interpreted as tex coordinates
-
-    bool swizzleEnable;        // when enabled, core will parse the swizzle map when
-                               // setting up attributes for the backend, otherwise
-                               // all attributes up to numAttributes will be sent
-    uint8_t numAttributes;     // total number of attributes to send to backend (up to 32)
-    uint8_t numComponents[32]; // number of components to setup per attribute, this reduces some
-                               // calculations for unneeded components
-
-    bool readRenderTargetArrayIndex; // Forward render target array index from last FE stage to the
-                                     // backend
-    bool readViewportArrayIndex;     // Read viewport array index from last FE stage during binning
-
-    // User clip/cull distance enables
-    uint8_t cullDistanceMask;
-    uint8_t clipDistanceMask;
-
-    // padding to ensure swizzleMap starts 64B offset from start of the struct
-    // and that the next fields are dword aligned.
-    uint8_t pad[10];
-
-    // Offset to the start of the attributes of the input vertices, in simdvector units
-    uint32_t vertexAttribOffset;
-
-    // Offset to clip/cull attrib section of the vertex, in simdvector units
-    uint32_t vertexClipCullOffset;
-
-    SWR_ATTRIB_SWIZZLE swizzleMap[32];
-};
-static_assert(sizeof(SWR_BACKEND_STATE) == 128,
-              "Adjust padding to keep size (or remove this assert)");
-
-
-union SWR_DEPTH_STENCIL_STATE
-{
-    struct
-    {
-        // dword 0
-        uint32_t depthWriteEnable : 1;
-        uint32_t depthTestEnable : 1;
-        uint32_t stencilWriteEnable : 1;
-        uint32_t stencilTestEnable : 1;
-        uint32_t doubleSidedStencilTestEnable : 1;
-
-        uint32_t depthTestFunc : 3;
-        uint32_t stencilTestFunc : 3;
-
-        uint32_t backfaceStencilPassDepthPassOp : 3;
-        uint32_t backfaceStencilPassDepthFailOp : 3;
-        uint32_t backfaceStencilFailOp : 3;
-        uint32_t backfaceStencilTestFunc : 3;
-        uint32_t stencilPassDepthPassOp : 3;
-        uint32_t stencilPassDepthFailOp : 3;
-        uint32_t stencilFailOp : 3;
-
-        // dword 1
-        uint8_t backfaceStencilWriteMask;
-        uint8_t backfaceStencilTestMask;
-        uint8_t stencilWriteMask;
-        uint8_t stencilTestMask;
-
-        // dword 2
-        uint8_t backfaceStencilRefValue;
-        uint8_t stencilRefValue;
-    };
-    uint32_t value[3];
-};
-
-enum SWR_SHADING_RATE
-{
-    SWR_SHADING_RATE_PIXEL,
-    SWR_SHADING_RATE_SAMPLE,
-    SWR_SHADING_RATE_COUNT,
-};
-
-enum SWR_INPUT_COVERAGE
-{
-    SWR_INPUT_COVERAGE_NONE,
-    SWR_INPUT_COVERAGE_NORMAL,
-    SWR_INPUT_COVERAGE_INNER_CONSERVATIVE,
-    SWR_INPUT_COVERAGE_COUNT,
-};
-
-enum SWR_PS_POSITION_OFFSET
-{
-    SWR_PS_POSITION_SAMPLE_NONE,
-    SWR_PS_POSITION_SAMPLE_OFFSET,
-    SWR_PS_POSITION_CENTROID_OFFSET,
-    SWR_PS_POSITION_OFFSET_COUNT,
-};
-
-enum SWR_BARYCENTRICS_MASK
-{
-    SWR_BARYCENTRIC_PER_PIXEL_MASK  = 0x1,
-    SWR_BARYCENTRIC_CENTROID_MASK   = 0x2,
-    SWR_BARYCENTRIC_PER_SAMPLE_MASK = 0x4,
-};
-
-// pixel shader state
-struct SWR_PS_STATE
-{
-    // dword 0-1
-    PFN_PIXEL_KERNEL pfnPixelShader; // @llvm_pfn
-
-    // dword 2
-    uint32_t killsPixel : 1;      // pixel shader can kill pixels
-    uint32_t inputCoverage : 2;   // ps uses input coverage
-    uint32_t writesODepth : 1;    // pixel shader writes to depth
-    uint32_t usesSourceDepth : 1; // pixel shader reads depth
-    uint32_t shadingRate : 2;     // shading per pixel / sample / coarse pixel
-    uint32_t posOffset : 2; // type of offset (none, sample, centroid) to add to pixel position
-    uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate
-                                   // attributes with
-    uint32_t usesUAV : 1;          // pixel shader accesses UAV
-    uint32_t forceEarlyZ : 1;      // force execution of early depth/stencil test
-
-    uint8_t renderTargetMask; // Mask of render targets written
-};
-
-// depth bounds state
-struct SWR_DEPTH_BOUNDS_STATE
-{
-    bool  depthBoundsTestEnable;
-    float depthBoundsTestMinValue;
-    float depthBoundsTestMaxValue;
-};
-// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/core/state_funcs.h b/src/gallium/drivers/swr/rasterizer/core/state_funcs.h
deleted file mode 100644
index 99eac835ea8..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/state_funcs.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file state.h
- *
- * @brief Definitions for API state - complex function implementation.
- *
- ******************************************************************************/
-#pragma once
-
-#include "core/state.h"
-#include "common/simdintrin.h"
-
-template <typename MaskT>
-INLINE __m128i SWR_MULTISAMPLE_POS::expandThenBlend4(uint32_t* min, uint32_t* max)
-{
-    __m128i vMin = _mm_set1_epi32(*min);
-    __m128i vMax = _mm_set1_epi32(*max);
-    return _simd_blend4_epi32<MaskT::value>(vMin, vMax);
-}
-
-INLINE void SWR_MULTISAMPLE_POS::PrecalcSampleData(int numSamples)
-{
-    for (int i = 0; i < numSamples; i++)
-    {
-        _vXi[i] = _mm_set1_epi32(_xi[i]);
-        _vYi[i] = _mm_set1_epi32(_yi[i]);
-        _vX[i]  = _simd_set1_ps(_x[i]);
-        _vY[i]  = _simd_set1_ps(_y[i]);
-    }
-    // precalculate the raster tile BB for the rasterizer.
-    CalcTileSampleOffsets(numSamples);
-}
-
-INLINE void SWR_MULTISAMPLE_POS::CalcTileSampleOffsets(int numSamples)
-{
-    auto minXi  = std::min_element(std::begin(_xi), &_xi[numSamples]);
-    auto maxXi  = std::max_element(std::begin(_xi), &_xi[numSamples]);
-    using xMask = std::integral_constant<int, 0xA>;
-    // BR(max),    BL(min),    UR(max),    UL(min)
-    tileSampleOffsetsX = expandThenBlend4<xMask>(minXi, maxXi);
-
-    auto minYi  = std::min_element(std::begin(_yi), &_yi[numSamples]);
-    auto maxYi  = std::max_element(std::begin(_yi), &_yi[numSamples]);
-    using yMask = std::integral_constant<int, 0xC>;
-    // BR(max),    BL(min),    UR(max),    UL(min)
-    tileSampleOffsetsY = expandThenBlend4<yMask>(minYi, maxYi);
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.cpp b/src/gallium/drivers/swr/rasterizer/core/tessellator.cpp
deleted file mode 100644
index 08f2bce339c..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/tessellator.cpp
+++ /dev/null
@@ -1,2689 +0,0 @@
-/*
-    Copyright (c) Microsoft Corporation
-
-    Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
-    associated documentation files (the "Software"), to deal in the Software without restriction,
-    including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-    and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
-    subject to the following conditions:
-
-    The above copyright notice and this permission notice shall be included in all copies or substantial
-    portions of the Software.
-
-    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
-    NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#include "tessellator.hpp"
-#if defined(_MSC_VER)
-#include <math.h> // ceil
-#else
-#include <cmath>
-#endif
-//#include <windows.h> // Just used for some commented out debug stat printing.
-//#include <strsafe.h> // Ditto.
-#define min(x,y) (x < y ? x : y)
-#define max(x,y) (x > y ? x : y)
-
-//=================================================================================================================================
-// Some D3D Compliant Float Math (reference rasterizer implements these in RefALU class)
-//=================================================================================================================================
-//
-//---------------------------------------------------------------------------------------------------------------------------------
-// isNaN
-//---------------------------------------------------------------------------------------------------------------------------------
-static bool tess_isNaN( float a )
-{
-    static const int exponentMask = 0x7f800000;
-    static const int mantissaMask = 0x007fffff;
-    int u = *(int*)&a;
-    return ( ( ( u & exponentMask ) == exponentMask ) && ( u & mantissaMask ) ); // NaN
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// flush (denorm)
-//---------------------------------------------------------------------------------------------------------------------------------
-static float tess_flush( float a )
-{
-    static const int minNormalizedFloat = 0x00800000;
-    static const int signBit = 0x80000000;
-    static const int signBitComplement = 0x7fffffff;
-    int b = (*(int*)&a) & signBitComplement; // fabs()
-    if( b < minNormalizedFloat ) // UINT comparison. NaN/INF do test false here
-    {
-        b = signBit & (*(int*)&a);
-        return *(float*)&b;
-    }
-    return a;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// IEEE754R min
-//---------------------------------------------------------------------------------------------------------------------------------
-static float tess_fmin( float a, float b )
-{
-    float _a = tess_flush( a );
-    float _b = tess_flush( b );
-    if( tess_isNaN( _b ) )
-    {
-        return a;
-    }
-    else if( ( _a == 0 ) && ( _b == 0 ) )
-    {
-        return ( (*(int*)&_a) & 0x80000000 ) ? a : b;
-    }
-    return _a < _b ? a : b;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// IEEE754R max
-//---------------------------------------------------------------------------------------------------------------------------------
-static float tess_fmax( float a, float b )
-{
-    float _a = tess_flush( a );
-    float _b = tess_flush( b );
-
-    if( tess_isNaN( _b ) )
-    {
-        return a;
-    }
-    else if( ( _a == 0 ) && ( _b == 0 ) )
-    {
-        return ( (*(int*)&_b) & 0x80000000 ) ? a : b;
-    }
-    return _a >= _b ? a : b;
-}
-
-//=================================================================================================================================
-// Fixed Point Math
-//=================================================================================================================================
-
-//-----------------------------------------------------------------------------------------------------------------------------
-// floatToFixedPoint
-//
-// Convert 32-bit float to 32-bit fixed point integer, using only
-// integer arithmetic + bitwise operations.
-//
-// c_uIBits:  UINT8     : Width of i (aka. integer bits)
-// c_uFBits:  UINT8     : Width of f (aka. fractional bits)
-// c_bSigned: bool      : Whether the integer bits are a 2's complement signed value
-// input:     float     : All values valid.
-// output:    INT32     : At most 24 bits from LSB are meaningful, depending
-//                        on the fixed point bit representation chosen (see
-//                        below).  Extra bits are sign extended from the most
-//                        meaningful bit.
-//
-//-----------------------------------------------------------------------------------------------------------------------------
-
-typedef unsigned char UINT8;
-typedef int INT32;
-template< const UINT8 c_uIBits, const UINT8 c_uFBits, const bool c_bSigned >
-INT32 floatToIDotF( const float& input )
-{
-    // ------------------------------------------------------------------------
-    //                                                output fixed point format
-    // 32-bit result:
-    //
-    //      [sign-extend]i.f
-    //      |              |
-    //      MSB(31)...LSB(0)
-    //
-    //      f               fractional part of the number, an unsigned
-    //                      value with _fxpFracBitCount bits (defined below)
-    //
-    //      .               implied decimal
-    //
-    //      i               integer part of the number, a 2's complement
-    //                      value with _fxpIntBitCount bits (defined below)
-    //
-    //      [sign-extend]   MSB of i conditionally replicated
-    //
-    // ------------------------------------------------------------------------
-    // Define fixed point bit counts
-    //
-
-    // Commenting out C_ASSERT below to minimise #includes:
-    // C_ASSERT( 2 <= c_uIBits && c_uIBits <= 32 && c_uFBits <= 32 && c_uIBits + c_uFBits <= 32 );
-
-    // Define most negative and most positive fixed point values
-    const INT32 c_iMinResult = (c_bSigned ? INT32( -1 ) << (c_uIBits + c_uFBits - 1) : 0);
-    const INT32 c_iMaxResult = ~c_iMinResult;
-
-    // ------------------------------------------------------------------------
-    //                                                constant float properties
-    // ------------------------------------------------------------------------
-    const UINT8 _fltMantissaBitCount = 23;
-    const UINT8 _fltExponentBitCount = 8;
-    const INT32 _fltExponentBias     = (INT32( 1 ) << (_fltExponentBitCount - 1)) - 1;
-    const INT32 _fltHiddenBit        = INT32( 1 ) << _fltMantissaBitCount;
-    const INT32 _fltMantissaMask     = _fltHiddenBit - 1;
-    const INT32 _fltExponentMask     = ((INT32( 1 ) << _fltExponentBitCount) - 1) << _fltMantissaBitCount;
-    const INT32 _fltSignBit          = INT32( 1 ) << (_fltExponentBitCount + _fltMantissaBitCount);
-
-    // ------------------------------------------------------------------------
-    //              define min and max values as floats (clamp to these bounds)
-    // ------------------------------------------------------------------------
-    INT32 _fxpMaxPosValueFloat;
-    INT32 _fxpMaxNegValueFloat;
-
-    if (c_bSigned)
-    {
-        // The maximum positive fixed point value is 2^(i-1) - 2^(-f).
-        // The following constructs the floating point bit pattern for this value,
-        // as long as i >= 2.
-        _fxpMaxPosValueFloat = (_fltExponentBias + c_uIBits - 1) <<_fltMantissaBitCount;
-        const INT32 iShift = _fltMantissaBitCount + 2 - c_uIBits - c_uFBits;
-        if (iShift >= 0)
-        {
-//            assert( iShift < 32 );
-#if defined(_MSC_VER)
-#pragma warning( suppress : 4293 )
-#endif
-            _fxpMaxPosValueFloat -= INT32( 1 ) << iShift;
-        }
-
-        // The maximum negative fixed point value is -2^(i-1).
-        // The following constructs the floating point bit pattern for this value,
-        // as long as i >= 2.
-        // We need this number without the sign bit
-        _fxpMaxNegValueFloat = (_fltExponentBias + c_uIBits - 1) << _fltMantissaBitCount;
-    }
-    else
-    {
-        // The maximum positive fixed point value is 2^(i) - 2^(-f).
-        // The following constructs the floating point bit pattern for this value,
-        // as long as i >= 2.
-        _fxpMaxPosValueFloat = (_fltExponentBias + c_uIBits) <<_fltMantissaBitCount;
-        const INT32 iShift = _fltMantissaBitCount + 1 - c_uIBits - c_uFBits;
-        if (iShift >= 0)
-        {
-//            assert( iShift < 32 );
-#if defined(_MSC_VER)
-#pragma warning( suppress : 4293 )
-#endif
-            _fxpMaxPosValueFloat -= INT32( 1 ) << iShift;
-        }
-
-        // The maximum negative fixed point value is 0.
-        _fxpMaxNegValueFloat = 0;
-    }
-
-    // ------------------------------------------------------------------------
-    //                                                float -> fixed conversion
-    // ------------------------------------------------------------------------
-
-    // ------------------------------------------------------------------------
-    //                                                      examine input float
-    // ------------------------------------------------------------------------
-    INT32 output              = *(INT32*)&input;
-    INT32 unbiasedExponent    = ((output & _fltExponentMask) >> _fltMantissaBitCount) - _fltExponentBias;
-    INT32 isNegative          = output & _fltSignBit;
-
-    // ------------------------------------------------------------------------
-    //                                                                      nan
-    // ------------------------------------------------------------------------
-    if (unbiasedExponent == (_fltExponentBias + 1) && (output & _fltMantissaMask))
-    {
-        // nan converts to 0
-        output = 0;
-    }
-    // ------------------------------------------------------------------------
-    //                                                       too large positive
-    // ------------------------------------------------------------------------
-    else if (!isNegative && output >= _fxpMaxPosValueFloat) // integer compare
-    {
-        output = c_iMaxResult;
-    }
-    // ------------------------------------------------------------------------
-    //                                                       too large negative
-    // ------------------------------------------------------------------------
-                                            // integer compare
-    else if (isNegative && (output & ~_fltSignBit) >= _fxpMaxNegValueFloat)
-    {
-        output = c_iMinResult;
-    }
-    // ------------------------------------------------------------------------
-    //                                                                too small
-    // ------------------------------------------------------------------------
-    else if (unbiasedExponent < -c_uFBits - 1)
-    {
-        // clamp to 0
-        output = 0;
-    }
-    // ------------------------------------------------------------------------
-    //                                                             within range
-    // ------------------------------------------------------------------------
-    else
-    {
-        // copy mantissa, add hidden bit
-        output = (output & _fltMantissaMask) | _fltHiddenBit;
-
-        INT32 extraBits = _fltMantissaBitCount - c_uFBits - unbiasedExponent;
-        if (extraBits >= 0)
-        {
-            // 2's complement if negative
-            if (isNegative)
-            {
-                output = ~output + 1;
-            }
-
-            // From the range checks that led here, it is known that
-            // unbiasedExponent < c_uIBits.  So, at most:
-            // (a) unbiasedExponent == c_uIBits - 1.
-            //
-            // From compile validation above, it is known that
-            // c_uIBits + c_uFBits <= _fltMantissaBitCount + 1).
-            // So, at minimum:
-            // (b) _fltMantissaBitCount == _fxtIntBitCount + c_uFBits - 1
-            //
-            // Substituting (a) and (b) into extraBits calculation above:
-            // extraBits >= (_fxtIntBitCount + c_uFBits - 1)
-            //              - c_uFBits - (c_uIBits - 1)
-            // extraBits >= 0
-            //
-            // Thus we only have to worry about shifting right by 0 or more
-            // bits to get the decimal to the right place, and never have
-            // to shift left.
-
-            INT32 LSB             = 1 << extraBits; // last bit being kept
-            INT32 extraBitsMask   = LSB - 1;
-            INT32 half            = LSB >> 1; // round bias
-
-            // round to nearest-even at LSB
-            if ((output & LSB) || (output & extraBitsMask) > half)
-            {
-                output += half;
-            }
-
-            // shift off the extra bits (sign extending)
-            output >>= extraBits;
-        }
-        else
-        {
-            output <<= -extraBits;
-
-            // 2's complement if negative
-            if (isNegative)
-            {
-                output = ~output + 1;
-            }
-        }
-    }
-    return output;
-}
-//-----------------------------------------------------------------------------------------------------------------------------
-
-#define FXP_INTEGER_BITS 15
-#define FXP_FRACTION_BITS 16
-#define FXP_FRACTION_MASK 0x0000ffff
-#define FXP_INTEGER_MASK 0x7fff0000
-#define FXP_THREE (3<<FXP_FRACTION_BITS)
-#define FXP_ONE (1<<FXP_FRACTION_BITS)
-#define FXP_ONE_THIRD 0x00005555
-#define FXP_TWO_THIRDS 0x0000aaaa
-#define FXP_ONE_HALF   0x00008000
-
-#define FXP_MAX_INPUT_TESS_FACTOR_BEFORE_TRIPLE_AVERAGE 0x55540000 // 1/3 of max fixed point number - 1.  Numbers less than
-                                                    // or equal to this allows avg. reduction on a tri patch
-                                                    // including rounding.
-
-#define FXP_MAX_INPUT_TESS_FACTOR_BEFORE_PAIR_AVERAGE 0x7FFF0000 // 1/2 of max fixed point number - 1.  Numbers less than
-                                                    // or equal to this allows avg. reduction on a quad patch
-                                                    // including rounding.
-
-// Lookup table of reciprocals in fixed point: entry n holds 1/n scaled by 0x10000
-// (values are 65536/n rounded to nearest, e.g. 1/3 -> 0x5555, 1/6 -> 0x2aab).
-// Indexed directly by n, so it has D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR+1 entries;
-// entry 0 is a placeholder and must not be used.
-static const FXP s_fixedReciprocal[D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR+1] =
-{
-    0xffffffff, // 1/0 is the first entry (unused)
-    0x10000, 0x8000, 0x5555, 0x4000,
-    0x3333, 0x2aab, 0x2492, 0x2000,
-    0x1c72, 0x199a, 0x1746, 0x1555,
-    0x13b1, 0x1249, 0x1111, 0x1000,
-    0xf0f, 0xe39, 0xd79, 0xccd,
-    0xc31, 0xba3, 0xb21, 0xaab,
-    0xa3d, 0x9d9, 0x97b, 0x925,
-    0x8d4, 0x889, 0x842, 0x800,
-    0x7c2, 0x788, 0x750, 0x71c,
-    0x6eb, 0x6bd, 0x690, 0x666,
-    0x63e, 0x618, 0x5f4, 0x5d1,
-    0x5b0, 0x591, 0x572, 0x555,
-    0x539, 0x51f, 0x505, 0x4ec,
-    0x4d5, 0x4be, 0x4a8, 0x492,
-    0x47e, 0x46a, 0x457, 0x444,
-    0x432, 0x421, 0x410, 0x400, // 1/64 is the last entry
-};
-
-// Floating-point counterparts of FXP_THREE / FXP_ONE, for comparisons done before
-// the tess factors are converted to fixed point.
-#define FLOAT_THREE 3.0f
-#define FLOAT_ONE 1.0f
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// floatToFixed
-// Converts a float to the tessellator's fixed-point format by delegating to floatToIDotF
-// with FXP_INTEGER_BITS integer bits, FXP_FRACTION_BITS fraction bits, unsigned.
-//---------------------------------------------------------------------------------------------------------------------------------
-FXP floatToFixed(const float& input)
-{
-    const FXP fixedValue =
-        floatToIDotF< FXP_INTEGER_BITS, FXP_FRACTION_BITS, /*bSigned*/false >( input );
-    return fixedValue;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// fixedToFloat
-// Converts a fixed-point value back to float: integer part plus fraction / 2^FXP_FRACTION_BITS.
-//---------------------------------------------------------------------------------------------------------------------------------
-float fixedToFloat(const FXP& input)
-{
-    // Denorm flushing of the float operations (the DX spec behavior for div) is deliberately ignored:
-    // the numbers handled during tessellation never get that small.
-    const float wholePart    = (float)(input>>FXP_FRACTION_BITS);
-    const float fractionPart = (float)(input&FXP_FRACTION_MASK)/(1<<FXP_FRACTION_BITS);
-    return (wholePart + fractionPart);
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// isEven
-// Truncates the float to int and reports whether the lowest bit of that integer is clear.
-//---------------------------------------------------------------------------------------------------------------------------------
-bool isEven(const float& input)
-{
-    const int truncated = (int)input;
-    return (truncated & 1) == 0;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// fxpCeil
-// Rounds a fixed-point value up to the next whole number; values with no fraction bits are returned unchanged.
-//---------------------------------------------------------------------------------------------------------------------------------
-FXP fxpCeil(const FXP& input)
-{
-    // Already integral: nothing to round.
-    if( 0 == (input & FXP_FRACTION_MASK) )
-    {
-        return input;
-    }
-    // Drop the fraction, then step up by one.
-    return (input & FXP_INTEGER_MASK) + FXP_ONE;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// fxpFloor
-// Rounds a fixed-point value down by masking away everything except the integer bits.
-//---------------------------------------------------------------------------------------------------------------------------------
-FXP fxpFloor(const FXP& input)
-{
-    const FXP integerPart = (input & FXP_INTEGER_MASK);
-    return integerPart;
-}
-
-//=================================================================================================================================
-// CHWTessellator
-//=================================================================================================================================
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::CHWTessellator
-// Starts with no point/index storage (Init allocates it on first use), empty counts,
-// and index patching switched off.
-//---------------------------------------------------------------------------------------------------------------------------------
-CHWTessellator::CHWTessellator()
-{
-    // Buffers are created lazily by Init().
-    m_Point = 0;
-    m_Index = 0;
-    m_NumPoints = m_NumIndices = 0;
-    m_bUsingPatchedIndices = m_bUsingPatchedIndices2 = false;
-#ifdef ALLOW_XBOX_360_COMPARISON
-    m_bXBox360Mode = false;
-#endif
-}
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::~CHWTessellator
-// Releases the point and index arrays allocated by Init (deleting a null pointer is a no-op,
-// so this is safe even if Init was never called).
-//---------------------------------------------------------------------------------------------------------------------------------
-CHWTessellator::~CHWTessellator()
-{
-    delete[] m_Point;
-    delete[] m_Index;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::Init
-// User calls this.
-//
-// Allocates the point and index buffers on first use, records the partitioning mode and output
-// primitive, derives the tessellation parity from the partitioning, and resets the point/index counts.
-//
-//   partitioning    - partitioning mode; FRACTIONAL_ODD / FRACTIONAL_EVEN select the matching parity.
-//   outputPrimitive - primitive type the Tessellate* entry points will emit.
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::Init(
-    D3D11_TESSELLATOR_PARTITIONING       partitioning,
-    D3D11_TESSELLATOR_OUTPUT_PRIMITIVE   outputPrimitive)
-{
-    if( 0 == m_Point )
-    {
-        m_Point = new DOMAIN_POINT[MAX_POINT_COUNT];
-    }
-    if( 0 == m_Index )
-    {
-        m_Index = new int[MAX_INDEX_COUNT];
-    }
-    m_partitioning = partitioning;
-    m_originalPartitioning = partitioning;
-    switch( partitioning )
-    {
-    case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD:
-        m_parity = TESSELLATOR_PARITY_ODD;
-        break;
-    case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN:
-        m_parity = TESSELLATOR_PARITY_EVEN;
-        break;
-    case D3D11_TESSELLATOR_PARTITIONING_INTEGER:
-    default:
-        // Integer-style partitioning picks a parity per edge/axis later on, but m_parity must still
-        // hold a defined value here: it is copied into m_originalParity below, and the constructor
-        // does not initialize it.
-        m_parity = TESSELLATOR_PARITY_EVEN;
-        break;
-    }
-    m_originalParity = m_parity;
-    m_outputPrimitive = outputPrimitive;
-    m_NumPoints = 0;
-    m_NumIndices = 0;
-}
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::TessellateQuadDomain
-// User calls this
-//
-// Tessellates one quad patch: processes the six tess factors, then either emits nothing (culled patch),
-// the minimal 4-point quad, or the full set of generated points plus connectivity for the
-// output primitive chosen in Init.
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::TessellateQuadDomain( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Ueq1, float tessFactor_Veq1,
-                                         float insideTessFactor_U, float insideTessFactor_V )
-{
-    PROCESSED_TESS_FACTORS_QUAD factors;
-    QuadProcessTessFactors(tessFactor_Ueq0,tessFactor_Veq0,tessFactor_Ueq1,tessFactor_Veq1,
-                           insideTessFactor_U,insideTessFactor_V,factors);
-
-    // Culled patch: no output at all.
-    if( factors.bPatchCulled )
-    {
-        m_NumPoints = 0;
-        m_NumIndices = 0;
-        return;
-    }
-
-    // Minimum tessellation: just the four corners of the domain.
-    if( factors.bJustDoMinimumTessFactor )
-    {
-        DefinePoint(/*U*/0,      /*V*/0,      /*pointStorageOffset*/0);
-        DefinePoint(/*U*/FXP_ONE,/*V*/0,      /*pointStorageOffset*/1);
-        DefinePoint(/*U*/FXP_ONE,/*V*/FXP_ONE,/*pointStorageOffset*/2);
-        DefinePoint(/*U*/0,      /*V*/FXP_ONE,/*pointStorageOffset*/3);
-        m_NumPoints = 4;
-
-        if( (D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW == m_outputPrimitive) ||
-            (D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CCW == m_outputPrimitive) )
-        {
-            // DefineClockwiseTriangle re-orients to CCW itself when needed
-            DefineClockwiseTriangle(0,1,3,/*indexStorageOffset*/0);
-            DefineClockwiseTriangle(1,2,3,/*indexStorageOffset*/3);
-            m_NumIndices = 6;
-        }
-        else if( D3D11_TESSELLATOR_OUTPUT_POINT == m_outputPrimitive )
-        {
-            DumpAllPoints();
-        }
-        else if( D3D11_TESSELLATOR_OUTPUT_LINE == m_outputPrimitive )
-        {
-            DumpAllPointsAsInOrderLineList();
-        }
-        return;
-    }
-
-    // General case: points first, then whatever indexing the output primitive needs.
-    QuadGeneratePoints(factors);
-
-    switch( m_outputPrimitive )
-    {
-    case D3D11_TESSELLATOR_OUTPUT_POINT:
-        DumpAllPoints();
-        break;
-    case D3D11_TESSELLATOR_OUTPUT_LINE:
-        DumpAllPointsAsInOrderLineList();
-        break;
-    default:
-        QuadGenerateConnectivity(factors); // can be done in parallel to QuadGeneratePoints()
-        break;
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::QuadProcessTessFactors
-//
-// Validates and clamps the four edge and two inside tess factors of a quad patch, converts them to
-// fixed point and fills processedTessFactors with per-edge / per-axis parity, TessFactor context and
-// point counts.
-//
-// Outcomes:
-//   - any edge factor <= 0 or NaN: only bPatchCulled (= true) is written, then the function returns.
-//   - all six fixed-point factors equal FXP_ONE (integer or odd partitioning only):
-//     bJustDoMinimumTessFactor is set and the function returns before computing contexts/counts.
-//   - otherwise: contexts, point counts and insideEdgePointBaseOffset are filled in, and m_NumPoints
-//     ends up as (sum of edge point counts - 4) + (numU - 2) * (numV - 2).
-// m_NumPoints and m_NumIndices are reset to 0 on every non-culled call.
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::QuadProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Ueq1, float tessFactor_Veq1,
-                      float insideTessFactor_U, float insideTessFactor_V, PROCESSED_TESS_FACTORS_QUAD& processedTessFactors )
-{
-    // Is the patch culled?
-    if( !(tessFactor_Ueq0 > 0) || // written as !(x > 0) so that NaN (for which x > 0 is false) culls too
-        !(tessFactor_Veq0 > 0) ||
-        !(tessFactor_Ueq1 > 0) ||
-        !(tessFactor_Veq1 > 0) )
-    {
-        processedTessFactors.bPatchCulled = true;
-        return;
-    }
-    else
-    {
-        processedTessFactors.bPatchCulled = false;
-    }
-
-    // Clamp edge TessFactors
-    // Bounds stay at 0.0 if m_originalPartitioning matches none of the cases below (no default label).
-    float lowerBound = 0.0, upperBound = 0.0;
-    switch(m_originalPartitioning)
-    {
-        case D3D11_TESSELLATOR_PARTITIONING_INTEGER:
-        case D3D11_TESSELLATOR_PARTITIONING_POW2: // don't care about pow2 distinction for validation, just treat as integer
-            lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR;
-            upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR;
-            break;
-
-        case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN:
-            lowerBound = D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR;
-            upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR;
-            break;
-
-        case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD:
-            lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR;
-            upperBound = D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR;
-            break;
-    }
-
-    tessFactor_Ueq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Ueq0 ) );
-    tessFactor_Veq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Veq0 ) );
-    tessFactor_Ueq1 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Ueq1 ) );
-    tessFactor_Veq1 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Veq1 ) );
-
-    if( HWIntegerPartitioning()) // pow2 or integer, round to next int (hw doesn't care about pow2 distinction)
-    {
-        tessFactor_Ueq0 = ceil(tessFactor_Ueq0);
-        tessFactor_Veq0 = ceil(tessFactor_Veq0);
-        tessFactor_Ueq1 = ceil(tessFactor_Ueq1);
-        tessFactor_Veq1 = ceil(tessFactor_Veq1);
-    }
-
-    // Clamp inside TessFactors
-    if(D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD == m_originalPartitioning)
-    {
-// NOTE(review): these macros are defined mid-function and are not #undef'd within it,
-// so they remain visible to the code that follows in this file.
-#define EPSILON 0.0000152587890625f // 2^(-16), min positive fixed point fraction
-#define MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON (D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON/2)
-        // If any TessFactor will end up > 1 after floatToFixed conversion later,
-        // then force the inside TessFactors to be > 1 so there is a picture frame.
-        if( (tessFactor_Ueq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ||
-            (tessFactor_Veq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ||
-            (tessFactor_Ueq1 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ||
-            (tessFactor_Veq1 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ||
-            (insideTessFactor_U > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ||
-            (insideTessFactor_V > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) )
-        {
-            // Force picture frame
-            // (raises the lower clamp used for the inside factors just below; edge factors were already clamped)
-            lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON;
-        }
-    }
-
-    insideTessFactor_U = tess_fmin( upperBound, tess_fmax( lowerBound, insideTessFactor_U ) );
-    insideTessFactor_V = tess_fmin( upperBound, tess_fmax( lowerBound, insideTessFactor_V ) );
-    // Note the above clamps map NaN to lowerBound
-
-
-    if( HWIntegerPartitioning()) // pow2 or integer, round to next int (hw doesn't care about pow2 distinction)
-    {
-        insideTessFactor_U = ceil(insideTessFactor_U);
-        insideTessFactor_V = ceil(insideTessFactor_V);
-    }
-
-    // Reset our vertex and index buffers.  We have enough storage for the max tessFactor.
-    m_NumPoints = 0;
-    m_NumIndices = 0;
-
-    // Process tessFactors
-    float outsideTessFactor[QUAD_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, tessFactor_Ueq1, tessFactor_Veq1};
-    float insideTessFactor[QUAD_AXES] = {insideTessFactor_U,insideTessFactor_V};
-    int edge, axis;
-    if( HWIntegerPartitioning() )
-    {
-        // Integer partitioning: parity follows each (already ceil'ed) factor individually.
-        for( edge = 0; edge < QUAD_EDGES; edge++ )
-        {
-            int edgeEven = isEven(outsideTessFactor[edge]);
-            processedTessFactors.outsideTessFactorParity[edge] = edgeEven ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
-        }
-        for( axis = 0; axis < QUAD_AXES; axis++ )
-        {
-            // An inside factor of exactly 1 is treated as even parity here.
-            processedTessFactors.insideTessFactorParity[axis] =
-                (isEven(insideTessFactor[axis]) || (FLOAT_ONE == insideTessFactor[axis]) )
-                ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
-        }
-    }
-    else
-    {
-        // Fractional partitioning: every edge and axis uses the parity chosen in Init.
-        for( edge = 0; edge < QUAD_EDGES; edge++ )
-        {
-            processedTessFactors.outsideTessFactorParity[edge] = m_originalParity;
-        }
-        processedTessFactors.insideTessFactorParity[U] = processedTessFactors.insideTessFactorParity[V] = m_originalParity;
-    }
-
-    // Save fixed point TessFactors
-    for( edge = 0; edge < QUAD_EDGES; edge++ )
-    {
-        processedTessFactors.outsideTessFactor[edge] = floatToFixed(outsideTessFactor[edge]);
-    }
-    for( axis = 0; axis < QUAD_AXES; axis++ )
-    {
-        processedTessFactors.insideTessFactor[axis] = floatToFixed(insideTessFactor[axis]);
-    }
-
-    if( HWIntegerPartitioning() || Odd() )
-    {
-        // Special case if all TessFactors are 1
-        if( (FXP_ONE == processedTessFactors.insideTessFactor[U]) &&
-            (FXP_ONE == processedTessFactors.insideTessFactor[V]) &&
-            (FXP_ONE == processedTessFactors.outsideTessFactor[Ueq0]) &&
-            (FXP_ONE == processedTessFactors.outsideTessFactor[Veq0]) &&
-            (FXP_ONE == processedTessFactors.outsideTessFactor[Ueq1]) &&
-            (FXP_ONE == processedTessFactors.outsideTessFactor[Veq1]) )
-        {
-            processedTessFactors.bJustDoMinimumTessFactor = true;
-            return;
-        }
-    }
-    processedTessFactors.bJustDoMinimumTessFactor = false;
-
-    // Compute TessFactor-specific metadata
-    // (the loop variables below shadow the function-level 'edge' / 'axis' declared above)
-    for(int edge = 0; edge < QUAD_EDGES; edge++ )
-    {
-        SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]);
-        ComputeTessFactorContext(processedTessFactors.outsideTessFactor[edge], processedTessFactors.outsideTessFactorCtx[edge]);
-    }
-
-    for(int axis = 0; axis < QUAD_AXES; axis++)
-    {
-        SetTessellationParity(processedTessFactors.insideTessFactorParity[axis]);
-        ComputeTessFactorContext(processedTessFactors.insideTessFactor[axis], processedTessFactors.insideTessFactorCtx[axis]);
-    }
-
-    // Compute some initial data.
-
-    // outside edge offsets and storage
-    for(int edge = 0; edge < QUAD_EDGES; edge++ )
-    {
-        SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]);
-        processedTessFactors.numPointsForOutsideEdge[edge] = NumPointsForTessFactor(processedTessFactors.outsideTessFactor[edge]);
-        m_NumPoints += processedTessFactors.numPointsForOutsideEdge[edge];
-    }
-    // Each of the 4 corners was counted by both edges that meet there.
-    m_NumPoints -= 4;
-
-    // inside edge offsets
-    for(int axis = 0; axis < QUAD_AXES; axis++)
-    {
-        SetTessellationParity(processedTessFactors.insideTessFactorParity[axis]);
-        processedTessFactors.numPointsForInsideTessFactor[axis] = NumPointsForTessFactor(processedTessFactors.insideTessFactor[axis]);
-        int pointCountMin = ( TESSELLATOR_PARITY_ODD == processedTessFactors.insideTessFactorParity[axis] ) ? 4 : 3;
-        // max() allows degenerate transition regions when inside TessFactor == 1
-        processedTessFactors.numPointsForInsideTessFactor[axis] = max(pointCountMin,processedTessFactors.numPointsForInsideTessFactor[axis]);
-    }
-
-    // Interior points are stored right after the outer-ring points.
-    processedTessFactors.insideEdgePointBaseOffset = m_NumPoints;
-
-    // inside storage, including interior edges above
-    int numInteriorPoints = (processedTessFactors.numPointsForInsideTessFactor[U] - 2)*(processedTessFactors.numPointsForInsideTessFactor[V]-2);
-    m_NumPoints += numInteriorPoints;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::QuadGeneratePoints
-//
-// Writes the domain points of a quad patch through DefinePoint, at consecutive storage offsets
-// starting from 0, in three stages:
-//   1. the exterior ring, edge by edge (each edge omits its last point, which the next edge emits first);
-//   2. the interior rings, from ring 1 inwards;
-//   3. for even inside parity, the degenerate innermost "ring", which is a single row of points
-//      at U or V == FXP_ONE_HALF.
-// Reads only processedTessFactors; the parity state is switched with SetTessellationParity before
-// each PlacePointIn1D call sequence.
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::QuadGeneratePoints( const PROCESSED_TESS_FACTORS_QUAD& processedTessFactors )
-{
-    // Generate exterior ring edge points, clockwise from top-left
-    int pointOffset = 0;
-    int edge;
-    for(edge = 0; edge < QUAD_EDGES; edge++ )
-    {
-        // Odd edges (1, 3) vary U with V fixed; even edges (0, 2) vary V with U fixed.
-        int parity = edge&0x1;
-        int startPoint = 0;
-        int endPoint = processedTessFactors.numPointsForOutsideEdge[edge] - 1;
-        for(int p = startPoint; p < endPoint; p++,pointOffset++) // don't include end, since next edge starts with it.
-        {
-            FXP fxpParam;
-            int q = ((edge==1)||(edge==2)) ? p : endPoint - p; // reverse order
-            SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]);
-            PlacePointIn1D(processedTessFactors.outsideTessFactorCtx[edge],q,fxpParam);
-            if( parity )
-            {
-                DefinePoint(/*U*/fxpParam,
-                            /*V*/(edge == 3) ? FXP_ONE : 0,
-                            /*pointStorageOffset*/pointOffset);
-            }
-            else
-            {
-                DefinePoint(/*U*/(edge == 2) ? FXP_ONE : 0,
-                            /*V*/fxpParam,
-                            /*pointStorageOffset*/pointOffset);
-            }
-        }
-    }
-
-    // Generate interior ring points, clockwise from (U==0,V==1) (bottom-left) spiralling toward center
-    static const int startRing = 1;
-    int minNumPointsForTessFactor = min(processedTessFactors.numPointsForInsideTessFactor[U],processedTessFactors.numPointsForInsideTessFactor[V]);
-    int numRings = (minNumPointsForTessFactor >> 1);  // note for even tess we aren't counting center point here.
-    for(int ring = startRing; ring < numRings; ring++)
-    {
-        // Each ring spans point indices [ring, numPoints - 1 - ring] along both axes.
-        int startPoint = ring;
-        int endPoint[QUAD_AXES] = {processedTessFactors.numPointsForInsideTessFactor[U] - 1 - startPoint,
-                                   processedTessFactors.numPointsForInsideTessFactor[V] - 1 - startPoint};
-
-        for(edge = 0; edge < QUAD_EDGES; edge++ )
-        {
-            // parity[0] indexes the axis held constant along this edge, parity[1] the axis being walked.
-            int parity[QUAD_AXES] = {edge&0x1,((edge+1)&0x1)};
-            int perpendicularAxisPoint = (edge < 2) ? startPoint : endPoint[parity[0]];
-            FXP fxpPerpParam;
-            SetTessellationParity(processedTessFactors.insideTessFactorParity[parity[0]]);
-            PlacePointIn1D(processedTessFactors.insideTessFactorCtx[parity[0]],perpendicularAxisPoint,fxpPerpParam);
-            SetTessellationParity(processedTessFactors.insideTessFactorParity[parity[1]]);
-            for(int p = startPoint; p < endPoint[parity[1]]; p++, pointOffset++) // don't include end: next edge starts with it.
-            {
-                FXP fxpParam;
-                int q = ((edge == 1)||(edge==2)) ? p : endPoint[parity[1]] - (p - startPoint);
-                PlacePointIn1D(processedTessFactors.insideTessFactorCtx[parity[1]],q,fxpParam);
-                if( parity[1] )
-                {
-                    DefinePoint(/*U*/fxpPerpParam,
-                                /*V*/fxpParam,
-                                /*pointStorageOffset*/pointOffset);
-                }
-                else
-                {
-                    DefinePoint(/*U*/fxpParam,
-                                /*V*/fxpPerpParam,
-                                /*pointStorageOffset*/pointOffset);
-                }
-            }
-        }
-    }
-    // For even tessellation, the inner "ring" is degenerate - a row of points
-    if( (processedTessFactors.numPointsForInsideTessFactor[U] > processedTessFactors.numPointsForInsideTessFactor[V]) &&
-        (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[V]) )
-    {
-        // U is the longer axis: emit the center row along U, in increasing order, at V == 1/2.
-        int startPoint = numRings;
-        int endPoint = processedTessFactors.numPointsForInsideTessFactor[U] - 1 - startPoint;
-        SetTessellationParity(processedTessFactors.insideTessFactorParity[U]);
-        for( int p = startPoint; p <= endPoint; p++, pointOffset++ )
-        {
-            FXP fxpParam;
-            PlacePointIn1D(processedTessFactors.insideTessFactorCtx[U],p,fxpParam);
-            DefinePoint(/*U*/fxpParam,
-                        /*V*/FXP_ONE_HALF, // middle
-                        /*pointStorageOffset*/pointOffset);
-        }
-    }
-    else if( (processedTessFactors.numPointsForInsideTessFactor[V] >= processedTessFactors.numPointsForInsideTessFactor[U]) &&
-             (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[U]) )
-    {
-        // V is at least as long as U: emit the center column along V, in decreasing order, at U == 1/2.
-        int startPoint = numRings;
-        int endPoint;
-        FXP fxpParam;
-        endPoint = processedTessFactors.numPointsForInsideTessFactor[V] - 1 - startPoint;
-        SetTessellationParity(processedTessFactors.insideTessFactorParity[V]);
-        for( int p = endPoint; p >= startPoint; p--, pointOffset++ )
-        {
-            PlacePointIn1D(processedTessFactors.insideTessFactorCtx[V],p,fxpParam);
-            DefinePoint(/*U*/FXP_ONE_HALF, // middle
-                        /*V*/fxpParam,
-                        /*pointStorageOffset*/pointOffset);
-        }
-    }
-}
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::QuadGenerateConnectivity
-//
-// Emits the triangle indices for a quad patch. Works ring by ring from the outside in, one edge at a
-// time: the first ring is stitched with StitchTransition (outside edge factors against inside factors),
-// inner rings with StitchRegular. m_NumIndices advances by 3 per triangle. Afterwards, when the shorter
-// inside axis has odd parity, the remaining center strip of quads is triangulated (6 indices per quad).
-//
-// The m_IndexPatchContext / m_IndexPatchContext2 members are set up before a stitch call whenever the
-// point indices of the two rows being stitched are not simply sequential; patching is switched off
-// again right after each stitch.
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::QuadGenerateConnectivity( const PROCESSED_TESS_FACTORS_QUAD& processedTessFactors )
-{
-    // Generate primitives for all the concentric rings, one side at a time for each ring
-    static const int startRing = 1;
-    int numPointRowsToCenter[QUAD_AXES] = {((processedTessFactors.numPointsForInsideTessFactor[U]+1) >> 1),
-                                            ((processedTessFactors.numPointsForInsideTessFactor[V]+1) >> 1)}; // +1 is so even tess includes the center point
-    int numRings = min(numPointRowsToCenter[U],numPointRowsToCenter[V]);
-    int degeneratePointRing[QUAD_AXES] = { // Even partitioning causes degenerate row of points,
-                                           // which results in exceptions to the point ordering conventions
-                                           // when travelling around the rings counterclockwise.
-        (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[V]) ? numPointRowsToCenter[V] - 1 : -1,
-        (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[U]) ? numPointRowsToCenter[U] - 1 : -1 };
-
-    // Local, mutable copies of the "outside" row description: after each ring they are replaced
-    // by the inside values so that the next ring stitches against the ring just finished.
-    const TESS_FACTOR_CONTEXT* outsideTessFactorCtx[QUAD_EDGES] = {&processedTessFactors.outsideTessFactorCtx[Ueq0],
-                                                    &processedTessFactors.outsideTessFactorCtx[Veq0],
-                                                    &processedTessFactors.outsideTessFactorCtx[Ueq1],
-                                                    &processedTessFactors.outsideTessFactorCtx[Veq1]};
-    TESSELLATOR_PARITY outsideTessFactorParity[QUAD_EDGES] = {processedTessFactors.outsideTessFactorParity[Ueq0],
-                                                        processedTessFactors.outsideTessFactorParity[Veq0],
-                                                        processedTessFactors.outsideTessFactorParity[Ueq1],
-                                                        processedTessFactors.outsideTessFactorParity[Veq1]};
-    int numPointsForOutsideEdge[QUAD_EDGES] = {processedTessFactors.numPointsForOutsideEdge[Ueq0],
-                                              processedTessFactors.numPointsForOutsideEdge[Veq0],
-                                              processedTessFactors.numPointsForOutsideEdge[Ueq1],
-                                              processedTessFactors.numPointsForOutsideEdge[Veq1]};
-
-    int insideEdgePointBaseOffset = processedTessFactors.insideEdgePointBaseOffset;
-    int outsideEdgePointBaseOffset = 0;
-    int edge;
-    for(int ring = startRing; ring < numRings; ring++)
-    {
-        int numPointsForInsideEdge[QUAD_AXES] = {processedTessFactors.numPointsForInsideTessFactor[U] - 2*ring,
-                                                 processedTessFactors.numPointsForInsideTessFactor[V] - 2*ring};
-
-        // Remember where this ring started; the last edge wraps back to these points.
-        int edge0InsidePointBaseOffset = insideEdgePointBaseOffset;
-        int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset;
-
-        for(edge = 0; edge < QUAD_EDGES; edge++ )
-        {
-            // Axis walked along this edge: V for edges 0 and 2, U for edges 1 and 3.
-            int parity = (edge+1)&0x1;
-
-            int numTriangles = numPointsForInsideEdge[parity] + numPointsForOutsideEdge[edge] - 2;
-            int insideBaseOffset;
-            int outsideBaseOffset;
-            if( edge == 3 ) // We need to patch the indexing so Stitch() can think it sees
-                            // 2 sequentially increasing rows of points, even though we have wrapped around
-                            // to the end of the inner and outer ring's points, so the last point is really
-                            // the first point for the ring.
-                            // We make it so that when Stitch() calls AddIndex(), that function
-                            // will do any necessary index adjustment.
-            {
-                if( ring == degeneratePointRing[parity] )
-                {
-                    m_IndexPatchContext2.baseIndexToInvert = insideEdgePointBaseOffset + 1;
-                    m_IndexPatchContext2.cornerCaseBadValue = outsideEdgePointBaseOffset + numPointsForOutsideEdge[edge] - 1;
-                    m_IndexPatchContext2.cornerCaseReplacementValue = edge0OutsidePointBaseOffset;
-                    m_IndexPatchContext2.indexInversionEndPoint = (m_IndexPatchContext2.baseIndexToInvert << 1) - 1;
-                    insideBaseOffset = m_IndexPatchContext2.baseIndexToInvert;
-                    outsideBaseOffset = outsideEdgePointBaseOffset;
-                    SetUsingPatchedIndices2(true);
-                }
-                else
-                {
-                    m_IndexPatchContext.insidePointIndexDeltaToRealValue    = insideEdgePointBaseOffset;
-                    m_IndexPatchContext.insidePointIndexBadValue            = numPointsForInsideEdge[parity] - 1;
-                    m_IndexPatchContext.insidePointIndexReplacementValue    = edge0InsidePointBaseOffset;
-                    m_IndexPatchContext.outsidePointIndexPatchBase          = m_IndexPatchContext.insidePointIndexBadValue+1; // past inside patched index range
-                    m_IndexPatchContext.outsidePointIndexDeltaToRealValue   = outsideEdgePointBaseOffset
-                                                                                - m_IndexPatchContext.outsidePointIndexPatchBase;
-                    m_IndexPatchContext.outsidePointIndexBadValue           = m_IndexPatchContext.outsidePointIndexPatchBase
-                                                                                + numPointsForOutsideEdge[edge] - 1;
-                    m_IndexPatchContext.outsidePointIndexReplacementValue   = edge0OutsidePointBaseOffset;
-
-                    insideBaseOffset = 0;
-                    outsideBaseOffset = m_IndexPatchContext.outsidePointIndexPatchBase;
-                    SetUsingPatchedIndices(true);
-                }
-            }
-            else if( (edge == 2) && (ring == degeneratePointRing[parity]) )
-            {
-                m_IndexPatchContext2.baseIndexToInvert = insideEdgePointBaseOffset;
-                m_IndexPatchContext2.cornerCaseBadValue = -1; // unused
-                m_IndexPatchContext2.cornerCaseReplacementValue = -1; // unused
-                m_IndexPatchContext2.indexInversionEndPoint = m_IndexPatchContext2.baseIndexToInvert << 1;
-                insideBaseOffset = m_IndexPatchContext2.baseIndexToInvert;
-                outsideBaseOffset = outsideEdgePointBaseOffset;
-                SetUsingPatchedIndices2(true);
-            }
-            else
-            {
-                insideBaseOffset = insideEdgePointBaseOffset;
-                outsideBaseOffset = outsideEdgePointBaseOffset;
-            }
-            if( ring == startRing )
-            {
-                // Outermost ring: edge and inside factors may differ, so use the transition stitch.
-                StitchTransition(/*baseIndexOffset: */m_NumIndices,
-                               insideBaseOffset,processedTessFactors.insideTessFactorCtx[parity].numHalfTessFactorPoints,processedTessFactors.insideTessFactorParity[parity],
-                               outsideBaseOffset,outsideTessFactorCtx[edge]->numHalfTessFactorPoints,outsideTessFactorParity[edge]);
-            }
-            else
-            {
-                StitchRegular(/*bTrapezoid*/true, DIAGONALS_MIRRORED,
-                              /*baseIndexOffset: */m_NumIndices,
-                              numPointsForInsideEdge[parity],
-                              insideBaseOffset,outsideBaseOffset);
-            }
-            // Index patching applies to the stitch call above only.
-            SetUsingPatchedIndices(false);
-            SetUsingPatchedIndices2(false);
-            m_NumIndices += numTriangles*3;
-            outsideEdgePointBaseOffset += numPointsForOutsideEdge[edge] - 1;
-            if( (edge == 2) && (ring == degeneratePointRing[parity]) )
-            {
-                insideEdgePointBaseOffset -= numPointsForInsideEdge[parity] - 1;
-            }
-            else
-            {
-                insideEdgePointBaseOffset += numPointsForInsideEdge[parity] - 1;
-            }
-            // The inside row of this edge becomes the outside row for the next ring.
-            numPointsForOutsideEdge[edge] = numPointsForInsideEdge[parity];
-        }
-        if( startRing == ring )
-        {
-            // From the second ring on, both rows being stitched come from the inside tess factors.
-            for(edge = 0; edge < QUAD_EDGES; edge++ )
-            {
-                outsideTessFactorCtx[edge] = &processedTessFactors.insideTessFactorCtx[edge&1];
-                outsideTessFactorParity[edge] = processedTessFactors.insideTessFactorParity[edge&1];
-            }
-        }
-    }
-
-    // Triangulate center - a row of quads if odd
-    // This triangulation may be producing diagonals that are asymmetric about
-    // the center of the patch in this region.
-    if( (processedTessFactors.numPointsForInsideTessFactor[U] > processedTessFactors.numPointsForInsideTessFactor[V]) &&
-        (TESSELLATOR_PARITY_ODD == processedTessFactors.insideTessFactorParity[V] ) )
-    {
-        SetUsingPatchedIndices2(true);
-        int stripNumQuads = (((processedTessFactors.numPointsForInsideTessFactor[U]>>1) - (processedTessFactors.numPointsForInsideTessFactor[V]>>1))<<1)+
-                            ((TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[U] ) ? 2 : 1);
-        m_IndexPatchContext2.baseIndexToInvert = outsideEdgePointBaseOffset + stripNumQuads + 2;
-        m_IndexPatchContext2.cornerCaseBadValue = m_IndexPatchContext2.baseIndexToInvert;
-        m_IndexPatchContext2.cornerCaseReplacementValue = outsideEdgePointBaseOffset;
-        m_IndexPatchContext2.indexInversionEndPoint = m_IndexPatchContext2.baseIndexToInvert +
-                                                      m_IndexPatchContext2.baseIndexToInvert + stripNumQuads;
-        StitchRegular(/*bTrapezoid*/false,DIAGONALS_INSIDE_TO_OUTSIDE,
-                       /*baseIndexOffset: */m_NumIndices, /*numInsideEdgePoints:*/stripNumQuads+1,
-                       /*insideEdgePointBaseOffset*/m_IndexPatchContext2.baseIndexToInvert,
-                       outsideEdgePointBaseOffset+1);
-        SetUsingPatchedIndices2(false);
-        m_NumIndices += stripNumQuads*6;
-    }
-    else if((processedTessFactors.numPointsForInsideTessFactor[V] >= processedTessFactors.numPointsForInsideTessFactor[U]) &&
-            (TESSELLATOR_PARITY_ODD == processedTessFactors.insideTessFactorParity[U]) )
-    {
-        SetUsingPatchedIndices2(true);
-        int stripNumQuads = (((processedTessFactors.numPointsForInsideTessFactor[V]>>1) - (processedTessFactors.numPointsForInsideTessFactor[U]>>1))<<1)+
-                            ((TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[V] ) ? 2 : 1);
-        m_IndexPatchContext2.baseIndexToInvert = outsideEdgePointBaseOffset + stripNumQuads + 1;
-        m_IndexPatchContext2.cornerCaseBadValue = -1; // unused
-        // NOTE(review): cornerCaseReplacementValue is not assigned in this branch and keeps whatever
-        // value it had before; presumably harmless because cornerCaseBadValue is -1 -- confirm in AddIndex().
-        m_IndexPatchContext2.indexInversionEndPoint = m_IndexPatchContext2.baseIndexToInvert +
-                                                      m_IndexPatchContext2.baseIndexToInvert + stripNumQuads;
-		DIAGONALS diag = (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[V]) ?
-							DIAGONALS_INSIDE_TO_OUTSIDE : DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE;
-        StitchRegular(/*bTrapezoid*/false,diag,
-                       /*baseIndexOffset: */m_NumIndices, /*numInsideEdgePoints:*/stripNumQuads+1,
-                       /*insideEdgePointBaseOffset*/m_IndexPatchContext2.baseIndexToInvert,
-                       outsideEdgePointBaseOffset);
-        SetUsingPatchedIndices2(false);
-        m_NumIndices += stripNumQuads*6;
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::TessellateTriDomain
-// User calls this
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::TessellateTriDomain( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Weq0,
-                                        float insideTessFactor )
-{
-    PROCESSED_TESS_FACTORS_TRI processedTessFactors;
-    TriProcessTessFactors(tessFactor_Ueq0,tessFactor_Veq0,tessFactor_Weq0,insideTessFactor,processedTessFactors);
-
-    if( processedTessFactors.bPatchCulled )
-    {
-        m_NumPoints = 0;
-        m_NumIndices = 0;
-        return;
-    }
-    else if( processedTessFactors.bJustDoMinimumTessFactor )
-    {
-        DefinePoint(/*U*/0,/*V*/FXP_ONE,/*pointStorageOffset*/0); //V=1 (beginning of Ueq0 edge VW)
-        DefinePoint(/*U*/0,/*V*/0,/*pointStorageOffset*/1); //W=1 (beginning of Veq0 edge WU)
-        DefinePoint(/*U*/FXP_ONE,/*V*/0,/*pointStorageOffset*/2); //U=1 (beginning of Weq0 edge UV)
-        m_NumPoints = 3;
-
-        switch(m_outputPrimitive)
-        {
-        case D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW:
-        case D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CCW:
-            // function orients them CCW if needed
-            DefineClockwiseTriangle(0,1,2,/*indexStorageBaseOffset*/m_NumIndices);
-            m_NumIndices = 3;
-            break;
-        case D3D11_TESSELLATOR_OUTPUT_POINT:
-            DumpAllPoints();
-            break;
-        case D3D11_TESSELLATOR_OUTPUT_LINE:
-            DumpAllPointsAsInOrderLineList();
-            break;
-        }
-        return;
-    }
-
-    TriGeneratePoints(processedTessFactors);
-
-    if( m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_POINT )
-    {
-        DumpAllPoints();
-        return;
-    }
-    if( m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_LINE )
-    {
-        DumpAllPointsAsInOrderLineList();
-        return;
-    }
-
-    TriGenerateConnectivity(processedTessFactors); // can be done in parallel to TriGeneratePoints()
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::TriProcessTessFactors
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::TriProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Weq0,
-                                            float insideTessFactor, PROCESSED_TESS_FACTORS_TRI& processedTessFactors )
-{
-    // Is the patch culled?
-    if( !(tessFactor_Ueq0 > 0) || // NaN will pass
-        !(tessFactor_Veq0 > 0) ||
-        !(tessFactor_Weq0 > 0) )
-    {
-        processedTessFactors.bPatchCulled = true;
-        return;
-    }
-    else
-    {
-        processedTessFactors.bPatchCulled = false;
-    }
-
-    // Clamp edge TessFactors
-    float lowerBound = 0.0, upperBound = 0.0;
-    switch(m_originalPartitioning)
-    {
-        case D3D11_TESSELLATOR_PARTITIONING_INTEGER:
-        case D3D11_TESSELLATOR_PARTITIONING_POW2: // don’t care about pow2 distinction for validation, just treat as integer
-            lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR;
-            upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR;
-            break;
-
-        case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN:
-            lowerBound = D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR;
-            upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR;
-            break;
-
-        case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD:
-            lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR;
-            upperBound = D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR;
-            break;
-    }
-
-    tessFactor_Ueq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Ueq0 ) );
-    tessFactor_Veq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Veq0 ) );
-    tessFactor_Weq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Weq0 ) );
-
-    if( HWIntegerPartitioning()) // pow2 or integer, round to next int (hw doesn't care about pow2 distinction)
-    {
-        tessFactor_Ueq0 = ceil(tessFactor_Ueq0);
-        tessFactor_Veq0 = ceil(tessFactor_Veq0);
-        tessFactor_Weq0 = ceil(tessFactor_Weq0);
-    }
-
-    // Clamp inside TessFactors
-    if(D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD == m_originalPartitioning)
-    {
-        if( (tessFactor_Ueq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ||
-            (tessFactor_Veq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ||
-            (tessFactor_Weq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON))
-            // Don't need the same check for insideTessFactor for tri patches,
-            // since there is only one insideTessFactor, as opposed to quad
-            // patches which have 2 insideTessFactors.
-        {
-            // Force picture frame
-            lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON;
-        }
-    }
-
-    insideTessFactor = tess_fmin( upperBound, tess_fmax( lowerBound, insideTessFactor ) );
-    // Note the above clamps map NaN to lowerBound
-
-    if( HWIntegerPartitioning()) // pow2 or integer, round to next int (hw doesn't care about pow2 distinction)
-    {
-        insideTessFactor = ceil(insideTessFactor);
-    }
-
-    // Reset our vertex and index buffers.  We have enough storage for the max tessFactor.
-    m_NumPoints = 0;
-    m_NumIndices = 0;
-
-    // Process tessFactors
-    float outsideTessFactor[TRI_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, tessFactor_Weq0};
-    int edge;
-    if( HWIntegerPartitioning() )
-    {
-        for( edge = 0; edge < TRI_EDGES; edge++ )
-        {
-            int edgeEven = isEven(outsideTessFactor[edge]);
-            processedTessFactors.outsideTessFactorParity[edge] = edgeEven ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
-        }
-        processedTessFactors.insideTessFactorParity = (isEven(insideTessFactor) || (FLOAT_ONE == insideTessFactor))
-                                        ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
-    }
-    else
-    {
-        for( edge = 0; edge < TRI_EDGES; edge++ )
-        {
-            processedTessFactors.outsideTessFactorParity[edge] = m_originalParity;
-        }
-        processedTessFactors.insideTessFactorParity = m_originalParity;
-    }
-
-    // Save fixed point TessFactors
-    for( edge = 0; edge < TRI_EDGES; edge++ )
-    {
-        processedTessFactors.outsideTessFactor[edge] = floatToFixed(outsideTessFactor[edge]);
-    }
-    processedTessFactors.insideTessFactor = floatToFixed(insideTessFactor);
-
-    if( HWIntegerPartitioning() || Odd() )
-    {
-        // Special case if all TessFactors are 1
-        if( (FXP_ONE == processedTessFactors.insideTessFactor) &&
-            (FXP_ONE == processedTessFactors.outsideTessFactor[Ueq0]) &&
-            (FXP_ONE == processedTessFactors.outsideTessFactor[Veq0]) &&
-            (FXP_ONE == processedTessFactors.outsideTessFactor[Weq0]) )
-        {
-            processedTessFactors.bJustDoMinimumTessFactor = true;
-            return;
-        }
-    }
-    processedTessFactors.bJustDoMinimumTessFactor = false;
-
-    // Compute per-TessFactor metadata
-    for(edge = 0; edge < TRI_EDGES; edge++ )
-    {
-        SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]);
-        ComputeTessFactorContext(processedTessFactors.outsideTessFactor[edge], processedTessFactors.outsideTessFactorCtx[edge]);
-    }
-    SetTessellationParity(processedTessFactors.insideTessFactorParity);
-    ComputeTessFactorContext(processedTessFactors.insideTessFactor, processedTessFactors.insideTessFactorCtx);
-
-    // Compute some initial data.
-
-    // outside edge offsets and storage
-    for(edge = 0; edge < TRI_EDGES; edge++ )
-    {
-        SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]);
-        processedTessFactors.numPointsForOutsideEdge[edge] = NumPointsForTessFactor(processedTessFactors.outsideTessFactor[edge]);
-        m_NumPoints += processedTessFactors.numPointsForOutsideEdge[edge];
-    }
-    m_NumPoints -= 3;
-
-    // inside edge offsets
-    SetTessellationParity(processedTessFactors.insideTessFactorParity);
-    processedTessFactors.numPointsForInsideTessFactor = NumPointsForTessFactor(processedTessFactors.insideTessFactor);
-    {
-        int pointCountMin = Odd() ? 4 : 3;
-        // max() allows degenerate transition regions when inside TessFactor == 1
-        processedTessFactors.numPointsForInsideTessFactor = max(pointCountMin,processedTessFactors.numPointsForInsideTessFactor);
-    }
-
-    processedTessFactors.insideEdgePointBaseOffset = m_NumPoints;
-
-    // inside storage, including interior edges above
-    {
-        int numInteriorRings = (processedTessFactors.numPointsForInsideTessFactor >> 1) - 1;
-        int numInteriorPoints;
-        if( Odd() )
-        {
-            numInteriorPoints = TRI_EDGES*(numInteriorRings*(numInteriorRings+1) - numInteriorRings);
-        }
-        else
-        {
-            numInteriorPoints = TRI_EDGES*(numInteriorRings*(numInteriorRings+1)) + 1;
-        }
-        m_NumPoints += numInteriorPoints;
-    }
-
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::TriGeneratePoints
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::TriGeneratePoints( const PROCESSED_TESS_FACTORS_TRI& processedTessFactors )
-{
-    // Generate exterior ring edge points, clockwise starting from point V (VW, the U==0 edge)
-    int pointOffset = 0;
-    int edge;
-    for(edge = 0; edge < TRI_EDGES; edge++ )
-    {
-        int parity = edge&0x1;
-        int startPoint = 0;
-        int endPoint = processedTessFactors.numPointsForOutsideEdge[edge] - 1;
-        for(int p = startPoint; p < endPoint; p++, pointOffset++) // don't include end, since next edge starts with it.
-        {
-            FXP fxpParam;
-            int q = (parity) ? p : endPoint - p; // whether to reverse point order given we are defining V or U (W implicit):
-                                                 // edge0, VW, has V decreasing, so reverse 1D points below
-                                                 // edge1, WU, has U increasing, so don't reverse 1D points  below
-                                                 // edge2, UV, has U decreasing, so reverse 1D points below
-            SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]);
-            PlacePointIn1D(processedTessFactors.outsideTessFactorCtx[edge],q,fxpParam);
-            if( edge == 0 )
-            {
-                DefinePoint(/*U*/0,
-                            /*V*/fxpParam,
-                            /*pointStorageOffset*/pointOffset);
-            }
-            else
-            {
-                DefinePoint(/*U*/fxpParam,
-                            /*V*/(edge == 2) ? FXP_ONE - fxpParam : 0,
-                            /*pointStorageOffset*/pointOffset);
-            }
-        }
-    }
-
-    // Generate interior ring points, clockwise spiralling in
-    SetTessellationParity(processedTessFactors.insideTessFactorParity);
-    static const int startRing = 1;
-    int numRings = (processedTessFactors.numPointsForInsideTessFactor >> 1);
-    for(int ring = startRing; ring < numRings; ring++)
-    {
-        int startPoint = ring;
-        int endPoint = processedTessFactors.numPointsForInsideTessFactor - 1 - startPoint;
-
-        for(edge = 0; edge < TRI_EDGES; edge++ )
-        {
-            int parity = edge&0x1;
-            int perpendicularAxisPoint = startPoint;
-            FXP fxpPerpParam;
-            PlacePointIn1D(processedTessFactors.insideTessFactorCtx,perpendicularAxisPoint,fxpPerpParam);
-            fxpPerpParam *= FXP_TWO_THIRDS; // Map location to the right size in barycentric space.
-                                         // I (amarp) can draw a picture to explain.
-                                         // We know this fixed point math won't over/underflow
-            fxpPerpParam = (fxpPerpParam+FXP_ONE_HALF/*round*/)>>FXP_FRACTION_BITS; // get back to n.16
-            for(int p = startPoint; p < endPoint; p++, pointOffset++) // don't include end: next edge starts with it.
-            {
-                FXP fxpParam;
-                int q = (parity) ? p : endPoint - (p - startPoint); // whether to reverse point given we are defining V or U (W implicit):
-                                                         // edge0, VW, has V decreasing, so reverse 1D points below
-                                                         // edge1, WU, has U increasing, so don't reverse 1D points  below
-                                                         // edge2, UV, has U decreasing, so reverse 1D points below
-                PlacePointIn1D(processedTessFactors.insideTessFactorCtx,q,fxpParam);
-                // edge0 VW, has perpendicular parameter U constant
-                // edge1 WU, has perpendicular parameter V constant
-                // edge2 UV, has perpendicular parameter W constant
-                const unsigned int deriv = 2; // reciprocal is the rate of change of edge-parallel parameters as they are pushed into the triangle
-                switch(edge)
-                {
-                case 0:
-                    DefinePoint(/*U*/fxpPerpParam,
-                                /*V*/fxpParam - (fxpPerpParam+1/*round*/)/deriv, // we know this fixed point math won't over/underflow
-                                /*pointStorageOffset*/pointOffset);
-                    break;
-                case 1:
-                    DefinePoint(/*U*/fxpParam - (fxpPerpParam+1/*round*/)/deriv,// we know this fixed point math won't over/underflow
-                                /*V*/fxpPerpParam,
-                                /*pointStorageOffset*/pointOffset);
-                    break;
-                case 2:
-                    DefinePoint(/*U*/fxpParam - (fxpPerpParam+1/*round*/)/deriv,// we know this fixed point math won't over/underflow
-                                /*V*/FXP_ONE - (fxpParam - (fxpPerpParam+1/*round*/)/deriv) - fxpPerpParam,// we know this fixed point math won't over/underflow
-                                /*pointStorageOffset*/pointOffset);
-                    break;
-                }
-            }
-        }
-    }
-    if( !Odd() )
-    {
-        // Last point is the point at the center.
-        DefinePoint(/*U*/FXP_ONE_THIRD,
-                    /*V*/FXP_ONE_THIRD,
-                    /*pointStorageOffset*/pointOffset);
-    }
-}
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::TriGenerateConnectivity
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::TriGenerateConnectivity( const PROCESSED_TESS_FACTORS_TRI& processedTessFactors )
-{
-    // Generate primitives for all the concentric rings, one side at a time for each ring
-    static const int startRing = 1;
-    int numRings = ((processedTessFactors.numPointsForInsideTessFactor+1) >> 1); // +1 is so even tess includes the center point, which we want to now
-    const TESS_FACTOR_CONTEXT* outsideTessFactorCtx[TRI_EDGES] = {&processedTessFactors.outsideTessFactorCtx[Ueq0],
-                                            &processedTessFactors.outsideTessFactorCtx[Veq0],
-                                            &processedTessFactors.outsideTessFactorCtx[Weq0]};
-    TESSELLATOR_PARITY outsideTessFactorParity[TRI_EDGES] = {processedTessFactors.outsideTessFactorParity[Ueq0],
-                                            processedTessFactors.outsideTessFactorParity[Veq0],
-                                            processedTessFactors.outsideTessFactorParity[Weq0]};
-    int numPointsForOutsideEdge[TRI_EDGES] = {processedTessFactors.numPointsForOutsideEdge[Ueq0],
-                                              processedTessFactors.numPointsForOutsideEdge[Veq0],
-                                              processedTessFactors.numPointsForOutsideEdge[Weq0]};
-
-    int insideEdgePointBaseOffset = processedTessFactors.insideEdgePointBaseOffset;
-    int outsideEdgePointBaseOffset = 0;
-    int edge;
-    for(int ring = startRing; ring < numRings; ring++)
-    {
-        int numPointsForInsideEdge = processedTessFactors.numPointsForInsideTessFactor - 2*ring;
-        int edge0InsidePointBaseOffset = insideEdgePointBaseOffset;
-        int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset;
-        for(edge = 0; edge < TRI_EDGES; edge++ )
-        {
-            int numTriangles = numPointsForInsideEdge + numPointsForOutsideEdge[edge] - 2;
-
-            int insideBaseOffset;
-            int outsideBaseOffset;
-            if( edge == 2 )
-            {
-                m_IndexPatchContext.insidePointIndexDeltaToRealValue    = insideEdgePointBaseOffset;
-                m_IndexPatchContext.insidePointIndexBadValue            = numPointsForInsideEdge - 1;
-                m_IndexPatchContext.insidePointIndexReplacementValue    = edge0InsidePointBaseOffset;
-                m_IndexPatchContext.outsidePointIndexPatchBase          = m_IndexPatchContext.insidePointIndexBadValue+1; // past inside patched index range
-                m_IndexPatchContext.outsidePointIndexDeltaToRealValue   = outsideEdgePointBaseOffset
-                                                                            - m_IndexPatchContext.outsidePointIndexPatchBase;
-                m_IndexPatchContext.outsidePointIndexBadValue           = m_IndexPatchContext.outsidePointIndexPatchBase
-                                                                            + numPointsForOutsideEdge[edge] - 1;
-                m_IndexPatchContext.outsidePointIndexReplacementValue   = edge0OutsidePointBaseOffset;
-                SetUsingPatchedIndices(true);
-                insideBaseOffset = 0;
-                outsideBaseOffset = m_IndexPatchContext.outsidePointIndexPatchBase;
-            }
-            else
-            {
-                insideBaseOffset = insideEdgePointBaseOffset;
-                outsideBaseOffset = outsideEdgePointBaseOffset;
-            }
-            if( ring == startRing )
-            {
-                StitchTransition(/*baseIndexOffset: */m_NumIndices,
-                               insideBaseOffset,processedTessFactors.insideTessFactorCtx.numHalfTessFactorPoints,processedTessFactors.insideTessFactorParity,
-                               outsideBaseOffset,outsideTessFactorCtx[edge]->numHalfTessFactorPoints,outsideTessFactorParity[edge]);
-            }
-            else
-            {
-                StitchRegular(/*bTrapezoid*/true, DIAGONALS_MIRRORED,
-                              /*baseIndexOffset: */m_NumIndices,
-                              numPointsForInsideEdge,
-                              insideBaseOffset,outsideBaseOffset);
-            }
-            if( 2 == edge )
-            {
-                SetUsingPatchedIndices(false);
-            }
-            m_NumIndices += numTriangles*3;
-            outsideEdgePointBaseOffset += numPointsForOutsideEdge[edge] - 1;
-            insideEdgePointBaseOffset += numPointsForInsideEdge - 1;
-            numPointsForOutsideEdge[edge] = numPointsForInsideEdge;
-        }
-        if( startRing == ring )
-        {
-            for(edge = 0; edge < TRI_EDGES; edge++ )
-            {
-                outsideTessFactorCtx[edge] = &processedTessFactors.insideTessFactorCtx;
-                outsideTessFactorParity[edge] = processedTessFactors.insideTessFactorParity;
-            }
-        }
-    }
-    if( Odd() )
-    {
-        // Triangulate center (a single triangle)
-        DefineClockwiseTriangle(outsideEdgePointBaseOffset, outsideEdgePointBaseOffset+1, outsideEdgePointBaseOffset+2,
-                       m_NumIndices);
-        m_NumIndices += 3;
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::TessellateIsoLineDomain
-// User calls this.
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::TessellateIsoLineDomain( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail )
-{
-    PROCESSED_TESS_FACTORS_ISOLINE processedTessFactors;
-    IsoLineProcessTessFactors(TessFactor_V_LineDensity,TessFactor_U_LineDetail,processedTessFactors);
-    if( processedTessFactors.bPatchCulled )
-    {
-        m_NumPoints = 0;
-        m_NumIndices = 0;
-        return;
-    }
-    IsoLineGeneratePoints(processedTessFactors);
-    IsoLineGenerateConnectivity(processedTessFactors); // can be done in parallel to IsoLineGeneratePoints
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::IsoLineProcessTessFactors
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::IsoLineProcessTessFactors( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail,
-                                                PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors )
-{
-    // Is the patch culled?
-    if( !(TessFactor_V_LineDensity > 0) || // NaN will pass
-        !(TessFactor_U_LineDetail > 0) )
-    {
-        processedTessFactors.bPatchCulled = true;
-        return;
-    }
-    else
-    {
-        processedTessFactors.bPatchCulled = false;
-    }
-
-    // Clamp edge TessFactors
-    float lowerBound = 0.0, upperBound = 0.0;
-    switch(m_originalPartitioning)
-    {
-        case D3D11_TESSELLATOR_PARTITIONING_INTEGER:
-        case D3D11_TESSELLATOR_PARTITIONING_POW2: // don’t care about pow2 distinction for validation, just treat as integer
-            lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR;
-            upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR;
-            break;
-
-        case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN:
-            lowerBound = D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR;
-            upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR;
-            break;
-
-        case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD:
-            lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR;
-            upperBound = D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR;
-            break;
-    }
-
-    TessFactor_V_LineDensity = tess_fmin( D3D11_TESSELLATOR_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR,
-                                    tess_fmax( D3D11_TESSELLATOR_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR, TessFactor_V_LineDensity ) );
-    TessFactor_U_LineDetail = tess_fmin( upperBound, tess_fmax( lowerBound, TessFactor_U_LineDetail ) );
-
-    // Reset our vertex and index buffers.  We have enough storage for the max tessFactor.
-    m_NumPoints = 0;
-    m_NumIndices = 0;
-
-    // Process tessFactors
-    if( HWIntegerPartitioning() )
-    {
-        TessFactor_U_LineDetail = ceil(TessFactor_U_LineDetail);
-        processedTessFactors.lineDetailParity = isEven(TessFactor_U_LineDetail) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
-    }
-    else
-    {
-        processedTessFactors.lineDetailParity = m_originalParity;
-    }
-
-    FXP fxpTessFactor_U_LineDetail = floatToFixed(TessFactor_U_LineDetail);
-
-    SetTessellationParity(processedTessFactors.lineDetailParity);
-
-    ComputeTessFactorContext(fxpTessFactor_U_LineDetail, processedTessFactors.lineDetailTessFactorCtx);
-    processedTessFactors.numPointsPerLine = NumPointsForTessFactor(fxpTessFactor_U_LineDetail);
-
-    OverridePartitioning(D3D11_TESSELLATOR_PARTITIONING_INTEGER);
-
-    TessFactor_V_LineDensity = ceil(TessFactor_V_LineDensity);
-    processedTessFactors.lineDensityParity = isEven(TessFactor_V_LineDensity) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
-    SetTessellationParity(processedTessFactors.lineDensityParity);
-    FXP fxpTessFactor_V_LineDensity = floatToFixed(TessFactor_V_LineDensity);
-    ComputeTessFactorContext(fxpTessFactor_V_LineDensity, processedTessFactors.lineDensityTessFactorCtx);
-
-    processedTessFactors.numLines = NumPointsForTessFactor(fxpTessFactor_V_LineDensity) - 1; // don't draw last line at V == 1.
-
-    RestorePartitioning();
-
-    // Compute some initial data.
-
-    // outside edge offsets
-    m_NumPoints = processedTessFactors.numPointsPerLine * processedTessFactors.numLines;
-    if( m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_POINT )
-    {
-        m_NumIndices = m_NumPoints;
-    }
-    else // line
-    {
-        m_NumIndices = processedTessFactors.numLines*(processedTessFactors.numPointsPerLine-1)*2;
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::IsoLineGeneratePoints
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::IsoLineGeneratePoints( const PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors )
-{
-    int line, pointOffset;
-    for(line = 0, pointOffset = 0; line < processedTessFactors.numLines; line++)
-    {
-        for(int point = 0; point < processedTessFactors.numPointsPerLine; point++)
-        {
-            FXP fxpU,fxpV;
-            SetTessellationParity(processedTessFactors.lineDensityParity);
-            PlacePointIn1D(processedTessFactors.lineDensityTessFactorCtx,line,fxpV);
-
-            SetTessellationParity(processedTessFactors.lineDetailParity);
-            PlacePointIn1D(processedTessFactors.lineDetailTessFactorCtx,point,fxpU);
-
-            DefinePoint(fxpU,fxpV,pointOffset++);
-        }
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::IsoLineGenerateConnectivity
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::IsoLineGenerateConnectivity( const PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors )
-{
-    int line, pointOffset, indexOffset;
-    if( m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_POINT )
-    {
-        for(line = 0, pointOffset = 0, indexOffset = 0; line < processedTessFactors.numLines; line++)
-        {
-            for(int point = 0; point < processedTessFactors.numPointsPerLine; point++)
-            {
-                DefineIndex(pointOffset++,indexOffset++);
-            }
-        }
-    }
-    else // line
-    {
-        for(line = 0, pointOffset = 0, indexOffset = 0; line < processedTessFactors.numLines; line++)
-        {
-            for(int point = 0; point < processedTessFactors.numPointsPerLine; point++)
-            {
-                if( point > 0 )
-                {
-                    DefineIndex(pointOffset-1,indexOffset++);
-                    DefineIndex(pointOffset,indexOffset++);
-                }
-                pointOffset++;
-            }
-        }
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::GetPointCount
-// User calls this.
-//---------------------------------------------------------------------------------------------------------------------------------
-int CHWTessellator::GetPointCount()
-{
-    return m_NumPoints;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::GetIndexCount()
-// User calls this.
-//---------------------------------------------------------------------------------------------------------------------------------
-int CHWTessellator::GetIndexCount()
-{
-    return m_NumIndices;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::GetPoints()
-// User calls this.
-//---------------------------------------------------------------------------------------------------------------------------------
-DOMAIN_POINT* CHWTessellator::GetPoints()
-{
-    return m_Point;
-}
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::GetIndices()
-// User calls this.
-//---------------------------------------------------------------------------------------------------------------------------------
-int* CHWTessellator::GetIndices()
-{
-    return m_Index;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::DefinePoint()
-//---------------------------------------------------------------------------------------------------------------------------------
-int CHWTessellator::DefinePoint(FXP fxpU, FXP fxpV, int pointStorageOffset)
-{
-//    WCHAR foo[80];
-//    StringCchPrintf(foo,80,L"off:%d, uv=(%f,%f)\n",pointStorageOffset,fixedToFloat(fxpU),fixedToFloat(fxpV));
-//    OutputDebugString(foo);
-    m_Point[pointStorageOffset].u = fixedToFloat(fxpU);
-    m_Point[pointStorageOffset].v = fixedToFloat(fxpV);
-    return pointStorageOffset;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::DefineIndex()
-//--------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::DefineIndex(int index, int indexStorageOffset)
-{
-    index = PatchIndexValue(index);
-//    WCHAR foo[80];
-//    StringCchPrintf(foo,80,L"off:%d, idx=%d, uv=(%f,%f)\n",indexStorageOffset,index,m_Point[index].u,m_Point[index].v);
-//    OutputDebugString(foo);
-    m_Index[indexStorageOffset] = index;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::DefineClockwiseTriangle()
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::DefineClockwiseTriangle(int index0, int index1, int index2, int indexStorageBaseOffset)
-{
-    // inputs a clockwise triangle, stores a CW or CCW triangle depending on the state
-    DefineIndex(index0,indexStorageBaseOffset);
-    bool bWantClockwise = (m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW) ? true : false;
-    if( bWantClockwise )
-    {
-        DefineIndex(index1,indexStorageBaseOffset+1);
-        DefineIndex(index2,indexStorageBaseOffset+2);
-    }
-    else
-    {
-        DefineIndex(index2,indexStorageBaseOffset+1);
-        DefineIndex(index1,indexStorageBaseOffset+2);
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::DumpAllPoints()
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::DumpAllPoints()
-{
-    for( int p = 0; p < m_NumPoints; p++ )
-    {
-        DefineIndex(p,m_NumIndices++);
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::DumpAllPointsAsInOrderLineList()
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::DumpAllPointsAsInOrderLineList()
-{
-    for( int p = 1; p < m_NumPoints; p++ )
-    {
-        DefineIndex(p-1,m_NumIndices++);
-        DefineIndex(p,m_NumIndices++);
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// RemoveMSB
-//---------------------------------------------------------------------------------------------------------------------------------
-int RemoveMSB(int val)
-{
-    int check;
-    if( val <= 0x0000ffff ) { check = ( val <= 0x000000ff ) ? 0x00000080 : 0x00008000; }
-    else                    { check = ( val <= 0x00ffffff ) ? 0x00800000 : 0x80000000; }
-    for( int i = 0; i < 8; i++, check >>= 1 ) { if( val & check ) return (val & ~check); }
-    return 0;
-}
-//---------------------------------------------------------------------------------------------------------------------------------
-// GetMSB
-//---------------------------------------------------------------------------------------------------------------------------------
-int GetMSB(int val)
-{
-    int check;
-    if( val <= 0x0000ffff ) { check = ( val <= 0x000000ff ) ? 0x00000080 : 0x00008000; }
-    else                    { check = ( val <= 0x00ffffff ) ? 0x00800000 : 0x80000000; }
-    for( int i = 0; i < 8; i++, check >>= 1 ) { if( val & check ) return check; }
-    return 0;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::CleanseParameter()
-//---------------------------------------------------------------------------------------------------------------------------------
-/* NOTHING TO DO FOR FIXED POINT ARITHMETIC!
-void CHWTessellator::CleanseParameter(float& parameter)
-{
-    // Clean up [0..1] parameter to guarantee that (1 - (1 - parameter)) == parameter.
-    parameter = 1.0f - parameter;
-    parameter = 1.0f - parameter;
-
-}
-*/
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::NumPointsForTessFactor()
-//---------------------------------------------------------------------------------------------------------------------------------
-int CHWTessellator::NumPointsForTessFactor( FXP fxpTessFactor )
-{
-    int numPoints;
-    if( Odd() )
-    {
-        numPoints = (fxpCeil(FXP_ONE_HALF + (fxpTessFactor+1/*round*/)/2)*2)>>FXP_FRACTION_BITS;
-    }
-    else
-    {
-        numPoints = ((fxpCeil((fxpTessFactor+1/*round*/)/2)*2)>>FXP_FRACTION_BITS)+1;
-    }
-    return numPoints;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::ComputeTessFactorContext()
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::ComputeTessFactorContext( FXP fxpTessFactor, TESS_FACTOR_CONTEXT& TessFactorCtx )
-{
-    FXP fxpHalfTessFactor = (fxpTessFactor+1/*round*/)/2;
-    if( Odd() || (fxpHalfTessFactor == FXP_ONE_HALF)) // fxpHalfTessFactor == 1/2 if TessFactor is 1, but we're pretending we are even.
-    {
-        fxpHalfTessFactor += FXP_ONE_HALF;
-    }
-    FXP fxpFloorHalfTessFactor = fxpFloor(fxpHalfTessFactor);
-    FXP fxpCeilHalfTessFactor = fxpCeil(fxpHalfTessFactor);
-    TessFactorCtx.fxpHalfTessFactorFraction = fxpHalfTessFactor - fxpFloorHalfTessFactor;
-    //CleanseParameter(TessFactorCtx.fxpHalfTessFactorFraction);
-    TessFactorCtx.numHalfTessFactorPoints = (fxpCeilHalfTessFactor>>FXP_FRACTION_BITS); // for EVEN, we don't include the point always fixed at the midpoint of the TessFactor
-    if( fxpCeilHalfTessFactor == fxpFloorHalfTessFactor )
-    {
-        TessFactorCtx.splitPointOnFloorHalfTessFactor =  /*pick value to cause this to be ignored*/ TessFactorCtx.numHalfTessFactorPoints+1;
-    }
-    else if( Odd() )
-    {
-        if( fxpFloorHalfTessFactor == FXP_ONE )
-        {
-            TessFactorCtx.splitPointOnFloorHalfTessFactor = 0;
-        }
-        else
-        {
-#ifdef ALLOW_XBOX_360_COMPARISON
-            if( m_bXBox360Mode )
-                TessFactorCtx.splitPointOnFloorHalfTessFactor = TessFactorCtx.numHalfTessFactorPoints-2;
-            else
-#endif
-				TessFactorCtx.splitPointOnFloorHalfTessFactor = (RemoveMSB((fxpFloorHalfTessFactor>>FXP_FRACTION_BITS)-1)<<1) + 1;
-        }
-    }
-    else
-    {
-#ifdef ALLOW_XBOX_360_COMPARISON
-        if( m_bXBox360Mode )
-            TessFactorCtx.splitPointOnFloorHalfTessFactor = TessFactorCtx.numHalfTessFactorPoints-1;
-        else
-#endif
-			TessFactorCtx.splitPointOnFloorHalfTessFactor = (RemoveMSB(fxpFloorHalfTessFactor>>FXP_FRACTION_BITS)<<1) + 1;
-    }
-    int numFloorSegments = (fxpFloorHalfTessFactor * 2)>>FXP_FRACTION_BITS;
-    int numCeilSegments = (fxpCeilHalfTessFactor * 2)>>FXP_FRACTION_BITS;
-    if( Odd() )
-    {
-        numFloorSegments -= 1;
-        numCeilSegments -= 1;
-    }
-    TessFactorCtx.fxpInvNumSegmentsOnFloorTessFactor = s_fixedReciprocal[numFloorSegments];
-    TessFactorCtx.fxpInvNumSegmentsOnCeilTessFactor = s_fixedReciprocal[numCeilSegments];
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::PlacePointIn1D()
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::PlacePointIn1D( const TESS_FACTOR_CONTEXT& TessFactorCtx, int point, FXP& fxpLocation )
-{
-    bool bFlip;
-    if( point >= TessFactorCtx.numHalfTessFactorPoints )
-    {
-        point = (TessFactorCtx.numHalfTessFactorPoints << 1) - point;
-        if( Odd() )
-        {
-            point -= 1;
-        }
-        bFlip = true;
-    }
-    else
-    {
-        bFlip = false;
-    }
-    if( point == TessFactorCtx.numHalfTessFactorPoints )
-    {
-        fxpLocation = FXP_ONE_HALF; // special casing middle since 16 bit fixed math below can't reproduce 0.5 exactly
-        return;
-    }
-    unsigned int indexOnCeilHalfTessFactor = point;
-    unsigned int indexOnFloorHalfTessFactor = indexOnCeilHalfTessFactor;
-    if( point > TessFactorCtx.splitPointOnFloorHalfTessFactor )
-    {
-        indexOnFloorHalfTessFactor -= 1;
-    }
-    // For the fixed point multiplies below, we know the results are <= 16 bits because
-    // the locations on the halfTessFactor are <= half the number of segments for the total TessFactor.
-    // So a number divided by a number that is at least twice as big will give
-    // a result no bigger than 0.5 (which in fixed point is 16 bits in our case)
-    FXP fxpLocationOnFloorHalfTessFactor = indexOnFloorHalfTessFactor * TessFactorCtx.fxpInvNumSegmentsOnFloorTessFactor;
-    FXP fxpLocationOnCeilHalfTessFactor = indexOnCeilHalfTessFactor * TessFactorCtx.fxpInvNumSegmentsOnCeilTessFactor;
-
-    // Since we know the numbers calculated above are <= fixed point 0.5, and the equation
-    // below is just lerping between two values <= fixed point 0.5 (0x00008000), then we know
-    // that the final result before shifting by 16 bits is no larger than 0x80000000.  Once we
-    // shift that down by 16, we get the result of lerping 2 numbers <= 0.5, which is obviously
-    // at most 0.5 (0x00008000)
-    fxpLocation = fxpLocationOnFloorHalfTessFactor * (FXP_ONE - TessFactorCtx.fxpHalfTessFactorFraction) +
-                  fxpLocationOnCeilHalfTessFactor * (TessFactorCtx.fxpHalfTessFactorFraction);
-    fxpLocation = (fxpLocation + FXP_ONE_HALF/*round*/) >> FXP_FRACTION_BITS; // get back to n.16
-    /* Commenting out floating point version.  Note the parameter cleansing it does is not needed in fixed point.
-    if( bFlip )
-        location = 1.0f - location; // complement produces cleansed result.
-    else
-        CleanseParameter(location);
-    */
-    if( bFlip )
-    {
-        fxpLocation = FXP_ONE - fxpLocation;
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::StitchRegular
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::StitchRegular(bool bTrapezoid,DIAGONALS diagonals,
-                                 int baseIndexOffset, int numInsideEdgePoints,
-                                 int insideEdgePointBaseOffset, int outsideEdgePointBaseOffset)
-{
-    int insidePoint = insideEdgePointBaseOffset;
-    int outsidePoint = outsideEdgePointBaseOffset;
-    if( bTrapezoid )
-    {
-        DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset);
-        baseIndexOffset += 3; outsidePoint++;
-    }
-    int p;
-    switch( diagonals )
-    {
-    case DIAGONALS_INSIDE_TO_OUTSIDE:
-        // Diagonals pointing from inside edge forward towards outside edge
-        for( p = 0; p < numInsideEdgePoints-1; p++ )
-        {
-            DefineClockwiseTriangle(insidePoint,outsidePoint,outsidePoint+1,baseIndexOffset);
-            baseIndexOffset += 3;
-
-            DefineClockwiseTriangle(insidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset);
-            baseIndexOffset += 3;
-            insidePoint++; outsidePoint++;
-        }
-        break;
-    case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE: // Assumes ODD tessellation
-        // Diagonals pointing from outside edge forward towards inside edge
-
-        // First half
-        for( p = 0; p < numInsideEdgePoints/2-1; p++ )
-        {
-            DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset);
-            baseIndexOffset += 3;
-            DefineClockwiseTriangle(insidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset);
-            baseIndexOffset += 3;
-            insidePoint++; outsidePoint++;
-        }
-
-        // Middle
-        DefineClockwiseTriangle(outsidePoint,insidePoint+1,insidePoint,baseIndexOffset);
-        baseIndexOffset += 3;
-        DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset);
-        baseIndexOffset += 3;
-        insidePoint++; outsidePoint++; p+=2;
-
-        // Second half
-        for( ; p < numInsideEdgePoints; p++ )
-        {
-            DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset);
-            baseIndexOffset += 3;
-            DefineClockwiseTriangle(insidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset);
-            baseIndexOffset += 3;
-            insidePoint++; outsidePoint++;
-        }
-        break;
-    case DIAGONALS_MIRRORED:
-        // First half, diagonals pointing from outside of outside edge to inside of inside edge
-        for( p = 0; p < numInsideEdgePoints/2; p++ )
-        {
-            DefineClockwiseTriangle(outsidePoint,insidePoint+1,insidePoint,baseIndexOffset);
-            baseIndexOffset += 3;
-            DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset);
-            baseIndexOffset += 3;
-            insidePoint++; outsidePoint++;
-        }
-        // Second half, diagonals pointing from inside of inside edge to outside of outside edge
-        for( ; p < numInsideEdgePoints-1; p++ )
-        {
-            DefineClockwiseTriangle(insidePoint,outsidePoint,outsidePoint+1,baseIndexOffset);
-            baseIndexOffset += 3;
-            DefineClockwiseTriangle(insidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset);
-            baseIndexOffset += 3;
-            insidePoint++; outsidePoint++;
-        }
-        break;
-    }
-    if( bTrapezoid )
-    {
-        DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset);
-        baseIndexOffset += 3;
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::StitchTransition()
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHWTessellator::StitchTransition(int baseIndexOffset,
-                                    int insideEdgePointBaseOffset, int insideNumHalfTessFactorPoints,
-                                    TESSELLATOR_PARITY insideEdgeTessFactorParity,
-                                    int outsideEdgePointBaseOffset, int outsideNumHalfTessFactorPoints,
-                                    TESSELLATOR_PARITY outsideTessFactorParity
-)
-{
-
-#ifdef ALLOW_XBOX_360_COMPARISON
-    // Tables to assist in the stitching of 2 rows of points having arbitrary TessFactors.
-    // The stitching order is governed by Ruler Function vertex split ordering (see external documentation).
-    //
-    // The contents of the finalPointPositionTable are where vertex i [0..32] ends up on the half-edge
-    // at the max tessellation amount given ruler-function split order.
-    // Recall the other half of an edge is mirrored, so we only need to deal with one half.
-    // This table is used to decide when to advance a point on the interior or exterior.
-    // It supports odd TessFactor up to 65 and even TessFactor up to 64.
-    static const int _finalPointPositionTable[33] =
-            { 0, 32, 16, 8, 17, 4, 18, 9, 19, 2, 20, 10, 21, 5, 22, 11, 23,
-              1, 24, 12, 25, 6, 26, 13, 27, 3, 28, 14, 29, 7, 30, 15, 31 };
-    // The loopStart and loopEnd tables below just provide optimal loop bounds for the
-    // stitching algorithm further below, for any given halfTssFactor.
-    // There is probably a better way to encode this...
-
-    // loopStart[halfTessFactor] encodes the FIRST entry other that [0] in finalPointPositionTable[] above which is
-    // less than halfTessFactor.  Exceptions are entry 0 and 1, which are set up to skip the loop.
-    static const int _loopStart[33] =
-            {1,1,17,9,9,5,5,5,5,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2};
-    // loopStart[halfTessFactor] encodes the LAST entry in finalPointPositionTable[] above which is
-    // less than halfTessFactor.  Exceptions are entry 0 and 1, which are set up to skip the loop.
-    static const int _loopEnd[33] =
-            {0,0,17,17,25,25,25,25,29,29,29,29,29,29,29,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,32};
-    const int* finalPointPositionTable;
-    const int* loopStart;
-    const int* loopEnd;
-    if( m_bXBox360Mode )
-    {
-        // The XBox360 vertex introduction order is always from the center of the edge.
-        // So the final positions of points on the half-edge are this trivial table.
-        static const int XBOXfinalPointPositionTable[33] =
-                { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
-                  18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 };
-        // loopStart and loopEnd (meaning described above) also become trivial for XBox360 splitting.
-        static const int XBOXloopStart[33] =
-                {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
-        static const int XBOXloopEnd[33] =
-                {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
-
-        finalPointPositionTable = XBOXfinalPointPositionTable;
-        loopStart = XBOXloopStart;
-        loopEnd = XBOXloopEnd;
-    }
-    else
-    {
-        finalPointPositionTable = _finalPointPositionTable;
-        loopStart = _loopStart;
-        loopEnd =_loopEnd;
-    }
-#else
-    // Tables to assist in the stitching of 2 rows of points having arbitrary TessFactors.
-    // The stitching order is governed by Ruler Function vertex split ordering (see external documentation).
-    //
-    // The contents of the finalPointPositionTable are where vertex i [0..33] ends up on the half-edge
-    // at the max tessellation amount given ruler-function split order.
-    // Recall the other half of an edge is mirrored, so we only need to deal with one half.
-    // This table is used to decide when to advance a point on the interior or exterior.
-    // It supports odd TessFactor up to 65 and even TessFactor up to 64.
-    static const int finalPointPositionTable[33] =
-            { 0, 32, 16, 8, 17, 4, 18, 9, 19, 2, 20, 10, 21, 5, 22, 11, 23,
-              1, 24, 12, 25, 6, 26, 13, 27, 3, 28, 14, 29, 7, 30, 15, 31 };
-
-    // The loopStart and loopEnd tables below just provide optimal loop bounds for the
-    // stitching algorithm further below, for any given halfTssFactor.
-    // There is probably a better way to encode this...
-
-    // loopStart[halfTessFactor] encodes the FIRST entry in finalPointPositionTable[] above which is
-    // less than halfTessFactor.  Exceptions are entry 0 and 1, which are set up to skip the loop.
-    static const int loopStart[33] =
-            {1,1,17,9,9,5,5,5,5,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2};
-    // loopStart[halfTessFactor] encodes the LAST entry in finalPointPositionTable[] above which is
-    // less than halfTessFactor.  Exceptions are entry 0 and 1, which are set up to skip the loop.
-    static const int loopEnd[33] =
-            {0,0,17,17,25,25,25,25,29,29,29,29,29,29,29,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,32};
-#endif
-    if( TESSELLATOR_PARITY_ODD == insideEdgeTessFactorParity )
-    {
-        insideNumHalfTessFactorPoints -= 1;
-    }
-    if( TESSELLATOR_PARITY_ODD == outsideTessFactorParity )
-    {
-        outsideNumHalfTessFactorPoints -= 1;
-    }
-    // Walk first half
-    int outsidePoint = outsideEdgePointBaseOffset;
-    int insidePoint = insideEdgePointBaseOffset;
-
-    // iStart,iEnd are a small optimization so the loop below doesn't have to go from 0 up to 31
-    int iStart = min(loopStart[insideNumHalfTessFactorPoints],loopStart[outsideNumHalfTessFactorPoints]);
-    int iEnd = max(loopEnd[insideNumHalfTessFactorPoints],loopEnd[outsideNumHalfTessFactorPoints]);
-
-    if( finalPointPositionTable[0] < outsideNumHalfTessFactorPoints ) // since we dont' start the loop at 0 below, we need a special case.
-    {
-        // Advance outside
-        DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset);
-        baseIndexOffset += 3; outsidePoint++;
-    }
-
-    for(int i = iStart; i <= iEnd; i++)
-    {
-        if( /*(i>0) && <-- not needed since iStart is never 0*/(finalPointPositionTable[i] < insideNumHalfTessFactorPoints))
-        {
-            // Advance inside
-            DefineClockwiseTriangle(insidePoint,outsidePoint,insidePoint+1,baseIndexOffset);
-            baseIndexOffset += 3; insidePoint++;
-        }
-        if((finalPointPositionTable[i] < outsideNumHalfTessFactorPoints))
-        {
-            // Advance outside
-            DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset);
-            baseIndexOffset += 3; outsidePoint++;
-        }
-    }
-
-    if( (insideEdgeTessFactorParity != outsideTessFactorParity) || (insideEdgeTessFactorParity == TESSELLATOR_PARITY_ODD))
-    {
-        if( insideEdgeTessFactorParity == outsideTessFactorParity )
-        {
-            // Quad in the middle
-            DefineClockwiseTriangle(insidePoint,outsidePoint,insidePoint+1,baseIndexOffset);
-            baseIndexOffset += 3;
-            DefineClockwiseTriangle(insidePoint+1,outsidePoint,outsidePoint+1,baseIndexOffset);
-            baseIndexOffset += 3;
-            insidePoint++;
-            outsidePoint++;
-        }
-        else if( TESSELLATOR_PARITY_EVEN == insideEdgeTessFactorParity )
-        {
-            // Triangle pointing inside
-            DefineClockwiseTriangle(insidePoint,outsidePoint,outsidePoint+1,baseIndexOffset);
-            baseIndexOffset += 3;
-            outsidePoint++;
-        }
-        else
-        {
-            // Triangle pointing outside
-            DefineClockwiseTriangle(insidePoint,outsidePoint,insidePoint+1,baseIndexOffset);
-            baseIndexOffset += 3;
-            insidePoint++;
-        }
-    }
-
-    // Walk second half.
-    for(int i = iEnd; i >= iStart; i--)
-    {
-        if((finalPointPositionTable[i] < outsideNumHalfTessFactorPoints))
-        {
-            // Advance outside
-            DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset);
-            baseIndexOffset += 3; outsidePoint++;
-        }
-        if( /*(i>0) && <-- not needed since iStart is never 0*/ (finalPointPositionTable[i] < insideNumHalfTessFactorPoints))
-        {
-            // Advance inside
-            DefineClockwiseTriangle(insidePoint,outsidePoint,insidePoint+1,baseIndexOffset);
-            baseIndexOffset += 3; insidePoint++;
-        }
-    }
-    // Below case is not needed if we didn't optimize loop above and made it run from 31 down to 0.
-    if((finalPointPositionTable[0] < outsideNumHalfTessFactorPoints))
-    {
-        DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset);
-        baseIndexOffset += 3; outsidePoint++;
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHWTessellator::PatchIndexValue()
-//--------------------------------------------------------------------------------------------------------------------------------
-int CHWTessellator::PatchIndexValue(int index)
-{
-    if( m_bUsingPatchedIndices )
-    {
-        if( index >= m_IndexPatchContext.outsidePointIndexPatchBase ) // assumed remapped outide indices are > remapped inside vertices
-        {
-            if( index == m_IndexPatchContext.outsidePointIndexBadValue )
-                index = m_IndexPatchContext.outsidePointIndexReplacementValue;
-            else
-                index += m_IndexPatchContext.outsidePointIndexDeltaToRealValue;
-        }
-        else
-        {
-            if( index == m_IndexPatchContext.insidePointIndexBadValue )
-                index = m_IndexPatchContext.insidePointIndexReplacementValue;
-            else
-                index += m_IndexPatchContext.insidePointIndexDeltaToRealValue;
-        }
-    }
-    else if( m_bUsingPatchedIndices2 )
-    {
-        if( index >= m_IndexPatchContext2.baseIndexToInvert )
-        {
-            if( index == m_IndexPatchContext2.cornerCaseBadValue )
-            {
-                index = m_IndexPatchContext2.cornerCaseReplacementValue;
-            }
-            else
-            {
-                index = m_IndexPatchContext2.indexInversionEndPoint - index;
-            }
-        }
-        else if( index == m_IndexPatchContext2.cornerCaseBadValue )
-        {
-            index = m_IndexPatchContext2.cornerCaseReplacementValue;
-        }
-    }
-    return index;
-}
-
-
-//=================================================================================================================================
-// CHLSLTessellator
-//=================================================================================================================================
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHLSLTessellator::CHLSLTessellator
-//---------------------------------------------------------------------------------------------------------------------------------
-CHLSLTessellator::CHLSLTessellator()
-{
-    m_LastComputedTessFactors[0] = m_LastComputedTessFactors[1] = m_LastComputedTessFactors[2] =
-    m_LastComputedTessFactors[3] = m_LastComputedTessFactors[4] = m_LastComputedTessFactors[5] = 0;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHLSLTessellator::Init
-// User calls this.
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHLSLTessellator::Init(
-    D3D11_TESSELLATOR_PARTITIONING       partitioning,
-    D3D11_TESSELLATOR_REDUCTION          insideTessFactorReduction,
-    D3D11_TESSELLATOR_QUAD_REDUCTION_AXIS quadInsideTessFactorReductionAxis,
-    D3D11_TESSELLATOR_OUTPUT_PRIMITIVE   outputPrimitive)
-{
-    CHWTessellator::Init(partitioning,outputPrimitive);
-    m_LastComputedTessFactors[0] = m_LastComputedTessFactors[1] = m_LastComputedTessFactors[2] =
-    m_LastComputedTessFactors[3] = m_LastComputedTessFactors[4] = m_LastComputedTessFactors[5] = 0;
-    m_partitioning = partitioning;
-    m_originalPartitioning = partitioning;
-    switch( partitioning )
-    {
-    case D3D11_TESSELLATOR_PARTITIONING_INTEGER:
-    default:
-        break;
-    case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD:
-        m_parity = TESSELLATOR_PARITY_ODD;
-        break;
-    case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN:
-        m_parity = TESSELLATOR_PARITY_EVEN;
-        break;
-    }
-    m_originalParity = m_parity;
-    m_outputPrimitive = outputPrimitive;
-    m_insideTessFactorReduction = insideTessFactorReduction;
-    m_quadInsideTessFactorReductionAxis = quadInsideTessFactorReductionAxis;
-}
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHLSLTessellator::TessellateQuadDomain
-// User calls this
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHLSLTessellator::TessellateQuadDomain( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Ueq1, float tessFactor_Veq1,
-                                         float insideTessFactorScaleU, float insideTessFactorScaleV )
-{
-    QuadHLSLProcessTessFactors(tessFactor_Ueq0,tessFactor_Veq0,tessFactor_Ueq1,tessFactor_Veq1,insideTessFactorScaleU,insideTessFactorScaleV);
-
-    CHWTessellator::TessellateQuadDomain(m_LastComputedTessFactors[0],m_LastComputedTessFactors[1],m_LastComputedTessFactors[2],m_LastComputedTessFactors[3],
-                                         m_LastComputedTessFactors[4],m_LastComputedTessFactors[5]);
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHLSLTessellator::QuadHLSLProcessTessFactors
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHLSLTessellator::QuadHLSLProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Ueq1, float tessFactor_Veq1,
-                                               float insideTessFactorScaleU, float insideTessFactorScaleV )
-{
-    if( !(tessFactor_Ueq0 > 0) ||// NaN will pass
-        !(tessFactor_Veq0 > 0) ||
-        !(tessFactor_Ueq1 > 0) ||
-        !(tessFactor_Veq1 > 0) )
-    {
-        m_LastUnRoundedComputedTessFactors[0] = tessFactor_Ueq0;
-        m_LastUnRoundedComputedTessFactors[1] = tessFactor_Veq0;
-        m_LastUnRoundedComputedTessFactors[2] = tessFactor_Ueq1;
-        m_LastUnRoundedComputedTessFactors[3] = tessFactor_Veq1;
-        m_LastUnRoundedComputedTessFactors[4] = 0;
-        m_LastUnRoundedComputedTessFactors[5] = 0;
-        m_LastComputedTessFactors[0] =
-        m_LastComputedTessFactors[1] =
-        m_LastComputedTessFactors[2] =
-        m_LastComputedTessFactors[3] =
-        m_LastComputedTessFactors[4] =
-        m_LastComputedTessFactors[5] = 0;
-        return;
-    }
-
-    CleanupFloatTessFactor(tessFactor_Ueq0);// clamp to [1.0f..INF], NaN->1.0f
-    CleanupFloatTessFactor(tessFactor_Veq0);
-    CleanupFloatTessFactor(tessFactor_Ueq1);
-    CleanupFloatTessFactor(tessFactor_Veq1);
-
-    // Save off tessFactors so they can be returned to app
-    m_LastUnRoundedComputedTessFactors[0] = tessFactor_Ueq0;
-    m_LastUnRoundedComputedTessFactors[1] = tessFactor_Veq0;
-    m_LastUnRoundedComputedTessFactors[2] = tessFactor_Ueq1;
-    m_LastUnRoundedComputedTessFactors[3] = tessFactor_Veq1;
-
-    // Process outside tessFactors
-    float outsideTessFactor[QUAD_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, tessFactor_Ueq1, tessFactor_Veq1};
-    int edge, axis;
-    TESSELLATOR_PARITY insideTessFactorParity[QUAD_AXES];
-    if( Pow2Partitioning() || IntegerPartitioning() )
-    {
-        for( edge = 0; edge < QUAD_EDGES; edge++ )
-        {
-            RoundUpTessFactor(outsideTessFactor[edge]);
-            ClampTessFactor(outsideTessFactor[edge]); // clamp unbounded user input based on tessellation mode
-        }
-    }
-    else
-    {
-        SetTessellationParity(m_originalParity); // ClampTessFactor needs it
-        for( edge = 0; edge < QUAD_EDGES; edge++ )
-        {
-            ClampTessFactor(outsideTessFactor[edge]); // clamp unbounded user input based on tessellation mode
-        }
-    }
-
-    // Compute inside TessFactors
-    float insideTessFactor[QUAD_AXES] = {0.0};
-    if( m_quadInsideTessFactorReductionAxis == D3D11_TESSELLATOR_QUAD_REDUCTION_1_AXIS )
-    {
-        switch( m_insideTessFactorReduction )
-        {
-        case D3D11_TESSELLATOR_REDUCTION_MIN:
-            insideTessFactor[U] = tess_fmin(tess_fmin(tessFactor_Veq0,tessFactor_Veq1),tess_fmin(tessFactor_Ueq0,tessFactor_Ueq1));
-            break;
-        case D3D11_TESSELLATOR_REDUCTION_MAX:
-            insideTessFactor[U] = tess_fmax(tess_fmax(tessFactor_Veq0,tessFactor_Veq1),tess_fmax(tessFactor_Ueq0,tessFactor_Ueq1));
-            break;
-        case D3D11_TESSELLATOR_REDUCTION_AVERAGE:
-            insideTessFactor[U] = (tessFactor_Veq0 + tessFactor_Veq1 + tessFactor_Ueq0 + tessFactor_Ueq1) / 4;
-            break;
-        }
-        // Scale inside tessFactor based on user scale factor.
-
-        ClampFloatTessFactorScale(insideTessFactorScaleU); // clamp scale value to [0..1], NaN->0
-        insideTessFactor[U] = insideTessFactor[U]*insideTessFactorScaleU;
-
-        // Compute inside parity
-        if( Pow2Partitioning() || IntegerPartitioning() )
-        {
-            ClampTessFactor(insideTessFactor[U]); // clamp reduction + scale result that is based on unbounded user input
-            m_LastUnRoundedComputedTessFactors[4] = m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app
-            RoundUpTessFactor(insideTessFactor[U]);
-            insideTessFactorParity[U] =
-            insideTessFactorParity[V] =
-                (isEven(insideTessFactor[U]) || (FLOAT_ONE == insideTessFactor[U]) )
-                ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
-        }
-        else
-        {
-            ClampTessFactor(insideTessFactor[U]); // clamp reduction + scale result that is based on unbounded user input
-            m_LastUnRoundedComputedTessFactors[4] = m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app
-            // no parity changes for fractional tessellation - just use what the user requested
-            insideTessFactorParity[U] = insideTessFactorParity[V] = m_originalParity;
-        }
-
-        // To prevent snapping on edges, the "picture frame" comes
-        // in using avg or max (and ignore inside TessFactor scaling) until it is at least 3.
-        if( (TESSELLATOR_PARITY_ODD == insideTessFactorParity[U]) &&
-            (insideTessFactor[U] < FLOAT_THREE) )
-        {
-            if(D3D11_TESSELLATOR_REDUCTION_MAX == m_insideTessFactorReduction)
-            {
-                insideTessFactor[U] = tess_fmin(FLOAT_THREE,tess_fmax(tess_fmax(tessFactor_Veq0,tessFactor_Veq1),tess_fmax(tessFactor_Ueq0,tessFactor_Ueq1)));
-            }
-            else
-            {
-                insideTessFactor[U] = tess_fmin(FLOAT_THREE,(tessFactor_Veq0 + tessFactor_Veq1 + tessFactor_Ueq0 + tessFactor_Ueq1) / 4);
-            }
-            ClampTessFactor(insideTessFactor[U]); // clamp reduction result that is based on unbounded user input
-            m_LastUnRoundedComputedTessFactors[4] = m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app
-            if( IntegerPartitioning())
-            {
-                RoundUpTessFactor(insideTessFactor[U]);
-                insideTessFactorParity[U] =
-                insideTessFactorParity[V] = isEven(insideTessFactor[U]) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
-            }
-        }
-        insideTessFactor[V] = insideTessFactor[U];
-    }
-    else
-    {
-        switch( m_insideTessFactorReduction )
-        {
-        case D3D11_TESSELLATOR_REDUCTION_MIN:
-            insideTessFactor[U] = tess_fmin(tessFactor_Veq0,tessFactor_Veq1);
-            insideTessFactor[V] = tess_fmin(tessFactor_Ueq0,tessFactor_Ueq1);
-            break;
-        case D3D11_TESSELLATOR_REDUCTION_MAX:
-            insideTessFactor[U] = tess_fmax(tessFactor_Veq0,tessFactor_Veq1);
-            insideTessFactor[V] = tess_fmax(tessFactor_Ueq0,tessFactor_Ueq1);
-            break;
-        case D3D11_TESSELLATOR_REDUCTION_AVERAGE:
-            insideTessFactor[U] = (tessFactor_Veq0 + tessFactor_Veq1) / 2;
-            insideTessFactor[V] = (tessFactor_Ueq0 + tessFactor_Ueq1) / 2;
-            break;
-        }
-        // Scale inside tessFactors based on user scale factor.
-
-        ClampFloatTessFactorScale(insideTessFactorScaleU); // clamp scale value to [0..1], NaN->0
-        ClampFloatTessFactorScale(insideTessFactorScaleV);
-        insideTessFactor[U] = insideTessFactor[U]*insideTessFactorScaleU;
-        insideTessFactor[V] = insideTessFactor[V]*insideTessFactorScaleV;
-
-        // Compute inside parity
-        if( Pow2Partitioning() || IntegerPartitioning() )
-        {
-            for( axis = 0; axis < QUAD_AXES; axis++ )
-            {
-                ClampTessFactor(insideTessFactor[axis]); // clamp reduction + scale result that is based on unbounded user input
-                m_LastUnRoundedComputedTessFactors[4+axis] = insideTessFactor[axis]; // Save off TessFactors so they can be returned to app
-                RoundUpTessFactor(insideTessFactor[axis]);
-                insideTessFactorParity[axis] =
-                    (isEven(insideTessFactor[axis]) || (FLOAT_ONE == insideTessFactor[axis]) )
-                    ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
-            }
-        }
-        else
-        {
-            ClampTessFactor(insideTessFactor[U]); // clamp reduction + scale result that is based on unbounded user input
-            ClampTessFactor(insideTessFactor[V]); // clamp reduction + scale result that is based on unbounded user input
-            m_LastUnRoundedComputedTessFactors[4] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app
-            m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[V]; // Save off TessFactors so they can be returned to app
-             // no parity changes for fractional tessellation - just use what the user requested
-            insideTessFactorParity[U] = insideTessFactorParity[V] = m_originalParity;
-        }
-
-        // To prevent snapping on edges, the "picture frame" comes
-        // in using avg or max (and ignore inside TessFactor scaling) until it is at least 3.
-        if( (TESSELLATOR_PARITY_ODD == insideTessFactorParity[U]) &&
-            (insideTessFactor[U] < FLOAT_THREE) )
-        {
-            if(D3D11_TESSELLATOR_REDUCTION_MAX == m_insideTessFactorReduction)
-            {
-                insideTessFactor[U] = tess_fmin(FLOAT_THREE,tess_fmax(tessFactor_Veq0,tessFactor_Veq1));
-            }
-            else
-            {
-                insideTessFactor[U] = tess_fmin(FLOAT_THREE,(tessFactor_Veq0 + tessFactor_Veq1) / 2);
-            }
-            ClampTessFactor(insideTessFactor[U]); // clamp reduction result that is based on unbounded user input
-            m_LastUnRoundedComputedTessFactors[4] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app
-            if( IntegerPartitioning())
-            {
-                RoundUpTessFactor(insideTessFactor[U]);
-                insideTessFactorParity[U] = isEven(insideTessFactor[U]) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
-            }
-        }
-
-        if( (TESSELLATOR_PARITY_ODD == insideTessFactorParity[V]) &&
-            (insideTessFactor[V] < FLOAT_THREE) )
-        {
-            if(D3D11_TESSELLATOR_REDUCTION_MAX == m_insideTessFactorReduction)
-            {
-                insideTessFactor[V] = tess_fmin(FLOAT_THREE,tess_fmax(tessFactor_Ueq0,tessFactor_Ueq1));
-            }
-            else
-            {
-                insideTessFactor[V] = tess_fmin(FLOAT_THREE,(tessFactor_Ueq0 + tessFactor_Ueq1) / 2);
-            }
-            ClampTessFactor(insideTessFactor[V]);// clamp reduction result that is based on unbounded user input
-            m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[V]; // Save off TessFactors so they can be returned to app
-            if( IntegerPartitioning())
-            {
-                RoundUpTessFactor(insideTessFactor[V]);
-                insideTessFactorParity[V] = isEven(insideTessFactor[V]) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
-            }
-        }
-
-        for( axis = 0; axis < QUAD_AXES; axis++ )
-        {
-            if( TESSELLATOR_PARITY_ODD == insideTessFactorParity[axis] )
-            {
-                // Ensure the first ring ("picture frame") interpolates in on all sides
-                // as much as the side with the minimum TessFactor.  Prevents snapping to edge.
-                if( (insideTessFactor[axis] < FLOAT_THREE) && (insideTessFactor[axis] < insideTessFactor[(axis+1)&0x1]))
-                {
-                    insideTessFactor[axis] = tess_fmin(insideTessFactor[(axis+1)&0x1],FLOAT_THREE);
-                    m_LastUnRoundedComputedTessFactors[4+axis] = insideTessFactor[axis]; // Save off TessFactors so they can be returned to app
-                }
-            }
-        }
-    }
-
-    // Save off TessFactors so they can be returned to app
-    m_LastComputedTessFactors[0] = outsideTessFactor[Ueq0];
-    m_LastComputedTessFactors[1] = outsideTessFactor[Veq0];
-    m_LastComputedTessFactors[2] = outsideTessFactor[Ueq1];
-    m_LastComputedTessFactors[3] = outsideTessFactor[Veq1];
-    m_LastComputedTessFactors[4] = insideTessFactor[U];
-    m_LastComputedTessFactors[5] = insideTessFactor[V];
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHLSLTessellator::TessellateTriDomain
-// User calls this
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHLSLTessellator::TessellateTriDomain( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Weq0,
-                                        float insideTessFactorScale )
-{
-    TriHLSLProcessTessFactors(tessFactor_Ueq0,tessFactor_Veq0,tessFactor_Weq0,insideTessFactorScale);
-
-    CHWTessellator::TessellateTriDomain(m_LastComputedTessFactors[0],m_LastComputedTessFactors[1],m_LastComputedTessFactors[2],m_LastComputedTessFactors[3]);
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHLSLTessellator::TriHLSLProcessTessFactors
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHLSLTessellator::TriHLSLProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Weq0,
-                                  float insideTessFactorScale )
-{
-    if( !(tessFactor_Ueq0 > 0) || // NaN will pass
-        !(tessFactor_Veq0 > 0) ||
-        !(tessFactor_Weq0 > 0) )
-    {
-        m_LastUnRoundedComputedTessFactors[0] = tessFactor_Ueq0;
-        m_LastUnRoundedComputedTessFactors[1] = tessFactor_Veq0;
-        m_LastUnRoundedComputedTessFactors[2] = tessFactor_Weq0;
-        m_LastUnRoundedComputedTessFactors[3] =
-        m_LastComputedTessFactors[0] =
-        m_LastComputedTessFactors[1] =
-        m_LastComputedTessFactors[2] =
-        m_LastComputedTessFactors[3] = 0;
-        return;
-    }
-
-    CleanupFloatTessFactor(tessFactor_Ueq0); // clamp to [1.0f..INF], NaN->1.0f
-    CleanupFloatTessFactor(tessFactor_Veq0);
-    CleanupFloatTessFactor(tessFactor_Weq0);
-
-    // Save off TessFactors so they can be returned to app
-    m_LastUnRoundedComputedTessFactors[0] = tessFactor_Ueq0;
-    m_LastUnRoundedComputedTessFactors[1] = tessFactor_Veq0;
-    m_LastUnRoundedComputedTessFactors[2] = tessFactor_Weq0;
-
-    // Process outside TessFactors
-    float outsideTessFactor[TRI_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, tessFactor_Weq0};
-    int edge;
-    if( Pow2Partitioning() || IntegerPartitioning() )
-    {
-        for( edge = 0; edge < TRI_EDGES; edge++ )
-        {
-            RoundUpTessFactor(outsideTessFactor[edge]); // for pow2 this rounds to pow2
-            ClampTessFactor(outsideTessFactor[edge]); // clamp unbounded user input based on tessellation mode
-        }
-    }
-    else
-    {
-        for( edge = 0; edge < TRI_EDGES; edge++ )
-        {
-            ClampTessFactor(outsideTessFactor[edge]); // clamp unbounded user input based on tessellation mode
-        }
-    }
-
-    // Compute inside TessFactor
-    float insideTessFactor = 0.0;
-    switch( m_insideTessFactorReduction )
-    {
-    case D3D11_TESSELLATOR_REDUCTION_MIN:
-        insideTessFactor = tess_fmin(tess_fmin(tessFactor_Ueq0,tessFactor_Veq0),tessFactor_Weq0);
-        break;
-    case D3D11_TESSELLATOR_REDUCTION_MAX:
-        insideTessFactor = tess_fmax(tess_fmax(tessFactor_Ueq0,tessFactor_Veq0),tessFactor_Weq0);
-        break;
-    case D3D11_TESSELLATOR_REDUCTION_AVERAGE:
-        insideTessFactor = (tessFactor_Ueq0 + tessFactor_Veq0 + tessFactor_Weq0) / 3;
-        break;
-    }
-
-    // Scale inside TessFactor based on user scale factor.
-    ClampFloatTessFactorScale(insideTessFactorScale); // clamp scale value to [0..1], NaN->0
-    insideTessFactor = insideTessFactor*tess_fmin(FLOAT_ONE,insideTessFactorScale);
-
-    ClampTessFactor(insideTessFactor); // clamp reduction + scale result that is based on unbounded user input
-    m_LastUnRoundedComputedTessFactors[3] = insideTessFactor;// Save off TessFactors so they can be returned to app
-    TESSELLATOR_PARITY parity;
-    if( Pow2Partitioning() || IntegerPartitioning() )
-    {
-        RoundUpTessFactor(insideTessFactor);
-        parity = (isEven(insideTessFactor) || (FLOAT_ONE == insideTessFactor))
-                                        ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
-    }
-    else
-    {
-        parity = m_originalParity;
-    }
-
-    if( (TESSELLATOR_PARITY_ODD == parity) &&
-        (insideTessFactor < FLOAT_THREE))
-    {
-        // To prevent snapping on edges, the "picture frame" comes
-        // in using avg or max (and ignore inside TessFactor scaling) until it is at least 3.
-        if(D3D11_TESSELLATOR_REDUCTION_MAX == m_insideTessFactorReduction)
-        {
-            insideTessFactor = tess_fmin(FLOAT_THREE,tess_fmax(tessFactor_Ueq0,tess_fmax(tessFactor_Veq0,tessFactor_Weq0)));
-        }
-        else
-        {
-            insideTessFactor = tess_fmin(FLOAT_THREE,(tessFactor_Ueq0 + tessFactor_Veq0 + tessFactor_Weq0) / 3);
-        }
-        ClampTessFactor(insideTessFactor); // clamp reduction result that is based on unbounded user input
-        m_LastUnRoundedComputedTessFactors[3] = insideTessFactor;// Save off TessFactors so they can be returned to app
-        if( IntegerPartitioning())
-        {
-            RoundUpTessFactor(insideTessFactor);
-        }
-    }
-
-    // Save off TessFactors so they can be returned to app
-    m_LastComputedTessFactors[0] = outsideTessFactor[Ueq0];
-    m_LastComputedTessFactors[1] = outsideTessFactor[Veq0];
-    m_LastComputedTessFactors[2] = outsideTessFactor[Weq0];
-    m_LastComputedTessFactors[3] = insideTessFactor;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHLSLTessellator::TessellateIsoLineDomain
-// User calls this.
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHLSLTessellator::TessellateIsoLineDomain( float TessFactor_U_LineDetail, float TessFactor_V_LineDensity )
-{
-    IsoLineHLSLProcessTessFactors(TessFactor_V_LineDensity,TessFactor_U_LineDetail);
-    CHWTessellator::TessellateIsoLineDomain(m_LastComputedTessFactors[0],m_LastComputedTessFactors[1]);
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHLSLTessellator::IsoLineHLSLProcessTessFactors
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHLSLTessellator::IsoLineHLSLProcessTessFactors( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail )
-{
-    if( !(TessFactor_V_LineDensity > 0) || // NaN will pass
-        !(TessFactor_U_LineDetail > 0) )
-    {
-        m_LastUnRoundedComputedTessFactors[0] = TessFactor_V_LineDensity;
-        m_LastUnRoundedComputedTessFactors[1] = TessFactor_U_LineDetail;
-        m_LastComputedTessFactors[0] =
-        m_LastComputedTessFactors[1] = 0;
-        return;
-    }
-
-    CleanupFloatTessFactor(TessFactor_V_LineDensity); // clamp to [1.0f..INF], NaN->1.0f
-    CleanupFloatTessFactor(TessFactor_U_LineDetail); // clamp to [1.0f..INF], NaN->1.0f
-
-    ClampTessFactor(TessFactor_U_LineDetail); // clamp unbounded user input based on tessellation mode
-
-    m_LastUnRoundedComputedTessFactors[1] = TessFactor_U_LineDetail;    // Save off TessFactors so they can be returned to app
-
-    if(Pow2Partitioning()||IntegerPartitioning())
-    {
-        RoundUpTessFactor(TessFactor_U_LineDetail);
-    }
-
-    OverridePartitioning(D3D11_TESSELLATOR_PARTITIONING_INTEGER);
-
-    ClampTessFactor(TessFactor_V_LineDensity); // Clamp unbounded user input to integer
-    m_LastUnRoundedComputedTessFactors[0] = TessFactor_V_LineDensity;    // Save off TessFactors so they can be returned to app
-
-    RoundUpTessFactor(TessFactor_V_LineDensity);
-
-    RestorePartitioning();
-
-    // Save off TessFactors so they can be returned to app
-    m_LastComputedTessFactors[0] = TessFactor_V_LineDensity;
-    m_LastComputedTessFactors[1] = TessFactor_U_LineDetail;
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHLSLTessellator::ClampTessFactor()
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHLSLTessellator::ClampTessFactor(float& TessFactor)
-{
-    if( Pow2Partitioning() )
-    {
-        TessFactor = tess_fmin( D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR, tess_fmax( TessFactor, D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR) );
-    }
-    else if( IntegerPartitioning() )
-    {
-        TessFactor = tess_fmin( D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR, tess_fmax( TessFactor, D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR) );
-    }
-    else if( Odd() )
-    {
-        TessFactor = tess_fmin( D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR, tess_fmax( TessFactor, D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR) );
-    }
-    else // even
-    {
-        TessFactor = tess_fmin( D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR, tess_fmax( TessFactor, D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR) );
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHLSLTessellator::CleanupFloatTessFactor()
-//---------------------------------------------------------------------------------------------------------------------------------
-static const int exponentMask = 0x7f800000;
-static const int mantissaMask = 0x007fffff;
-void CHLSLTessellator::CleanupFloatTessFactor(float& input)
-{
-    // If input is < 1.0f or NaN, clamp to 1.0f.
-    // In other words, clamp input to [1.0f...+INF]
-    int bits = *(int*)&input;
-    if( ( ( ( bits & exponentMask ) == exponentMask ) && ( bits & mantissaMask ) ) ||// nan?
-        (input < 1.0f) )
-    {
-        input = 1;
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHLSLTessellator::ClampFloatTessFactorScale()
-//---------------------------------------------------------------------------------------------------------------------------------
-void CHLSLTessellator::ClampFloatTessFactorScale(float& input)
-{
-    // If input is < 0.0f or NaN, clamp to 0.0f.  > 1 clamps to 1.
-    // In other words, clamp input to [0.0f...1.0f]
-    int bits = *(int*)&input;
-    if( ( ( ( bits & exponentMask ) == exponentMask ) && ( bits & mantissaMask ) ) ||// nan?
-        (input < 0.0f) )
-    {
-        input = 0;
-    }
-    else if( input > 1 )
-    {
-        input = 1;
-    }
-}
-
-//---------------------------------------------------------------------------------------------------------------------------------
-// CHLSLTessellator::RoundUpTessFactor()
-//---------------------------------------------------------------------------------------------------------------------------------
-static const int exponentLSB = 0x00800000;
-void CHLSLTessellator::RoundUpTessFactor(float& TessFactor)
-{
-    // Assume TessFactor is in [1.0f..+INF]
-    if( Pow2Partitioning() )
-    {
-        int bits = *(int*)&TessFactor;
-        if( bits & mantissaMask )
-        {
-            *(int*)&TessFactor = (bits & exponentMask) + exponentLSB;
-        }
-    }
-    else if( IntegerPartitioning() )
-    {
-        TessFactor = ceil(TessFactor);
-    }
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.h b/src/gallium/drivers/swr/rasterizer/core/tessellator.h
deleted file mode 100644
index 30b6b4fca1e..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/tessellator.h
+++ /dev/null
@@ -1,202 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2019 without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file tessellator.h
- *
- * @brief Tessellator fixed function unit interface definition
- *
- ******************************************************************************/
-#pragma once
-
-#include "tessellator.hpp"
-
-struct SWR_TS_TESSELLATED_DATA
-{
-    uint32_t NumPrimitives;
-    uint32_t NumDomainPoints;
-
-    uint32_t* ppIndices[3];
-    float*    pDomainPointsU;
-    float*    pDomainPointsV;
-    // For Tri: pDomainPointsW[i] = 1.0f - pDomainPointsU[i] - pDomainPointsV[i]
-};
-
-namespace Tessellator
-{
-    /// Wrapper class for the CHWTessellator reference tessellator from MSFT
-    /// This class will store data not originally stored in CHWTessellator
-    class SWR_TS : private CHWTessellator
-    {
-    private:
-        typedef CHWTessellator SUPER;
-        SWR_TS_DOMAIN          Domain;
-        OSALIGNSIMD(float)     DomainPointsU[MAX_POINT_COUNT];
-        OSALIGNSIMD(float)     DomainPointsV[MAX_POINT_COUNT];
-        uint32_t               NumDomainPoints;
-        OSALIGNSIMD(uint32_t)  Indices[3][MAX_INDEX_COUNT / 3];
-        uint32_t               NumIndices;
-
-    public:
-        void Init(SWR_TS_DOMAIN          tsDomain,
-                  SWR_TS_PARTITIONING    tsPartitioning,
-                  SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology)
-        {
-            static D3D11_TESSELLATOR_PARTITIONING CVT_TS_D3D_PARTITIONING[] = {
-                D3D11_TESSELLATOR_PARTITIONING_INTEGER,         // SWR_TS_INTEGER
-                D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD,  // SWR_TS_ODD_FRACTIONAL
-                D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN, // SWR_TS_EVEN_FRACTIONAL
-                D3D11_TESSELLATOR_PARTITIONING_POW2            // SWR_TS_POW2
-            };
-
-            static D3D11_TESSELLATOR_OUTPUT_PRIMITIVE CVT_TS_D3D_OUTPUT_TOPOLOGY[] = {
-                D3D11_TESSELLATOR_OUTPUT_POINT,        // SWR_TS_OUTPUT_POINT
-                D3D11_TESSELLATOR_OUTPUT_LINE,         // SWR_TS_OUTPUT_LINE
-                D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CCW,  // SWR_TS_OUTPUT_TRI_CW - inverted logic, because DX
-                D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW // SWR_TS_OUTPUT_TRI_CCW - inverted logic, because DX
-            };
-
-            SUPER::Init(CVT_TS_D3D_PARTITIONING[tsPartitioning],
-                        CVT_TS_D3D_OUTPUT_TOPOLOGY[tsOutputTopology]);
-
-            Domain          = tsDomain;
-            NumDomainPoints = 0;
-            NumIndices      = 0;
-        }
-
-        void Tessellate(const SWR_TESSELLATION_FACTORS& tsTessFactors,
-                        SWR_TS_TESSELLATED_DATA&        tsTessellatedData)
-        {
-            uint32_t IndexDiv = 0;
-            switch (Domain)
-            {
-            case SWR_TS_QUAD:
-                IndexDiv = 3;
-                SUPER::TessellateQuadDomain(
-                    tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL],
-                    tsTessFactors.OuterTessFactors[SWR_QUAD_V_EQ0_TRI_W],
-                    tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY],
-                    tsTessFactors.OuterTessFactors[SWR_QUAD_V_EQ1],
-                    tsTessFactors.InnerTessFactors[SWR_QUAD_U_TRI_INSIDE],
-                    tsTessFactors.InnerTessFactors[SWR_QUAD_V_INSIDE]);
-                break;
-
-            case SWR_TS_TRI:
-                IndexDiv = 3;
-                SUPER::TessellateTriDomain(
-                    tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL],
-                    tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY],
-                    tsTessFactors.OuterTessFactors[SWR_QUAD_V_EQ0_TRI_W],
-                    tsTessFactors.InnerTessFactors[SWR_QUAD_U_TRI_INSIDE]);
-                break;
-
-            case SWR_TS_ISOLINE:
-                IndexDiv = 2;
-                SUPER::TessellateIsoLineDomain(
-                    tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY],
-                    tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL]);
-                break;
-
-            default:
-                SWR_INVALID("Invalid Tessellation Domain: %d", Domain);
-                assert(false);
-            }
-
-            NumDomainPoints = (uint32_t)SUPER::GetPointCount();
-
-            DOMAIN_POINT* pPoints = SUPER::GetPoints();
-            for (uint32_t i = 0; i < NumDomainPoints; i++) {
-                DomainPointsU[i] = pPoints[i].u;
-                DomainPointsV[i] = pPoints[i].v;
-            }
-            tsTessellatedData.NumDomainPoints = NumDomainPoints;
-            tsTessellatedData.pDomainPointsU  = &DomainPointsU[0];
-            tsTessellatedData.pDomainPointsV  = &DomainPointsV[0];
-
-            NumIndices = (uint32_t)SUPER::GetIndexCount();
-
-            assert(NumIndices % IndexDiv == 0);
-            tsTessellatedData.NumPrimitives = NumIndices / IndexDiv;
-
-            uint32_t* pIndices = (uint32_t*)SUPER::GetIndices();
-            for (uint32_t i = 0; i < NumIndices; i++) {
-                Indices[i % IndexDiv][i / IndexDiv] = pIndices[i];
-            }
-
-            tsTessellatedData.ppIndices[0] = &Indices[0][0];
-            tsTessellatedData.ppIndices[1] = &Indices[1][0];
-            tsTessellatedData.ppIndices[2] = &Indices[2][0];
-        }
-    };
-} // namespace Tessellator
-
-/// Allocate and initialize a new tessellation context
-INLINE HANDLE SWR_API
-              TSInitCtx(SWR_TS_DOMAIN          tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle)
-                        SWR_TS_PARTITIONING    tsPartitioning, ///< [IN] Tessellation partitioning algorithm
-                        SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology
-                        void*                  pContextMem, ///< [IN] Memory to use for the context
-                        size_t& memSize) ///< [INOUT] In: Amount of memory in pContextMem. Out: Mem required
-{
-    using Tessellator::SWR_TS;
-    SWR_ASSERT(tsDomain < SWR_TS_DOMAIN_COUNT);
-    SWR_ASSERT(tsPartitioning < SWR_TS_PARTITIONING_COUNT);
-    SWR_ASSERT(tsOutputTopology < SWR_TS_OUTPUT_TOPOLOGY_COUNT);
-
-    size_t origMemSize = memSize;
-    memSize            = AlignUp(sizeof(SWR_TS), 64);
-
-    if (nullptr == pContextMem || memSize > origMemSize)
-    {
-        return nullptr;
-    }
-
-    HANDLE tsCtx = pContextMem;
-
-    SWR_TS* pTessellator = new (tsCtx) SWR_TS();
-    SWR_ASSERT(pTessellator == tsCtx);
-
-    pTessellator->Init(tsDomain, tsPartitioning, tsOutputTopology);
-
-    return tsCtx;
-}
-
-/// Destroy & de-allocate tessellation context
-INLINE void SWR_API TSDestroyCtx(HANDLE tsCtx) ///< [IN] Tessellation context to be destroyed
-{
-    using Tessellator::SWR_TS;
-    SWR_TS* pTessellator = (SWR_TS*)tsCtx;
-
-    if (pTessellator)
-    {
-        pTessellator->~SWR_TS();
-    }
-}
-
-/// Perform Tessellation
-INLINE void SWR_API
-            TSTessellate(HANDLE                          tsCtx, ///< [IN] Tessellation Context
-                         const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors
-                         SWR_TS_TESSELLATED_DATA&        tsTessellatedData)    ///< [OUT] Tessellated Data
-{
-    using Tessellator::SWR_TS;
-    SWR_TS* pTessellator = (SWR_TS*)tsCtx;
-    SWR_ASSERT(pTessellator);
-
-    pTessellator->Tessellate(tsTessFactors, tsTessellatedData);
-}
-
diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.hpp b/src/gallium/drivers/swr/rasterizer/core/tessellator.hpp
deleted file mode 100644
index 459c1093d2e..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/tessellator.hpp
+++ /dev/null
@@ -1,471 +0,0 @@
-/*
-    Copyright (c) Microsoft Corporation
-
-    Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
-    associated documentation files (the "Software"), to deal in the Software without restriction,
-    including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-    and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
-    subject to the following conditions:
-
-    The above copyright notice and this permission notice shall be included in all copies or substantial
-    portions of the Software.
-
-    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
-    NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#pragma once
-//=================================================================================================================================
-// Microsoft D3D11 Fixed Function Tessellator Reference - May 7, 2012
-// amar.patel@microsoft.com
-//
-// CHWTessellator demonstrates what is expected of hardware in the D3D11 fixed function Tessellator stage.  Hardware
-// implementers need only look at this class.
-//
-// CHLSLTessellator is a wrapper for CHWTessellator, representing the effect of shader code that will
-// be autogenerated by HLSL in the Hull Shader, both for plumbing data around, and to precondition TessFactor values before they
-// are passed to the hardware (such as deriving inside TessFactors from edge TessFactors).  The algorithms used
-// in CHLSLTessellator are subject to change, but since they represent shader code auto-generated by the HLSL compiler,
-// CHLSLTessellator has no effect on hardware design at all.  Note the HLSL compiler will expose all the raw hardware
-// control illustrated by CHWTessellator for those who don't need the helper functionality illustrated by CHLSLTessellator.
-//
-// Usage:        (1) Create either a CHLSLTessellator or CHWTessellator object, depending on which you want to verify.
-//               (2) Call C*Tessellator::Init()
-//               (3) Call C*Tessellator::Tessellate[IsoLine|Tri|Quad]Domain()
-//                      - Here you pass in TessFactors (how much to tessellate)
-//               (4) Call C*Tessellator::GetPointCount(), C*Tessellator::GetIndexCount() to see how much data was generated.
-//               (5) Call C*Tessellator::GetPoints() and C*Tessellator::GetIndices() to get pointers to the data.
-//                   The pointers are fixed for the lifetime of the object (storage for max tessellation),
-//                   so if you ::Tessellate again, the data in the buffers is overwritten.
-//               (6) There are various other Get() methods to retrieve TessFactors that have been processed from
-//                   what you passed in at step 3.  You can retrieve separate TessFactors that the tessellator
-//                   produced after clamping but before rounding, and also after rounding (say in pow2 mode).
-//                   These numbers can be useful information if you are geomorphing displacement maps.
-//               (7) Goto Step 2 or 3 if you want to animate TessFactors or tessellate a different patch
-//
-// Code implementation details:
-//
-// There is lots of headroom to make this code run faster on CPUs.  It was written merely as a reference for
-// what results hardware should produce, with CPU performance not a consideration.  It is nice that this implementation
-// only generates the exact number of vertices needed (no duplicates) in the output vertex buffer.  Also, the number
-// of calculations done for each U/V domain coordinate is minimized by doing some precalculation of some patch or edge
-// invariant numbers (see TESS_FACTOR_CONTEXT).  All the vertex coordinate calculations could be computed with as much
-// parallelism as you like.  Similarly the calculation of connectivity itself is highly parallelizable, and can also
-// be done independent of the vertex calculations.
-//
-//=================================================================================================================================
-
-#define D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR 1
-#define D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR 63
-#define D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR 2
-#define D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR 64
-
-#define D3D11_TESSELLATOR_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR 1
-#define D3D11_TESSELLATOR_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR 64
-
-#define D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR 64 // max of even and odd tessFactors
-
-#define MAX_POINT_COUNT ((D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR+1)*(D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR+1)) // (64+1)*(64+1) = 4225; presumably sizes the point storage - TODO confirm in the .cpp
-#define MAX_INDEX_COUNT (D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR*D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR*2*3) // 64*64*2*3 = 24576; presumably sizes the index storage - TODO confirm in the .cpp
-
-//=================================================================================================================================
-// Data types for the caller
-//=================================================================================================================================
-enum D3D11_TESSELLATOR_PARTITIONING // partitioning argument of C*Tessellator::Init()
-{
-    D3D11_TESSELLATOR_PARTITIONING_INTEGER,
-    D3D11_TESSELLATOR_PARTITIONING_POW2, // CHWTessellator::HWIntegerPartitioning() answers true for this and for INTEGER
-    D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD,
-    D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN
-};
-
-enum D3D11_TESSELLATOR_REDUCTION // insideTessFactorReduction argument of CHLSLTessellator::Init()
-{
-    D3D11_TESSELLATOR_REDUCTION_MIN,
-    D3D11_TESSELLATOR_REDUCTION_MAX,
-    D3D11_TESSELLATOR_REDUCTION_AVERAGE
-};
-
-enum D3D11_TESSELLATOR_QUAD_REDUCTION_AXIS // quadInsideTessFactorReductionAxis argument of CHLSLTessellator::Init()
-{
-    D3D11_TESSELLATOR_QUAD_REDUCTION_1_AXIS,
-    D3D11_TESSELLATOR_QUAD_REDUCTION_2_AXIS
-};
-
-enum D3D11_TESSELLATOR_OUTPUT_PRIMITIVE // outputPrimitive argument of C*Tessellator::Init()
-{
-    D3D11_TESSELLATOR_OUTPUT_POINT,
-    D3D11_TESSELLATOR_OUTPUT_LINE,
-    D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW,
-    D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CCW,
-};
-
-typedef struct DOMAIN_POINT // element type of the array handed out by GetPoints()
-{
-    float u;
-    float v; // for tri, w = 1 - u - v;
-} DOMAIN_POINT;
-
-//=================================================================================================================================
-// CHWTessellator: D3D11 Tessellation Fixed Function Hardware Reference
-//=================================================================================================================================
-typedef unsigned int FXP; // fixed point number
-
-class CHWTessellator // usage: Init(), then a Tessellate*Domain() call, then Get*Count()/GetPoints()/GetIndices()
-{
-
-//---------------------------------------------------------------------------------------------------------------------------------
-public:
-    void Init( D3D11_TESSELLATOR_PARTITIONING         partitioning,
-               D3D11_TESSELLATOR_OUTPUT_PRIMITIVE     outputPrimitive);
-
-    void TessellateIsoLineDomain( float TessFactor_V_LineDensity,
-                                  float TessFactor_U_LineDetail );
-
-    void TessellateTriDomain( float TessFactor_Ueq0,
-                              float TessFactor_Veq0,
-                              float TessFactor_Weq0,
-                              float TessFactor_Inside );
-
-    void TessellateQuadDomain( float TessFactor_Ueq0,
-                               float TessFactor_Veq0,
-                               float TessFactor_Ueq1,
-                               float TessFactor_Veq1,
-                               float TessFactor_InsideU,
-                               float TessFactor_InsideV );
-
-    int GetPointCount(); // amount of point data generated; query after a Tessellate*Domain() call
-    int GetIndexCount(); // amount of index data generated; query after a Tessellate*Domain() call
-
-    DOMAIN_POINT* GetPoints(); // Get CHWTessellator owned pointer to vertices (UV values).
-                               // Pointer is fixed for lifetime of CHWTessellator object.
-    int* GetIndices();         // Get CHWTessellator owned pointer to vertex indices.
-                               // Pointer is fixed for lifetime of CHWTessellator object.
-
-#define ALLOW_XBOX_360_COMPARISON // Different vertex splitting order. This is NOT D3D11 behavior, just available here for comparison.
-	                              // Setting this define true just allows the XBox split style to be enabled via
-	                              // SetXBox360Mode() below, but by default this XBox360 mode still always starts off DISABLED.
-								  // The XBox360 always splits from the center of an edge (D3D11 uses ruler function).  Splitting
-	                              // from the center causes sliver triangles in transition areas, which cause numerous problems.
-                                  // Note the XBox360 only supports adaptive tessellation via fractional_even partitioning,
-                                  // though this #define lets you try the XBox vertex splitting order with any of the
-                                  // partitioning modes: even, odd, integer or pow2.
-#ifdef ALLOW_XBOX_360_COMPARISON
-    void SetXBox360Mode(bool bXboxMode) {m_bXBox360Mode = bXboxMode;}
-#endif
-    CHWTessellator();
-    ~CHWTessellator();
-//---------------------------------------------------------------------------------------------------------------------------------
-    //=============================================================================================================================
-    // Some defines so that numbers are usually self commenting
-    //=============================================================================================================================
-    static const int U = 0; // points on a tri patch
-    static const int V = 1;
-    static const int W = 2;
-    static const int Ueq0 = 0; // edges on a tri patch
-    static const int Veq0 = 1;
-    static const int Weq0 = 2;
-
-    static const int Ueq1 = 2; // edges on a quad patch: Ueq0, Veq0, Ueq1, Veq1
-    static const int Veq1 = 3;
-
-    static const int QUAD_AXES = 2;
-    static const int QUAD_EDGES = 4;
-    static const int TRI_EDGES = 3;
-    //=============================================================================================================================
-
-    enum TESSELLATOR_PARITY // derived from D3D11_TESSELLATOR_PARTITIONING
-    {                               // (note: for integer tessellation, both parities are used)
-        TESSELLATOR_PARITY_EVEN,
-        TESSELLATOR_PARITY_ODD
-    };
-private:
-    TESSELLATOR_PARITY                   m_originalParity; // user chosen parity
-    TESSELLATOR_PARITY                   m_parity; // current parity: if allowing mix of even/odd during discrete
-                                                   // tessellation, this can vary from the user defined parity
-    D3D11_TESSELLATOR_PARTITIONING       m_originalPartitioning; // user chosen partitioning
-    D3D11_TESSELLATOR_PARTITIONING       m_partitioning; // current partitioning.  IsoLines overrides for line density
-    D3D11_TESSELLATOR_OUTPUT_PRIMITIVE   m_outputPrimitive;
-    DOMAIN_POINT*                        m_Point; // array where we will store u/v's for the points we generate
-    int*                                 m_Index; // array where we will store index topology
-    int                                  m_NumPoints;
-    int                                  m_NumIndices;
-#ifdef ALLOW_XBOX_360_COMPARISON
-    bool                                 m_bXBox360Mode;
-#endif
-    // PlacePointIn1D below is the workhorse for all position placement.
-    // It is code that could run as preamble in a Domain Shader, so the tessellator itself
-    // doesn't necessarily need to have floating point.
-    // Some per-TessFactor fixed context is needed, and that can be computed wherever
-    // the TessFactor reduction is done, perhaps as Hull Shader postamble - this is shared
-    // for all point evaluation.
-    typedef struct TESS_FACTOR_CONTEXT
-    {
-        FXP fxpInvNumSegmentsOnFloorTessFactor;
-        FXP fxpInvNumSegmentsOnCeilTessFactor;
-        FXP fxpHalfTessFactorFraction;
-        int numHalfTessFactorPoints;
-        int splitPointOnFloorHalfTessFactor;
-    } TESS_FACTOR_CONTEXT;
-    void ComputeTessFactorContext( FXP fxpTessFactor, TESS_FACTOR_CONTEXT& TessFactorCtx );
-    void PlacePointIn1D( const TESS_FACTOR_CONTEXT& TessFactorCtx, int point, FXP& fxpLocation );
-
-    int NumPointsForTessFactor(FXP fxpTessFactor);
-
-    // Tessellation parity control
-    bool Odd() {return (m_parity == TESSELLATOR_PARITY_ODD) ? true : false;}
-    void SetTessellationParity(TESSELLATOR_PARITY parity) {m_parity = parity;}
-
-    // HWIntegerPartitioning() - hardware doesn't care about what pow2 partitioning is - the query below is true for
-    //                           both integer and pow2.
-    bool HWIntegerPartitioning() {return ((m_partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER)||
-                                          (m_partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2)) ? true : false;}
-
-    // Tessellation Partitioning control
-    void RestorePartitioning() {m_partitioning = m_originalPartitioning;};
-    void OverridePartitioning(D3D11_TESSELLATOR_PARTITIONING partitioning) {m_partitioning = partitioning;} //isoline uses this for density
-
-    // Call these to generate new points and indices.  Max TessFactor storage is already allocated.
-    int DefinePoint(FXP u, FXP v, int pointStorageOffset);
-    void DefineIndex(int index, int indexStorageOffset);
-    void DefineClockwiseTriangle(int index0, int index1, int index2, int indexStorageBaseOffset);
-
-    // Couple of trivial ways to generate index data just given points and no other connectivity.
-    void DumpAllPoints();                  // Make point indices for point rendering mode -
-                                           // redundant, but just here for orthogonality.
-    void DumpAllPointsAsInOrderLineList(); // A debug visualization of all the points connected
-                                           // in the order they were generated.
-                                           // Asking to draw line topology on a tri or quad patch will do this
-
-
-    // The structures below define the data that is derived given input TessFactors and which
-    // is used by point generation and connectivity generation steps (each of which are independent)
-    typedef struct PROCESSED_TESS_FACTORS_ISOLINE
-    {
-        TESSELLATOR_PARITY lineDensityParity;
-        TESSELLATOR_PARITY lineDetailParity;
-        TESS_FACTOR_CONTEXT lineDensityTessFactorCtx;
-        TESS_FACTOR_CONTEXT lineDetailTessFactorCtx;
-        bool bPatchCulled;
-        int numPointsPerLine;
-        int numLines;
-    } PROCESSED_TESS_FACTORS_ISOLINE;
-    typedef struct PROCESSED_TESS_FACTORS_TRI
-    {
-        FXP outsideTessFactor[TRI_EDGES];
-        FXP insideTessFactor;
-        TESSELLATOR_PARITY outsideTessFactorParity[TRI_EDGES];
-        TESSELLATOR_PARITY insideTessFactorParity;
-        TESS_FACTOR_CONTEXT outsideTessFactorCtx[TRI_EDGES];
-        TESS_FACTOR_CONTEXT insideTessFactorCtx;
-        bool bJustDoMinimumTessFactor;
-        bool bPatchCulled;
-        // Stuff below is just specific to the traversal order
-        // this code happens to use to generate points/lines
-        int numPointsForOutsideEdge[TRI_EDGES];
-        int numPointsForInsideTessFactor;
-        int insideEdgePointBaseOffset;
-    } PROCESSED_TESS_FACTORS_TRI;
-    typedef struct PROCESSED_TESS_FACTORS_QUAD
-    {
-        FXP outsideTessFactor[QUAD_EDGES];
-        FXP insideTessFactor[QUAD_AXES];
-        TESSELLATOR_PARITY outsideTessFactorParity[QUAD_EDGES];
-        TESSELLATOR_PARITY insideTessFactorParity[QUAD_AXES];
-        TESS_FACTOR_CONTEXT outsideTessFactorCtx[QUAD_EDGES];
-        TESS_FACTOR_CONTEXT insideTessFactorCtx[QUAD_AXES];
-        bool bJustDoMinimumTessFactor;
-        bool bPatchCulled;
-        // Stuff below is just specific to the traversal order
-        // this code happens to use to generate points/lines
-        int numPointsForOutsideEdge[QUAD_EDGES];
-        int numPointsForInsideTessFactor[QUAD_AXES];
-        int insideEdgePointBaseOffset;
-    } PROCESSED_TESS_FACTORS_QUAD;
-
-    // These are the workhorse functions for tessellation:
-    // (1) Process input TessFactors
-    // (2) Generate points
-    // (3) Generate connectivity (can be done in parallel to (2))
-    void IsoLineProcessTessFactors( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail, PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors );
-    void IsoLineGeneratePoints( const PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors );
-    void IsoLineGenerateConnectivity( const PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors );
-    void TriProcessTessFactors( float tessFactor_Ueq0, float TessFactor_Veq0, float TessFactor_Weq0, float insideTessFactor, PROCESSED_TESS_FACTORS_TRI& processedTessFactors );
-    void TriGeneratePoints( const PROCESSED_TESS_FACTORS_TRI& processedTessFactors );
-    void TriGenerateConnectivity( const PROCESSED_TESS_FACTORS_TRI& processedTessFactors );
-    void QuadProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Ueq1, float tessFactor_Veq1,
-                               float insideTessFactor_U, float insideTessFactor_V, PROCESSED_TESS_FACTORS_QUAD& processedTessFactors );
-    void QuadGeneratePoints( const PROCESSED_TESS_FACTORS_QUAD& processedTessFactors );
-    void QuadGenerateConnectivity( const PROCESSED_TESS_FACTORS_QUAD& processedTessFactors );
-
-    // Stitching
-    // ---------
-    // Given pointers to the beginning of 2 parallel rows of points, and TessFactors for each, stitch them.
-    // The assumption is the stitch is symmetric.
-    void StitchTransition(int baseIndexOffset, int insideEdgePointBaseOffset, int insideNumHalfTessFactorPoints,
-                                               TESSELLATOR_PARITY insideEdgeTessFactorParity,
-                                               int outsideEdgePointBaseOffset, int outsideNumHalfTessFactorPoints,
-                                               TESSELLATOR_PARITY outsideEdgeTessFactorParity );
-    // The interior can just use a simpler stitch.
-    enum DIAGONALS
-    {
-        DIAGONALS_INSIDE_TO_OUTSIDE,
-        DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE,
-        DIAGONALS_MIRRORED
-    };
-
-    void StitchRegular(bool bTrapezoid, DIAGONALS diagonals, int baseIndexOffset, int numInsideEdgePoints,
-                                        int insideEdgePointBaseOffset, int outsideEdgePointBaseOffset);
-
-//---------------------------------------------------------------------------------------------------------------------------------
-    // Index Patching
-    // --------------
-    // The code below patches index values produced during triangulation, so triangulation doesn't have to know
-    // where points should go.  I happened to never produce duplicate vertices, but the patching would
-    // be simpler if some duplicate vertices were introduced in practice.  During point rendering mode however,
-    // it is not permitted for duplicate points to show up.
-
-    // Since the points are generated in concentric rings, most of the time, the point locations are
-    // sequentially increasing in memory for each side of a ring, which the stitch can take advantage of.
-    // However, there are exceptions where the points are not sequentially increasing, such as
-    // the 4th row in a given ring, where the last point on the outside of each row is actually the beginning
-    // point.
-    // So we let the stitching code think it sees sequential vertices, and when it emits a vertex index,
-    // we patch it to be the real location.
-    int  PatchIndexValue(int index);
-    typedef struct INDEX_PATCH_CONTEXT
-    {
-        int insidePointIndexDeltaToRealValue;
-        int insidePointIndexBadValue;
-        int insidePointIndexReplacementValue;
-        int outsidePointIndexPatchBase;
-        int outsidePointIndexDeltaToRealValue;
-        int outsidePointIndexBadValue;
-        int outsidePointIndexReplacementValue;
-    } INDEX_PATCH_CONTEXT;
-    void SetUsingPatchedIndices(bool bUsingPatchedIndices) {m_bUsingPatchedIndices = bUsingPatchedIndices;}
-
-    // A second index patch we have to do handles the leftover strip of quads in the middle of an odd quad patch after
-    // finishing all the concentric rings.
-    // This also handles the leftover strip of points in the middle of an even quad
-    // patch, when stitching the row of triangles up the left side (V major quad) or bottom (U major quad) of the
-    // inner ring
-    typedef struct INDEX_PATCH_CONTEXT2
-    {
-        int baseIndexToInvert;
-        int indexInversionEndPoint;
-        int cornerCaseBadValue;
-        int cornerCaseReplacementValue;
-    } INDEX_PATCH_CONTEXT2;
-    void SetUsingPatchedIndices2(bool bUsingPatchedIndices) {m_bUsingPatchedIndices2 = bUsingPatchedIndices;}
-    bool                                 m_bUsingPatchedIndices; // written by SetUsingPatchedIndices()
-    bool                                 m_bUsingPatchedIndices2; // written by SetUsingPatchedIndices2()
-    INDEX_PATCH_CONTEXT                  m_IndexPatchContext;
-    INDEX_PATCH_CONTEXT2                 m_IndexPatchContext2;
-
-};
-
-//=================================================================================================================================
-// CHLSLTessellator: D3D11 Tessellation HLSL Tessellator Interface
-// Demonstrates TessFactor preconditioning code auto-generated by HLSL.  Subject to change, but this
-// just represents the effect of shader code the HLSL compiler will generate in the Hull Shader,
-// so it does not affect hardware design at all.
-//=================================================================================================================================
-class CHLSLTessellator : public CHWTessellator
-{
-public:
-    void Init( D3D11_TESSELLATOR_PARTITIONING         partitioning,
-               D3D11_TESSELLATOR_REDUCTION            insideTessFactorReduction,
-               D3D11_TESSELLATOR_QUAD_REDUCTION_AXIS  quadInsideTessFactorReductionAxis,
-               D3D11_TESSELLATOR_OUTPUT_PRIMITIVE     outputPrimitive);
-
-    void TessellateIsoLineDomain( float TessFactor_V_LineDensity,
-                                  float TessFactor_U_LineDetail );
-
-    void TessellateTriDomain( float tessFactor_Ueq0,
-                              float TessFactor_Veq0,
-                              float TessFactor_Weq0,
-                              float insideTessFactorScale /*[0..1]*/ );
-
-    void TessellateQuadDomain( float TessFactorUeq0,
-                               float TessFactorVeq0,
-                               float TessFactorUeq1,
-                               float TessFactorVeq1,
-                               float insideTessFactorScaleU /*[0..1]*/,
-                               float insideTessFactorScaleV /*[0..1]*/ );
-
-    int GetPointCount() {return CHWTessellator::GetPointCount();}; // forwards to the base class
-    int GetIndexCount() {return CHWTessellator::GetIndexCount();} // forwards to the base class
-
-    DOMAIN_POINT* GetPoints() {return CHWTessellator::GetPoints();} // Get CHLSLTessellator owned pointer to vertices (UV values).
-                               // Pointer is fixed for lifetime of CHLSLTessellator object.
-    int* GetIndices() {return CHWTessellator::GetIndices();}         // Get CHLSLTessellator owned pointer to vertex indices.
-                               // Pointer is fixed for lifetime of CHLSLTessellator object.
-
-    // Retrieve TessFactors actually used by the "hardware"
-    // This includes clamping to valid range, and more interestingly
-    // if integer or pow2 partitioning is being done, the rounded TessFactors can be retrieved.
-    // Getting the rounded TessFactors can be useful for geomorphing of displacement maps.
-    // The isoline, tri and quad getters index the same two arrays, so a slot's meaning depends on the domain.
-    float GetIsoLineDensityTessFactor() {return m_LastComputedTessFactors[0];}
-    float GetIsoLineDetailTessFactor() {return m_LastComputedTessFactors[1];}
-    float GetTriUeq0TessFactor() {return m_LastComputedTessFactors[0];}
-    float GetTriVeq0TessFactor() {return m_LastComputedTessFactors[1];}
-    float GetTriWeq0TessFactor() {return m_LastComputedTessFactors[2];}
-    float GetTriInsideTessFactor() {return m_LastComputedTessFactors[3];}
-    float GetQuadUeq0TessFactor() {return m_LastComputedTessFactors[0];}
-    float GetQuadVeq0TessFactor() {return m_LastComputedTessFactors[1];}
-    float GetQuadUeq1TessFactor() {return m_LastComputedTessFactors[2];}
-    float GetQuadVeq1TessFactor() {return m_LastComputedTessFactors[3];}
-    float GetQuadInsideUTessFactor() {return m_LastComputedTessFactors[4];}
-    float GetQuadInsideVTessFactor() {return m_LastComputedTessFactors[5];}
-    float GetUnRoundedIsoLineDensityTessFactor() {return m_LastUnRoundedComputedTessFactors[0];}
-    float GetUnRoundedIsoLineDetailTessFactor() {return m_LastUnRoundedComputedTessFactors[1];}
-    float GetUnRoundedTriUeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[0];}
-    float GetUnRoundedTriVeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[1];}
-    float GetUnRoundedTriWeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[2];}
-    float GetUnRoundedTriInsideTessFactor() {return m_LastUnRoundedComputedTessFactors[3];}
-    float GetUnRoundedQuadUeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[0];}
-    float GetUnRoundedQuadVeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[1];}
-    float GetUnRoundedQuadUeq1TessFactor() {return m_LastUnRoundedComputedTessFactors[2];}
-    float GetUnRoundedQuadVeq1TessFactor() {return m_LastUnRoundedComputedTessFactors[3];}
-    float GetUnRoundedQuadInsideUTessFactor() {return m_LastUnRoundedComputedTessFactors[4];}
-    float GetUnRoundedQuadInsideVTessFactor() {return m_LastUnRoundedComputedTessFactors[5];}
-
-    CHLSLTessellator();
-//---------------------------------------------------------------------------------------------------------------------------------
-private: // NOTE(review): several names below repeat private members of CHWTessellator; these are separate fields/functions of this class
-    TESSELLATOR_PARITY                   m_originalParity; // user chosen parity
-    TESSELLATOR_PARITY                   m_parity; // current parity: if allowing mix of even/odd during discrete
-                                                   // tessellation, this can vary from the user defined parity
-    D3D11_TESSELLATOR_PARTITIONING       m_originalPartitioning; // user chosen partitioning
-    D3D11_TESSELLATOR_PARTITIONING       m_partitioning; // current partitioning.  IsoLines overrides for line density
-    D3D11_TESSELLATOR_OUTPUT_PRIMITIVE   m_outputPrimitive;
-    D3D11_TESSELLATOR_REDUCTION          m_insideTessFactorReduction;
-    D3D11_TESSELLATOR_QUAD_REDUCTION_AXIS m_quadInsideTessFactorReductionAxis;
-    float                                m_LastComputedTessFactors[6]; // TessFactors used for last tessellation
-    float                                m_LastUnRoundedComputedTessFactors[6]; // TessFactors used for last tessellation (before they were rounded)
-    bool IntegerPartitioning() {return (m_partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER) ? true : false;}
-    bool Pow2Partitioning() {return (m_partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2)? true : false;}
-    void ClampTessFactor(float& TessFactor);
-    void RoundUpTessFactor(float& TessFactor);
-    void CleanupFloatTessFactor(float& input); // clamp float to [1.0f... +INF] (incl NaN->1.0f)
-    void ClampFloatTessFactorScale(float& input); // clamp float to [0.0f... +INF] (incl NaN->0.0f)
-
-    // Tessellation parity control
-    bool Odd() {return (m_parity == TESSELLATOR_PARITY_ODD) ? true : false;}
-    void SetTessellationParity(TESSELLATOR_PARITY parity) {m_parity = parity;}
-
-    // Tessellation Partitioning control
-    void RestorePartitioning() {m_partitioning = m_originalPartitioning;};
-    void OverridePartitioning(D3D11_TESSELLATOR_PARTITIONING partitioning) {m_partitioning = partitioning;} //isoline uses this for density
-
-    void IsoLineHLSLProcessTessFactors( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail );
-    void TriHLSLProcessTessFactors( float tessFactor_Ueq0, float TessFactor_Veq0, float TessFactor_Weq0, float insideTessFactor );
-    void QuadHLSLProcessTessFactors( float TessFactor_Ueq0, float TessFactor_Veq0, float TessFactor_Ueq1, float TessFactor_Veq1,
-                               float insideTessFactor_U, float insideTessFactor_V );
-
-};
-
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
deleted file mode 100644
index 8d4104f0af1..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ /dev/null
@@ -1,1423 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ****************************************************************************/
-
-#include <stdio.h>
-#include <thread>
-#include <algorithm>
-#include <float.h>
-#include <vector>
-#include <utility>
-#include <fstream>
-#include <string>
-
-#if defined(__linux__) || defined(__gnu_linux__) || defined(__APPLE__)
-#include <pthread.h>
-#include <sched.h>
-#include <unistd.h>
-#endif
-
-#ifdef __APPLE__
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#endif
-
-#include "common/os.h"
-#include "core/api.h"
-#include "context.h"
-#include "frontend.h"
-#include "backend.h"
-#include "rasterizer.h"
-#include "rdtsc_core.h"
-#include "tilemgr.h"
-#include "tileset.h"
-
-
-// ThreadId
-struct Core
-{
-    uint32_t              procGroup = 0;
-    std::vector<uint32_t> threadIds;
-};
-
-struct NumaNode
-{
-    uint32_t          numaId;
-    std::vector<Core> cores;
-};
-
-typedef std::vector<NumaNode> CPUNumaNodes;
-
-void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
-{
-    out_nodes.clear();
-    out_numThreadsPerProcGroup = 0;
-
-#if defined(_WIN32)
-
-    std::vector<KAFFINITY> threadMaskPerProcGroup;
-
-    static std::mutex           m;
-    std::lock_guard<std::mutex> l(m);
-
-    DWORD bufSize = 0;
-
-    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
-    SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);
-
-    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem =
-        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
-    SWR_ASSERT(pBufferMem);
-
-    ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
-    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
-
-    uint32_t                                 count   = bufSize / pBufferMem->Size;
-    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;
-
-    for (uint32_t i = 0; i < count; ++i)
-    {
-        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
-        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
-        {
-            auto&    gmask     = pBuffer->Processor.GroupMask[g];
-            uint32_t threadId  = 0;
-            uint32_t procGroup = gmask.Group;
-
-            Core* pCore = nullptr;
-
-            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
-            {
-                // clear mask
-                KAFFINITY threadMask = KAFFINITY(1) << threadId;
-                gmask.Mask &= ~threadMask;
-
-                if (procGroup >= threadMaskPerProcGroup.size())
-                {
-                    threadMaskPerProcGroup.resize(procGroup + 1);
-                }
-
-                if (threadMaskPerProcGroup[procGroup] & threadMask)
-                {
-                    // Already seen this mask.  This means that we are in 32-bit mode and
-                    // have seen more than 32 HW threads for this procGroup
-                    // Don't use it
-#if defined(_WIN64)
-                    SWR_INVALID("Shouldn't get here in 64-bit mode");
-#endif
-                    continue;
-                }
-
-                threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);
-
-                // Find Numa Node
-                uint32_t         numaId  = 0;
-                PROCESSOR_NUMBER procNum = {};
-                procNum.Group            = WORD(procGroup);
-                procNum.Number           = UCHAR(threadId);
-
-                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
-                SWR_ASSERT(ret);
-
-                // Store data
-                if (out_nodes.size() <= numaId)
-                {
-                    out_nodes.resize(numaId + 1);
-                }
-                auto& numaNode  = out_nodes[numaId];
-                numaNode.numaId = numaId;
-
-                if (nullptr == pCore)
-                {
-                    numaNode.cores.push_back(Core());
-                    pCore            = &numaNode.cores.back();
-                    pCore->procGroup = procGroup;
-                }
-                pCore->threadIds.push_back(threadId);
-                if (procGroup == 0)
-                {
-                    out_numThreadsPerProcGroup++;
-                }
-            }
-        }
-        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
-    }
-
-    free(pBufferMem);
-
-#elif defined(__linux__) || defined(__gnu_linux__)
-
-    // Parse /proc/cpuinfo to get full topology
-    std::ifstream input("/proc/cpuinfo");
-    std::string   line;
-    char*         c;
-    uint32_t      procId = uint32_t(-1);
-    uint32_t      coreId = uint32_t(-1);
-    uint32_t      physId = uint32_t(-1);
-
-    while (std::getline(input, line))
-    {
-        if (line.find("processor") != std::string::npos)
-        {
-            auto data_start = line.find(": ") + 2;
-            procId          = std::strtoul(&line.c_str()[data_start], &c, 10);
-            continue;
-        }
-        if (line.find("core id") != std::string::npos)
-        {
-            auto data_start = line.find(": ") + 2;
-            coreId          = std::strtoul(&line.c_str()[data_start], &c, 10);
-            continue;
-        }
-        if (line.find("physical id") != std::string::npos)
-        {
-            auto data_start = line.find(": ") + 2;
-            physId          = std::strtoul(&line.c_str()[data_start], &c, 10);
-            continue;
-        }
-        if (line.length() == 0)
-        {
-            if (physId + 1 > out_nodes.size())
-                out_nodes.resize(physId + 1);
-            auto& numaNode  = out_nodes[physId];
-            numaNode.numaId = physId;
-
-            if (coreId + 1 > numaNode.cores.size())
-                numaNode.cores.resize(coreId + 1);
-            auto& core     = numaNode.cores[coreId];
-            core.procGroup = coreId;
-            core.threadIds.push_back(procId);
-        }
-    }
-
-    out_numThreadsPerProcGroup = 0;
-    for (auto& node : out_nodes)
-    {
-        for (auto& core : node.cores)
-        {
-            out_numThreadsPerProcGroup += core.threadIds.size();
-        }
-    }
-
-#elif defined(__APPLE__)
-
-    auto numProcessors  = 0;
-    auto numCores       = 0;
-    auto numPhysicalIds = 0;
-
-    int    value;
-    size_t size = sizeof(value);
-
-    int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
-    SWR_ASSERT(result == 0);
-    numPhysicalIds = value;
-
-    result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
-    SWR_ASSERT(result == 0);
-    numProcessors = value;
-
-    result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
-    SWR_ASSERT(result == 0);
-    numCores = value;
-
-    out_nodes.resize(numPhysicalIds);
-
-    for (auto physId = 0; physId < numPhysicalIds; ++physId)
-    {
-        auto& numaNode = out_nodes[physId];
-        auto  procId   = 0;
-
-        numaNode.cores.resize(numCores);
-
-        while (procId < numProcessors)
-        {
-            for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId)
-            {
-                auto& core = numaNode.cores[coreId];
-
-                core.procGroup = coreId;
-                core.threadIds.push_back(procId);
-            }
-        }
-    }
-
-    out_numThreadsPerProcGroup = 0;
-
-    for (auto& node : out_nodes)
-    {
-        for (auto& core : node.cores)
-        {
-            out_numThreadsPerProcGroup += core.threadIds.size();
-        }
-    }
-
-#else
-
-#error Unsupported platform
-
-#endif
-
-    // Prune empty cores and numa nodes
-    for (auto node_it = out_nodes.begin(); node_it != out_nodes.end();)
-    {
-        // Erase empty cores (first)
-        for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end();)
-        {
-            if (core_it->threadIds.size() == 0)
-            {
-                core_it = node_it->cores.erase(core_it);
-            }
-            else
-            {
-                ++core_it;
-            }
-        }
-
-        // Erase empty numa nodes (second)
-        if (node_it->cores.size() == 0)
-        {
-            node_it = out_nodes.erase(node_it);
-        }
-        else
-        {
-            ++node_it;
-        }
-    }
-}
-
-void bindThread(SWR_CONTEXT* pContext,
-                uint32_t     threadId,
-                uint32_t     procGroupId   = 0,
-                bool         bindProcGroup = false)
-{
-    // Only bind threads when MAX_WORKER_THREADS isn't set.
-    if (pContext->threadInfo.SINGLE_THREADED ||
-        (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false))
-    {
-        return;
-    }
-
-#if defined(_WIN32)
-
-    GROUP_AFFINITY affinity = {};
-    affinity.Group          = procGroupId;
-
-#if !defined(_WIN64)
-    if (threadId >= 32)
-    {
-        // Hopefully we don't get here.  Logic in CreateThreadPool should prevent this.
-        SWR_INVALID("Shouldn't get here");
-
-        // In a 32-bit process on Windows it is impossible to bind
-        // to logical processors 32-63 within a processor group.
-        // In this case set the mask to 0 and let the system assign
-        // the processor.  Hopefully it will make smart choices.
-        affinity.Mask = 0;
-    }
-    else
-#endif
-    {
-        // If MAX_WORKER_THREADS is set, only bind to the proc group,
-        // Not the individual HW thread.
-        if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
-        {
-            affinity.Mask = KAFFINITY(1) << threadId;
-        }
-        else
-        {
-            affinity.Mask = KAFFINITY(0);
-        }
-    }
-
-    if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
-    {
-        SWR_INVALID("Failed to set Thread Affinity");
-    }
-
-#elif defined(__linux__) || defined(__gnu_linux__)
-
-    cpu_set_t cpuset;
-    pthread_t thread = pthread_self();
-    CPU_ZERO(&cpuset);
-    CPU_SET(threadId, &cpuset);
-
-    int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
-    if (err != 0)
-    {
-        fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", threadId, strerror(err));
-    }
-
-#endif
-}
-
-INLINE
-uint32_t GetEnqueuedDraw(SWR_CONTEXT* pContext)
-{
-    return pContext->dcRing.GetHead();
-}
-
-INLINE
-DRAW_CONTEXT* GetDC(SWR_CONTEXT* pContext, uint32_t drawId)
-{
-    return &pContext->dcRing[(drawId - 1) % pContext->MAX_DRAWS_IN_FLIGHT];
-}
-
-INLINE
-bool IDComparesLess(uint32_t a, uint32_t b)
-{
-    // Use signed delta to ensure that wrap-around to 0 is correctly handled.
-    int32_t delta = int32_t(a - b);
-    return (delta < 0);
-}
-
-// returns true if dependency not met
-INLINE
-bool CheckDependency(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
-{
-    return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
-}
-
-bool CheckDependencyFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
-{
-    return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Update client stats.
-INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
-{
-    if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false))
-    {
-        return;
-    }
-
-    DRAW_DYNAMIC_STATE& dynState = pDC->dynState;
-    OSALIGNLINE(SWR_STATS) stats{0};
-
-    // Sum up stats across all workers before sending to client.
-    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
-    {
-        stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
-        stats.PsInvocations += dynState.pStats[i].PsInvocations;
-        stats.CsInvocations += dynState.pStats[i].CsInvocations;
-
-    }
-
-
-    pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
-}
-
-INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
-{
-    UpdateClientStats(pContext, workerId, pDC);
-
-    if (pDC->retireCallback.pfnCallbackFunc)
-    {
-        pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
-                                            pDC->retireCallback.userData2,
-                                            pDC->retireCallback.userData3);
-
-        // Callbacks to external code *could* change floating point control state
-        // Reset our optimal flags
-        SetOptimalVectorCSR();
-    }
-}
-
-// inlined-only version
-INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
-{
-    int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
-    SWR_ASSERT(result >= 0);
-
-    AR_FLUSH(pDC->drawId);
-
-    if (result == 0)
-    {
-        ExecuteCallbacks(pContext, workerId, pDC);
-
-
-        // Cleanup memory allocations
-        pDC->pArena->Reset(true);
-        if (!pDC->isCompute)
-        {
-            pDC->pTileMgr->initialize();
-        }
-        if (pDC->cleanupState)
-        {
-            pDC->pState->pArena->Reset(true);
-        }
-
-        _ReadWriteBarrier();
-
-        pContext->dcRing.Dequeue(); // Remove from tail
-    }
-
-    return result;
-}
-
-// available to other translation modules
-int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
-{
-    return CompleteDrawContextInl(pContext, 0, pDC);
-}
-
-INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext,
-                                    uint32_t     workerId,
-                                    uint32_t&    curDrawBE,
-                                    uint32_t&    drawEnqueued)
-{
-    // increment our current draw id to the first incomplete draw
-    drawEnqueued = GetEnqueuedDraw(pContext);
-    while (IDComparesLess(curDrawBE, drawEnqueued))
-    {
-        DRAW_CONTEXT* pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT];
-
-        // If its not compute and FE is not done then break out of loop.
-        if (!pDC->doneFE && !pDC->isCompute)
-            break;
-
-        bool isWorkComplete =
-            pDC->isCompute ? pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
-
-        if (isWorkComplete)
-        {
-            curDrawBE++;
-            CompleteDrawContextInl(pContext, workerId, pDC);
-        }
-        else
-        {
-            break;
-        }
-    }
-
-    // If there are no more incomplete draws then return false.
-    return IDComparesLess(curDrawBE, drawEnqueued);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief If there is any BE work then go work on it.
-/// @param pContext - pointer to SWR context.
-/// @param workerId - The unique worker ID that is assigned to this thread.
-/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
-/// thread
-///                    has its own curDrawBE counter and this ensures that each worker processes all
-///                    the draws in order.
-/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
-///                      own set and each time it fails to lock a macrotile, because its already
-///                      locked, then it will add that tile to the lockedTiles set. As a worker
-///                      begins to work on future draws the lockedTiles ensure that it doesn't work
-///                      on tiles that may still have work pending in a previous draw. Additionally,
-///                      the lockedTiles is heuristic that can steer a worker back to the same
-///                      macrotile that it had been working on in a previous draw.
-/// @returns        true if worker thread should shutdown
-bool WorkOnFifoBE(SWR_CONTEXT* pContext,
-                  uint32_t     workerId,
-                  uint32_t&    curDrawBE,
-                  TileSet&     lockedTiles,
-                  uint32_t     numaNode,
-                  uint32_t     numaMask)
-{
-    bool bShutdown = false;
-
-    // Find the first incomplete draw that has pending work. If no such draw is found then
-    // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
-    uint32_t drawEnqueued = 0;
-    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
-    {
-        return false;
-    }
-
-    uint32_t lastRetiredDraw =
-        pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
-
-    // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
-    lockedTiles.clear();
-
-    // Try to work on each draw in order of the available draws in flight.
-    //   1. If we're on curDrawBE, we can work on any macrotile that is available.
-    //   2. If we're trying to work on draws after curDrawBE, we are restricted to
-    //      working on those macrotiles that are known to be complete in the prior draw to
-    //      maintain order. The locked tiles provides the history to ensures this.
-    for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
-    {
-        DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
-
-        if (pDC->isCompute)
-            return false; // We don't look at compute work.
-
-        // First wait for FE to be finished with this draw. This keeps threading model simple
-        // but if there are lots of bubbles between draws then serializing FE and BE may
-        // need to be revisited.
-        if (!pDC->doneFE)
-            return false;
-
-        // If this draw is dependent on a previous draw then we need to bail.
-        if (CheckDependency(pContext, pDC, lastRetiredDraw))
-        {
-            return false;
-        }
-
-        // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
-        auto& macroTiles = pDC->pTileMgr->getDirtyTiles();
-
-        for (auto tile : macroTiles)
-        {
-            uint32_t tileID = tile->mId;
-
-            // Only work on tiles for this numa node
-            uint32_t x, y;
-            pDC->pTileMgr->getTileIndices(tileID, x, y);
-            if (((x ^ y) & numaMask) != numaNode)
-            {
-                _mm_pause();
-                continue;
-            }
-
-            if (!tile->getNumQueued())
-            {
-                _mm_pause();
-                continue;
-            }
-
-            // can only work on this draw if it's not in use by other threads
-            if (lockedTiles.get(tileID))
-            {
-                _mm_pause();
-                continue;
-            }
-
-            if (tile->tryLock())
-            {
-                BE_WORK* pWork;
-
-                RDTSC_BEGIN(pContext->pBucketMgr, WorkerFoundWork, pDC->drawId);
-
-                uint32_t numWorkItems = tile->getNumQueued();
-                SWR_ASSERT(numWorkItems);
-
-                pWork = tile->peek();
-                SWR_ASSERT(pWork);
-                if (pWork->type == DRAW)
-                {
-                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
-                }
-                else if (pWork->type == SHUTDOWN)
-                {
-                    bShutdown = true;
-                }
-
-                while ((pWork = tile->peek()) != nullptr)
-                {
-                    pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
-                    tile->dequeue();
-                }
-                RDTSC_END(pContext->pBucketMgr, WorkerFoundWork, numWorkItems);
-
-                _ReadWriteBarrier();
-
-                pDC->pTileMgr->markTileComplete(tileID);
-
-                // Optimization: If the draw is complete and we're the last one to have worked on it
-                // then we can reset the locked list as we know that all previous draws before the
-                // next are guaranteed to be complete.
-                if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
-                {
-                    // We can increment the current BE and safely move to next draw since we know
-                    // this draw is complete.
-                    curDrawBE++;
-                    CompleteDrawContextInl(pContext, workerId, pDC);
-
-                    lastRetiredDraw++;
-
-                    lockedTiles.clear();
-                    break;
-                }
-
-                if (bShutdown)
-                {
-                    break;
-                }
-            }
-            else
-            {
-                // This tile is already locked. So let's add it to our locked tiles set. This way we
-                // don't try locking this one again.
-                lockedTiles.set(tileID);
-                _mm_pause();
-            }
-        }
-    }
-
-    return bShutdown;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Called when FE work is complete for this DC.
-INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
-{
-    if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
-    {
-        SWR_STATS_FE& stats = pDC->dynState.statsFE;
-
-        AR_EVENT(FrontendStatsEvent(pDC->drawId,
-                                    stats.IaVertices,
-                                    stats.IaPrimitives,
-                                    stats.VsInvocations,
-                                    stats.HsInvocations,
-                                    stats.DsInvocations,
-                                    stats.GsInvocations,
-                                    stats.GsPrimitives,
-                                    stats.CInvocations,
-                                    stats.CPrimitives,
-                                    stats.SoPrimStorageNeeded[0],
-                                    stats.SoPrimStorageNeeded[1],
-                                    stats.SoPrimStorageNeeded[2],
-                                    stats.SoPrimStorageNeeded[3],
-                                    stats.SoNumPrimsWritten[0],
-                                    stats.SoNumPrimsWritten[1],
-                                    stats.SoNumPrimsWritten[2],
-                                    stats.SoNumPrimsWritten[3]));
-        AR_EVENT(FrontendDrawEndEvent(pDC->drawId));
-
-        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
-    }
-
-    if (pContext->pfnUpdateSoWriteOffset)
-    {
-        for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
-        {
-            if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
-                (pDC->pState->state.soBuffer[i].soWriteEnable))
-            {
-                pContext->pfnUpdateSoWriteOffset(
-                    GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
-            }
-        }
-    }
-
-    if (pContext->pfnUpdateStreamOut)
-        pContext->pfnUpdateStreamOut(GetPrivateState(pDC),  pDC->dynState.soPrims);
-
-    // Ensure all streaming writes are globally visible before marking this FE done
-    _mm_mfence();
-    pDC->doneFE = true;
-
-    InterlockedDecrement(&pContext->drawsOutstandingFE);
-}
-
-void WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE)
-{
-    // Try to grab the next DC from the ring
-    uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
-    while (IDComparesLess(curDrawFE, drawEnqueued))
-    {
-        uint32_t      dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT;
-        DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];
-        if (pDC->isCompute || pDC->doneFE)
-        {
-            CompleteDrawContextInl(pContext, workerId, pDC);
-            curDrawFE++;
-        }
-        else
-        {
-            break;
-        }
-    }
-
-    uint32_t lastRetiredFE = curDrawFE - 1;
-    uint32_t curDraw       = curDrawFE;
-    while (IDComparesLess(curDraw, drawEnqueued))
-    {
-        uint32_t      dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
-        DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];
-
-        if (!pDC->FeLock && !pDC->isCompute)
-        {
-            if (CheckDependencyFE(pContext, pDC, lastRetiredFE))
-            {
-                return;
-            }
-
-            uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
-            if (initial == 0)
-            {
-                // successfully grabbed the DC, now run the FE
-                pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);
-
-                CompleteDrawFE(pContext, workerId, pDC);
-            }
-            else
-            {
-                _mm_pause();
-            }
-        }
-        else
-        {
-            _mm_pause();
-        }
-
-        curDraw++;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief If there is any compute work then go work on it.
-/// @param pContext - pointer to SWR context.
-/// @param workerId - The unique worker ID that is assigned to this thread.
-/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
-/// thread
-///                    has its own curDrawBE counter and this ensures that each worker processes all
-///                    the draws in order.
-void WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE)
-{
-    uint32_t drawEnqueued = 0;
-    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
-    {
-        return;
-    }
-
-    uint32_t lastRetiredDraw =
-        pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
-
-    for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
-    {
-        DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
-        if (pDC->isCompute == false)
-            return;
-
-        // check dependencies
-        if (CheckDependency(pContext, pDC, lastRetiredDraw))
-        {
-            return;
-        }
-
-        SWR_ASSERT(pDC->pDispatch != nullptr);
-        DispatchQueue& queue = *pDC->pDispatch;
-
-        // Is there any work remaining?
-        if (queue.getNumQueued() > 0)
-        {
-            void*    pSpillFillBuffer = nullptr;
-            void*    pScratchSpace    = nullptr;
-            uint32_t threadGroupId    = 0;
-            while (queue.getWork(threadGroupId))
-            {
-                queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
-                queue.finishedWork();
-            }
-
-            // Ensure all streaming writes are globally visible before moving onto the next draw
-            _mm_mfence();
-        }
-    }
-}
-
-void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId)
-{
-    if (nullptr == pContext)
-    {
-        return;
-    }
-
-    if (apiThreadId >= pContext->threadPool.numReservedThreads)
-    {
-        if (pContext->threadPool.numReservedThreads)
-        {
-            const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[0];
-            // Just bind to the process group used for API thread 0
-            bindThread(pContext, 0, threadData.procGroupId, true);
-        }
-        return;
-    }
-
-    const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[apiThreadId];
-
-    bindThread(
-        pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
-}
-
-template <bool IsFEThread, bool IsBEThread>
-DWORD workerThreadMain(LPVOID pData)
-{
-    THREAD_DATA* pThreadData = (THREAD_DATA*)pData;
-    SWR_CONTEXT* pContext    = pThreadData->pContext;
-    uint32_t     threadId    = pThreadData->threadId;
-    uint32_t     workerId    = pThreadData->workerId;
-
-    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
-
-    {
-        char threadName[64];
-        sprintf_s(threadName,
-#if defined(_WIN32)
-                  "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
-#else
-                  // linux pthread name limited to 16 chars (including \0)
-                  "w%03d-n%d-c%03d-t%d",
-#endif
-                  workerId,
-                  pThreadData->numaId,
-                  pThreadData->coreId,
-                  pThreadData->htId);
-        SetCurrentThreadName(threadName);
-    }
-
-    RDTSC_INIT(pContext->pBucketMgr, threadId);
-
-    // Only need offset numa index from base for correct masking
-    uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
-    uint32_t numaMask = pContext->threadPool.numaMask;
-
-    SetOptimalVectorCSR();
-
-    // Track tiles locked by other threads. If we try to lock a macrotile and find its already
-    // locked then we'll add it to this list so that we don't try and lock it again.
-    TileSet lockedTiles;
-
-    // each worker has the ability to work on any of the queued draws as long as certain
-    // conditions are met. the data associated
-    // with a draw is guaranteed to be active as long as a worker hasn't signaled that he
-    // has moved on to the next draw when he determines there is no more work to do. The api
-    // thread will not increment the head of the dc ring until all workers have moved past the
-    // current head.
-    // the logic to determine what to work on is:
-    // 1- try to work on the FE any draw that is queued. For now there are no dependencies
-    //    on the FE work, so any worker can grab any FE and process in parallel.  Eventually
-    //    we'll need dependency tracking to force serialization on FEs.  The worker will try
-    //    to pick an FE by atomically incrementing a counter in the swr context.  he'll keep
-    //    trying until he reaches the tail.
-    // 2- BE work must be done in strict order. we accomplish this today by pulling work off
-    //    the oldest draw (ie the head) of the dcRing. the worker can determine if there is
-    //    any work left by comparing the total # of binned work items and the total # of completed
-    //    work items. If they are equal, then there is no more work to do for this draw, and
-    //    the worker can safely increment its oldestDraw counter and move on to the next draw.
-    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
-
-    auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
-
-    uint32_t curDrawBE = 0;
-    uint32_t curDrawFE = 0;
-
-    bool bShutdown = false;
-
-    while (true)
-    {
-        if (bShutdown && !threadHasWork(curDrawBE))
-        {
-            break;
-        }
-
-        uint32_t loop = 0;
-        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
-        {
-            _mm_pause();
-        }
-
-        if (!threadHasWork(curDrawBE))
-        {
-            lock.lock();
-
-            // check for thread idle condition again under lock
-            if (threadHasWork(curDrawBE))
-            {
-                lock.unlock();
-                continue;
-            }
-
-            pContext->FifosNotEmpty.wait(lock);
-            lock.unlock();
-        }
-
-        if (IsBEThread)
-        {
-            RDTSC_BEGIN(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
-            bShutdown |=
-                WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
-            RDTSC_END(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
-
-            WorkOnCompute(pContext, workerId, curDrawBE);
-        }
-
-        if (IsFEThread)
-        {
-            WorkOnFifoFE(pContext, workerId, curDrawFE);
-
-            if (!IsBEThread)
-            {
-                curDrawBE = curDrawFE;
-            }
-        }
-    }
-
-    return 0;
-}
-// The <false, false> instantiation (a worker that is neither an FE nor a BE thread)
-// is explicitly deleted so that it cannot be used.
-template <>
-DWORD workerThreadMain<false, false>(LPVOID) = delete;
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Thread entry point; forwards to workerThreadMain with the same template
-///        arguments.  When built with MSVC the call is wrapped in a __try/__except
-///        whose filter expression is EXCEPTION_CONTINUE_SEARCH.
-/// @param pData - opaque pointer handed unchanged to workerThreadMain.
-/// @return the value returned by workerThreadMain; the trailing "return 1" is only
-///         reached if control leaves the guarded block without returning.
-template <bool IsFEThread, bool IsBEThread>
-DWORD workerThreadInit(LPVOID pData)
-{
-#if defined(_MSC_VER)
-    __try
-#endif // _MSC_VER
-    {
-        return workerThreadMain<IsFEThread, IsBEThread>(pData);
-    }
-
-#if defined(_MSC_VER)
-    __except (EXCEPTION_CONTINUE_SEARCH)
-    {
-    }
-
-#endif // _MSC_VER
-
-    return 1;
-}
-// As with workerThreadMain, the <false, false> instantiation is explicitly deleted.
-template <>
-DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Allocates and zeroes the per-thread SWR_STATS array of every draw
-///        context in the dcRing.
-/// @param pContext - pointer to context
-/// @param numThreads - number of SWR_STATS entries allocated per draw context
-static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
-{
-    const size_t statsBytes = sizeof(SWR_STATS) * numThreads;
-
-    // Initialize DRAW_CONTEXT's per-thread stats
-    for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
-    {
-        SWR_STATS* pStats = (SWR_STATS*)AlignedMalloc(statsBytes, 64);
-        SWR_ASSERT(pStats);
-
-        // Never memset through a failed allocation.  The (null) pointer is still
-        // stored so that the failure stays visible to later users of pStats.
-        if (pStats)
-        {
-            memset(pStats, 0, statsBytes);
-        }
-
-        pContext->dcRing[dc].dynState.pStats = pStats;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Creates thread pool info but doesn't launch threads.
-///        Derives the worker count from the detected processor topology and the
-///        threadInfo knobs, allocates the per-thread stats, the THREAD_DATA arrays
-///        and the worker private data, and fills in per-worker binding information.
-/// @param pContext - pointer to context
-/// @param pPool - pointer to thread pool object.
-void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
-{
-    CPUNumaNodes nodes;
-    uint32_t     numThreadsPerProcGroup = 0;
-    CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
-    assert(numThreadsPerProcGroup > 0);
-
-    // Assumption, for asymmetric topologies, multi-threaded cores will appear
-    // in the list before single-threaded cores.  This appears to be true for
-    // Windows when the total HW threads is limited to 64.
-    uint32_t numHWNodes        = (uint32_t)nodes.size();
-    uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
-    uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
-
-#if defined(_WIN32) && !defined(_WIN64)
-    if (!pContext->threadInfo.MAX_WORKER_THREADS)
-    {
-        // Limit 32-bit windows to bindable HW threads only
-        if ((numHWCoresPerNode * numHWHyperThreads) > 32)
-        {
-            numHWCoresPerNode = 32 / numHWHyperThreads;
-        }
-    }
-#endif
-
-    // Calculate num HW threads.  Due to asymmetric topologies, this is not
-    // a trivial multiplication.
-    uint32_t numHWThreads = 0;
-    for (auto const& node : nodes)
-    {
-        for (auto const& core : node.cores)
-        {
-            numHWThreads += (uint32_t)core.threadIds.size();
-        }
-    }
-
-    uint32_t numNodes        = numHWNodes;
-    uint32_t numCoresPerNode = numHWCoresPerNode;
-    uint32_t numHyperThreads = numHWHyperThreads;
-
-    // Calc used threads per-core
-    if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
-    {
-        numHyperThreads -= pContext->threadInfo.BASE_THREAD;
-    }
-    else
-    {
-        SWR_ASSERT(false,
-                   "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
-                   pContext->threadInfo.BASE_THREAD,
-                   numHyperThreads);
-        pContext->threadInfo.BASE_THREAD = 0;
-    }
-
-    if (pContext->threadInfo.MAX_THREADS_PER_CORE)
-    {
-        numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
-    }
-
-    // Prune any cores that don't support the number of threads
-    if (numHyperThreads > 1)
-    {
-        for (auto& node : nodes)
-        {
-            uint32_t numUsableCores = 0;
-            for (auto& core : node.cores)
-            {
-                numUsableCores += (core.threadIds.size() >= numHyperThreads);
-            }
-            numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
-        }
-    }
-
-    // Calc used cores per NUMA node
-    if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
-    {
-        numCoresPerNode -= pContext->threadInfo.BASE_CORE;
-    }
-    else
-    {
-        SWR_ASSERT(false,
-                   "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
-                   pContext->threadInfo.BASE_CORE,
-                   numCoresPerNode);
-        pContext->threadInfo.BASE_CORE = 0;
-    }
-
-    if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
-    {
-        numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
-    }
-
-    // Calc used NUMA nodes
-    if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
-    {
-        numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
-    }
-    else
-    {
-        SWR_ASSERT(
-            false,
-            "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
-            pContext->threadInfo.BASE_NUMA_NODE,
-            numNodes);
-        pContext->threadInfo.BASE_NUMA_NODE = 0;
-    }
-
-    if (pContext->threadInfo.MAX_NUMA_NODES)
-    {
-        numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
-    }
-
-    // Calculate numThreads - at this point everything should be symmetric
-    uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
-    SWR_REL_ASSERT(numThreads <= numHWThreads);
-
-    uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
-    uint32_t& numAPIThreadsPerCore  = pContext->apiThreadInfo.numAPIThreadsPerCore;
-    uint32_t  numRemovedThreads     = 0;
-
-    // Three sizing modes: single-threaded, an explicit MAX_WORKER_THREADS count, or the
-    // topology-derived count (optionally reduced by the threads reserved for the API).
-    if (pContext->threadInfo.SINGLE_THREADED)
-    {
-        numAPIReservedThreads      = 0;
-        numThreads                 = 1;
-        pContext->NumWorkerThreads = 1;
-        pContext->NumFEThreads     = 1;
-        pContext->NumBEThreads     = 1;
-        pPool->numThreads          = 0;
-    }
-    else if (pContext->threadInfo.MAX_WORKER_THREADS)
-    {
-        numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
-        pContext->threadInfo.BASE_NUMA_NODE = 0;
-        pContext->threadInfo.BASE_CORE      = 0;
-        pContext->threadInfo.BASE_THREAD    = 0;
-        numAPIReservedThreads               = 0;
-    }
-    else
-    {
-        if (numAPIReservedThreads >= numThreads)
-        {
-            numAPIReservedThreads = 0;
-        }
-        else if (numAPIReservedThreads)
-        {
-            numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);
-
-            if (0 == numAPIThreadsPerCore)
-            {
-                numAPIThreadsPerCore = numHWHyperThreads;
-            }
-
-            numRemovedThreads = numAPIReservedThreads;
-            if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
-            {
-                // Adjust removed threads to make logic below work
-                numRemovedThreads =
-                    std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
-            }
-
-            numThreads -= numRemovedThreads;
-        }
-    }
-
-    InitPerThreadStats(pContext, numThreads);
-
-    // NOTE(review): looks redundant - the SINGLE_THREADED branch above already sets
-    // both of these values.
-    if (pContext->threadInfo.SINGLE_THREADED)
-    {
-        numAPIReservedThreads = 0;
-        numThreads            = 1;
-    }
-
-    if (numAPIReservedThreads)
-    {
-        pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
-        SWR_ASSERT(pPool->pApiThreadData);
-        if (!pPool->pApiThreadData)
-        {
-            // NOTE(review): numRemovedThreads is not reset on this failure path, yet the
-            // binding loop below still consumes it and writes through pApiThreadData -
-            // confirm this path is safe.
-            numAPIReservedThreads = 0;
-        }
-        else
-        {
-            memset(pPool->pApiThreadData, 0, sizeof(THREAD_DATA) * numAPIReservedThreads);
-        }
-    }
-    pPool->numReservedThreads = numAPIReservedThreads;
-
-    pPool->numThreads          = numThreads;
-    pContext->NumWorkerThreads = pPool->numThreads;
-
-    // NOTE(review): assert() is typically compiled out in release builds, which would
-    // leave this nothrow allocation unchecked before the memset.
-    pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
-    assert(pPool->pThreadData);
-    memset(pPool->pThreadData, 0, sizeof(THREAD_DATA) * pPool->numThreads);
-    pPool->numaMask = 0;
-
-    // Allocate worker private data
-    pPool->pWorkerPrivateDataArray = nullptr;
-    if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0)
-    {
-        // No size was supplied: fall back to SWR_WORKER_DATA and drop both callbacks.
-        pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA);
-        pContext->workerPrivateState.pfnInitWorkerData = nullptr;
-        pContext->workerPrivateState.pfnFinishWorkerData = nullptr;
-    }
-
-    // initialize contents of SWR_WORKER_DATA
-    size_t perWorkerSize =
-        AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
-    size_t totalSize = perWorkerSize * pPool->numThreads;
-    if (totalSize)
-    {
-        pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
-        SWR_ASSERT(pPool->pWorkerPrivateDataArray);
-
-        // Hand each worker its own perWorkerSize-byte slice of the single allocation.
-        void* pWorkerData = pPool->pWorkerPrivateDataArray;
-        for (uint32_t i = 0; i < pPool->numThreads; ++i)
-        {
-            pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
-            if (pContext->workerPrivateState.pfnInitWorkerData)
-            {
-                pContext->workerPrivateState.pfnInitWorkerData(pContext, pWorkerData, i);
-            }
-            pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
-        }
-    }
-
-    // Single-threaded mode returns before pThreads is allocated and before any
-    // binding information is filled in.
-    if (pContext->threadInfo.SINGLE_THREADED)
-    {
-        return;
-    }
-
-    pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
-    assert(pPool->pThreads);
-
-    if (pContext->threadInfo.MAX_WORKER_THREADS)
-    {
-        bool     bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
-        uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
-        // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads
-        // But Windows will still require binding to specific process groups
-        for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
-        {
-            pPool->pThreadData[workerId].workerId           = workerId;
-            pPool->pThreadData[workerId].procGroupId        = workerId % numProcGroups;
-            pPool->pThreadData[workerId].threadId           = 0;
-            pPool->pThreadData[workerId].numaId             = 0;
-            pPool->pThreadData[workerId].coreId             = 0;
-            pPool->pThreadData[workerId].htId               = 0;
-            pPool->pThreadData[workerId].pContext           = pContext;
-            pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
-
-            pContext->NumBEThreads++;
-            pContext->NumFEThreads++;
-        }
-    }
-    else
-    {
-        // numa distribution assumes workers on all nodes
-        bool useNuma = true;
-        if (numCoresPerNode * numHyperThreads == 1)
-        {
-            useNuma = false;
-        }
-
-        if (useNuma)
-        {
-            pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
-        }
-        else
-        {
-            pPool->numaMask = 0;
-        }
-
-        // Walk node -> core -> hyperthread.  The first numRemovedThreads slots visited
-        // are recorded as API-reserved threads; the remaining slots become workers.
-        uint32_t workerId           = 0;
-        uint32_t numReservedThreads = numAPIReservedThreads;
-        for (uint32_t n = 0; n < numNodes; ++n)
-        {
-            if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
-            {
-                break;
-            }
-            auto&    node     = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
-            uint32_t numCores = numCoresPerNode;
-            for (uint32_t c = 0; c < numCores; ++c)
-            {
-                if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
-                {
-                    break;
-                }
-
-                auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
-                for (uint32_t t = 0; t < numHyperThreads; ++t)
-                {
-                    if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
-                    {
-                        break;
-                    }
-
-                    if (numRemovedThreads)
-                    {
-                        --numRemovedThreads;
-                        assert(numReservedThreads);
-                        --numReservedThreads;
-                        // NOTE(review): threadIds is indexed with t here, whereas the worker
-                        // path below uses t + BASE_THREAD - confirm this is intended.
-                        pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
-                        pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
-                        pPool->pApiThreadData[numReservedThreads].threadId    = core.threadIds[t];
-                        pPool->pApiThreadData[numReservedThreads].numaId =
-                            useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
-                        pPool->pApiThreadData[numReservedThreads].coreId =
-                            c + pContext->threadInfo.BASE_CORE;
-                        pPool->pApiThreadData[numReservedThreads].htId =
-                            t + pContext->threadInfo.BASE_THREAD;
-                        pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
-                        pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
-
-                        if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
-                        {
-                            // NOTE(review): threadIds[t + 1] is read without a bounds check
-                            // in this block, and htId repeats the value stored just above -
-                            // confirm both against the expected topology.
-                            --numReservedThreads;
-                            pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
-                            pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
-                            pPool->pApiThreadData[numReservedThreads].threadId =
-                                core.threadIds[t + 1];
-                            pPool->pApiThreadData[numReservedThreads].numaId =
-                                useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
-                            pPool->pApiThreadData[numReservedThreads].coreId =
-                                c + pContext->threadInfo.BASE_CORE;
-                            pPool->pApiThreadData[numReservedThreads].htId =
-                                t + pContext->threadInfo.BASE_THREAD;
-                            pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
-                            pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
-                        }
-
-                        continue;
-                    }
-
-                    SWR_ASSERT(workerId < numThreads);
-
-                    pPool->pThreadData[workerId].workerId    = workerId;
-                    pPool->pThreadData[workerId].procGroupId = core.procGroup;
-                    pPool->pThreadData[workerId].threadId =
-                        core.threadIds[t + pContext->threadInfo.BASE_THREAD];
-                    pPool->pThreadData[workerId].numaId =
-                        useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
-                    pPool->pThreadData[workerId].coreId   = c + pContext->threadInfo.BASE_CORE;
-                    pPool->pThreadData[workerId].htId     = t + pContext->threadInfo.BASE_THREAD;
-                    pPool->pThreadData[workerId].pContext = pContext;
-                    pPool->pThreadData[workerId].forceBindProcGroup = false;
-
-                    pContext->NumBEThreads++;
-                    pContext->NumFEThreads++;
-
-                    ++workerId;
-                }
-            }
-        }
-        SWR_ASSERT(workerId == pContext->NumWorkerThreads);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Spawns one std::thread per worker slot of the pool.  Nothing is
-///        launched when the context runs single threaded.
-/// @param pContext - pointer to context
-/// @param pPool - pointer to thread pool object.
-void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
-{
-    if (!pContext->threadInfo.SINGLE_THREADED)
-    {
-        uint32_t slot = 0;
-        while (slot < pContext->NumWorkerThreads)
-        {
-            // Every worker receives a pointer to its own THREAD_DATA entry.
-            THREAD_DATA* pSlotData = &pPool->pThreadData[slot];
-            pPool->pThreads[slot]  = new std::thread(workerThreadInit<true, true>, pSlotData);
-            ++slot;
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Tears down the thread pool: waits for idle, releases every worker
-///        thread object, runs the per-worker finish callback and frees the
-///        arrays owned by the pool.
-/// @param pContext - pointer to context
-/// @param pPool - pointer to thread pool object.
-void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
-{
-    // Let all outstanding work drain before touching any thread state.
-    SwrWaitForIdle(pContext);
-
-    uint32_t idx = 0;
-    while (idx < pPool->numThreads)
-    {
-        if (!pContext->threadInfo.SINGLE_THREADED)
-        {
-            // The thread is detached rather than joined: on Windows, code in some
-            // DLLMain(THREAD_DETACH case) may block the thread until after this
-            // function has returned, so join() cannot be used.
-            THREAD_PTR pThread = pPool->pThreads[idx];
-            pThread->detach();
-            delete pThread;
-        }
-
-        auto pfnFinish = pContext->workerPrivateState.pfnFinishWorkerData;
-        if (pfnFinish)
-        {
-            pfnFinish(pContext, pPool->pThreadData[idx].pWorkerPrivateData, idx);
-        }
-
-        ++idx;
-    }
-
-    // Release the thread handle array and the per-thread bookkeeping.
-    delete[] pPool->pThreads;
-    delete[] pPool->pThreadData;
-    delete[] pPool->pApiThreadData;
-
-    AlignedFree(pPool->pWorkerPrivateDataArray);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
deleted file mode 100644
index 3072bbc835d..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file threads.h
- *
- * @brief Definitions for SWR threading model.
- *
- ******************************************************************************/
-#pragma once
-
-#include "knobs.h"
-
-#include <unordered_set>
-#include <thread>
-// Handle type for a worker thread: a raw pointer to a std::thread.
-typedef std::thread* THREAD_PTR;
-
-// Forward declarations of types that are defined elsewhere.
-struct SWR_CONTEXT;
-struct DRAW_CONTEXT;
-struct SWR_WORKER_PRIVATE_STATE;
-
-// Per-thread descriptor.  CreateThreadPool fills one entry for every worker thread
-// and one for every thread reserved for API use.
-struct THREAD_DATA
-{
-    void*        pWorkerPrivateData; // Pointer to per-worker private data
-    uint32_t     procGroupId;        // Will always be 0 for non-Windows OS
-    uint32_t     threadId;           // within the procGroup for Windows
-    uint32_t     numaId;             // NUMA node id
-    uint32_t     coreId;             // Core id
-    uint32_t     htId;               // Hyperthread id
-    uint32_t     workerId;           // index of worker in total thread data
-                                     // (0xFFFFFFFF for API-reserved entries)
-    void*        clipperData;        // pointer to hang clipper-private data on
-    SWR_CONTEXT* pContext;           // Context this entry was created for
-    bool         forceBindProcGroup; // Only useful when MAX_WORKER_THREADS is set.
-};
-
-// Bookkeeping for the worker thread pool; populated by CreateThreadPool and released
-// by DestroyThreadPool.
-struct THREAD_POOL
-{
-    THREAD_PTR*  pThreads;                // Thread handles; not allocated in single-threaded mode
-    uint32_t     numThreads;              // Number of entries in pThreadData
-    uint32_t     numaMask;                // (numa node count - 1) when NUMA is used, otherwise 0
-    THREAD_DATA* pThreadData;             // Per-worker data, indexed by workerId
-    void*        pWorkerPrivateDataArray; // All memory for worker private data
-    uint32_t     numReservedThreads;      // Number of threads reserved for API use
-    THREAD_DATA* pApiThreadData;          // Per-thread data for the API-reserved threads
-};
-
-// Defined elsewhere; only referenced by WorkOnFifoBE below.
-struct TileSet;
-
-// Thread pool lifecycle: Create builds the pool data without launching anything,
-// Start launches the worker threads, Destroy releases the threads and the pool data.
-void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool);
-void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool);
-void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool);
-
-// Expose FE and BE worker functions to the API thread if single threaded
-void    WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE);
-bool    WorkOnFifoBE(SWR_CONTEXT* pContext,
-                     uint32_t     workerId,
-                     uint32_t&    curDrawBE,
-                     TileSet&     usedTiles,
-                     uint32_t     numaNode,
-                     uint32_t     numaMask);
-void    WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE);
-int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
-
-// NOTE(review): presumably binds the calling API thread using pApiThreadData[apiThreadId];
-// confirm against the definition.
-void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
deleted file mode 100644
index a02fa336277..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ /dev/null
@@ -1,454 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file tilemgr.cpp
- *
- * @brief Implementation for Macro Tile Manager which provides the facilities
- *        for threads to work on a macro tile.
- *
- ******************************************************************************/
-#include <unordered_map>
-
-#include "fifo.hpp"
-#include "core/tilemgr.h"
-#include "core/multisample.h"
-#include "rdtsc_core.h"
-
-// Remembers the caching arena in mArena; no other state is set up here.
-MacroTileMgr::MacroTileMgr(CachingArena& arena)
-    : mArena(arena)
-{
-}
-
-// Queues one backend work item on the macro tile at (x, y).  Coordinates outside
-// the hot tile grid are dropped.  The first item queued on a tile resets the
-// tile's queue and puts the tile on the dirty list.
-void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK* pWork)
-{
-    // Never enqueue more than the hot tile manager has backing storage for.
-    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
-    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
-
-    // Any bit set above the grid's index mask means the coordinate is out of range.
-    const auto xOverflowBits = x & ~(KNOB_NUM_HOT_TILES_X - 1);
-    const auto yOverflowBits = y & ~(KNOB_NUM_HOT_TILES_Y - 1);
-    if (xOverflowBits != 0 || yOverflowBits != 0)
-    {
-        return;
-    }
-
-    const uint32_t tileId = getTileId(x, y);
-
-    // Grow the table with some headroom so that it is not resized on every new id.
-    if (mTiles.size() <= tileId)
-    {
-        mTiles.resize((16 + tileId) * 2);
-    }
-
-    // Queues are created lazily, on first use of a tile id.
-    MacroTileQueue* pQueue = mTiles[tileId];
-    if (pQueue == nullptr)
-    {
-        pQueue         = new MacroTileQueue();
-        mTiles[tileId] = pQueue;
-    }
-
-    pQueue->mWorkItemsFE++;
-    pQueue->mId = tileId;
-
-    const bool firstItemOnTile = (pQueue->mWorkItemsFE == 1);
-    if (firstItemOnTile)
-    {
-        pQueue->clear(mArena);
-        mDirtyTiles.push_back(pQueue);
-    }
-
-    mWorkItemsProduced++;
-    pQueue->enqueue_try_nosync(mArena, pWork);
-}
-
-// Credits every work item queued on the given macro tile as consumed and resets
-// the tile's front-end / back-end counters.
-void MacroTileMgr::markTileComplete(uint32_t id)
-{
-    SWR_ASSERT(id < mTiles.size());
-
-    MacroTileQueue* const pTile     = mTiles[id];
-    const uint32_t        completed = pTile->mWorkItemsFE;
-
-    // Publish the consumed count atomically before touching the tile counters.
-    InterlockedExchangeAdd(&mWorkItemsConsumed, completed);
-    _ReadWriteBarrier();
-
-    pTile->mWorkItemsBE += completed;
-    SWR_ASSERT(pTile->mWorkItemsFE == pTile->mWorkItemsBE);
-
-    // Only the counters are reset here.  The fifo itself is cleared when the next
-    // draw context first queues to this tile, which keeps worker threads from
-    // constantly locking an already completed macro tile.
-    pTile->mWorkItemsFE = 0;
-    pTile->mWorkItemsBE = 0;
-}
-
-// Returns the hot tile of the given attachment for a macro tile, allocating it on
-// demand when 'create' is set.  An existing tile is reallocated when more samples
-// are requested than it holds, and is stored out / reloaded when a different
-// render target array slice is requested.  Returns NULL when the tile has no
-// buffer and 'create' is false.
-HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT*                pContext,
-                                DRAW_CONTEXT*               pDC,
-                                HANDLE                      hWorkerPrivateData,
-                                uint32_t                    macroID,
-                                SWR_RENDERTARGET_ATTACHMENT attachment,
-                                bool                        create,
-                                uint32_t                    numSamples,
-                                uint32_t                    renderTargetArrayIndex)
-{
-    uint32_t tileX, tileY;
-    MacroTileMgr::getTileIndices(macroID, tileX, tileY);
-
-    SWR_ASSERT(tileX < KNOB_NUM_HOT_TILES_X);
-    SWR_ASSERT(tileY < KNOB_NUM_HOT_TILES_Y);
-
-    HOTTILE& hotTile = mHotTiles[tileX][tileY].Attachment[attachment];
-
-    // Both allocation paths size the buffer for the requested sample count and
-    // derive the numa node from the tile coordinates.
-    auto allocTileBuffer = [&]() -> uint8_t* {
-        uint32_t size     = numSamples * mHotTileSize[attachment];
-        uint32_t numaNode = ((tileX ^ tileY) & pContext->threadPool.numaMask);
-        return (uint8_t*)AllocHotTileMem(
-            size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
-    };
-
-    // No buffer yet: either create a fresh, invalid tile or report that none exists.
-    if (hotTile.pBuffer == NULL)
-    {
-        if (!create)
-        {
-            return NULL;
-        }
-
-        hotTile.pBuffer                = allocTileBuffer();
-        hotTile.state                  = HOTTILE_INVALID;
-        hotTile.numSamples             = numSamples;
-        hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
-        return &hotTile;
-    }
-
-    // The existing buffer is too small for the requested sample count: replace it.
-    if (numSamples > hotTile.numSamples)
-    {
-        // A tile that is dropped for a new sample count must be uninitialized,
-        // resolved or pending clear.
-        SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || (hotTile.state == HOTTILE_RESOLVED) ||
-                   (hotTile.state == HOTTILE_CLEAR));
-        FreeHotTileMem(hotTile.pBuffer);
-
-        hotTile.pBuffer    = allocTileBuffer();
-        hotTile.state      = HOTTILE_INVALID;
-        hotTile.numSamples = numSamples;
-    }
-
-    // A different array slice is requested: store the current contents (when there
-    // is anything to store) and load the requested slice.
-    if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
-    {
-        SWR_FORMAT tileFormat = KNOB_COLOR_HOT_TILE_FORMAT;
-        switch (attachment)
-        {
-        case SWR_ATTACHMENT_DEPTH:
-            tileFormat = KNOB_DEPTH_HOT_TILE_FORMAT;
-            break;
-        case SWR_ATTACHMENT_STENCIL:
-            tileFormat = KNOB_STENCIL_HOT_TILE_FORMAT;
-            break;
-        case SWR_ATTACHMENT_COLOR0:
-        case SWR_ATTACHMENT_COLOR1:
-        case SWR_ATTACHMENT_COLOR2:
-        case SWR_ATTACHMENT_COLOR3:
-        case SWR_ATTACHMENT_COLOR4:
-        case SWR_ATTACHMENT_COLOR5:
-        case SWR_ATTACHMENT_COLOR6:
-        case SWR_ATTACHMENT_COLOR7:
-            break;
-        default:
-            // Unknown attachments are reported and treated as color.
-            SWR_INVALID("Unknown attachment: %d", attachment);
-            break;
-        }
-
-        const auto originX = tileX * KNOB_MACROTILE_X_DIM;
-        const auto originY = tileY * KNOB_MACROTILE_Y_DIM;
-
-        // A pending clear is materialized first so that the store below sees real data.
-        if (hotTile.state == HOTTILE_CLEAR)
-        {
-            if (attachment == SWR_ATTACHMENT_DEPTH)
-            {
-                ClearDepthHotTile(&hotTile);
-            }
-            else if (attachment == SWR_ATTACHMENT_STENCIL)
-            {
-                ClearStencilHotTile(&hotTile);
-            }
-            else
-            {
-                ClearColorHotTile(&hotTile);
-            }
-
-            hotTile.state = HOTTILE_DIRTY;
-        }
-
-        if (hotTile.state == HOTTILE_DIRTY)
-        {
-            pContext->pfnStoreTile(pDC,
-                                   hWorkerPrivateData,
-                                   tileFormat,
-                                   attachment,
-                                   originX,
-                                   originY,
-                                   hotTile.renderTargetArrayIndex,
-                                   hotTile.pBuffer);
-        }
-
-        pContext->pfnLoadTile(pDC,
-                              hWorkerPrivateData,
-                              tileFormat,
-                              attachment,
-                              originX,
-                              originY,
-                              renderTargetArrayIndex,
-                              hotTile.pBuffer);
-
-        hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
-        hotTile.state                  = HOTTILE_RESOLVED;
-    }
-
-    return &hotTile;
-}
-
-// Returns the hot tile of the given attachment for a macro tile without ever
-// loading or storing surface data.  When the tile has no buffer yet, one is
-// allocated if 'create' is set; otherwise NULL is returned.
-//
-// The buffer is obtained from AllocHotTileMem, exactly as in GetHotTile: both
-// functions share the same mHotTiles slots and GetHotTile releases an existing
-// buffer with FreeHotTileMem, so the allocation made here has to come from the
-// matching allocator.
-HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT*                pContext,
-                                      DRAW_CONTEXT*               pDC,
-                                      uint32_t                    macroID,
-                                      SWR_RENDERTARGET_ATTACHMENT attachment,
-                                      bool                        create,
-                                      uint32_t                    numSamples)
-{
-    uint32_t x, y;
-    MacroTileMgr::getTileIndices(macroID, x, y);
-
-    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
-    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
-
-    HotTileSet& tile    = mHotTiles[x][y];
-    HOTTILE&    hotTile = tile.Attachment[attachment];
-    if (hotTile.pBuffer == NULL)
-    {
-        if (!create)
-        {
-            return NULL;
-        }
-
-        // Same sizing and numa node selection as GetHotTile.
-        uint32_t size     = numSamples * mHotTileSize[attachment];
-        uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
-        hotTile.pBuffer =
-            (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
-        hotTile.state                  = HOTTILE_INVALID;
-        hotTile.numSamples             = numSamples;
-        hotTile.renderTargetArrayIndex = 0;
-    }
-
-    return &hotTile;
-}
-
-// Fills a color hot tile with its clear color.  The four floats in clearData are
-// broadcast to SIMD16 registers and written in R, G, B, A order for every SIMD16
-// block of every tile (all samples included).
-void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile)
-{
-    float* pClearData = (float*)(pHotTile->clearData);
-
-    // One broadcast register per channel, in the order in which they are stored.
-    const simd16scalar channelValues[4] = {_simd16_broadcast_ss(&pClearData[0]),
-                                           _simd16_broadcast_ss(&pClearData[1]),
-                                           _simd16_broadcast_ss(&pClearData[2]),
-                                           _simd16_broadcast_ss(&pClearData[3])};
-
-    float*         pDst            = (float*)pHotTile->pBuffer;
-    const uint32_t samplesPerTile  = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * pHotTile->numSamples;
-    const uint32_t samplesPerBlock = SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM;
-
-    // The tile coordinates are only used for counting; the destination pointer
-    // simply advances linearly through the buffer.
-    for (uint32_t tileY = 0; tileY < KNOB_MACROTILE_Y_DIM; tileY += KNOB_TILE_Y_DIM)
-    {
-        for (uint32_t tileX = 0; tileX < KNOB_MACROTILE_X_DIM; tileX += KNOB_TILE_X_DIM)
-        {
-            for (uint32_t sample = 0; sample < samplesPerTile; sample += samplesPerBlock)
-            {
-                for (uint32_t channel = 0; channel < 4; ++channel)
-                {
-                    _simd16_store_ps(pDst, channelValues[channel]);
-                    pDst += KNOB_SIMD16_WIDTH;
-                }
-            }
-        }
-    }
-}
-
-void HotTileMgr::ClearDepthHotTile(
-    const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
-{
-    // Load clear color into SIMD register...
-    float*       pClearData = (float*)(pHotTile->clearData);
-    simd16scalar valZ       = _simd16_broadcast_ss(&pClearData[0]);
-
-    float*   pfBuf      = (float*)pHotTile->pBuffer;
-    uint32_t numSamples = pHotTile->numSamples;
-
-    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-    {
-        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-        {
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
-                 si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
-            {
-                _simd16_store_ps(pfBuf, valZ);
-                pfBuf += KNOB_SIMD16_WIDTH;
-            }
-        }
-    }
-}
-
-void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
-{
-    // convert from F32 to U8.
-    uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
-    // broadcast 32x into __m256i...
-    simd16scalari valS = _simd16_set1_epi8(clearVal);
-
-    simd16scalari* pBuf       = (simd16scalari*)pHotTile->pBuffer;
-    uint32_t       numSamples = pHotTile->numSamples;
-
-    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-    {
-        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-        {
-            // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
-                 si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM * 4)
-            {
-                _simd16_store_si(pBuf, valS);
-                pBuf += 1;
-            }
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief InitializeHotTiles
-/// for draw calls, we initialize the active hot tiles and perform deferred
-/// load on them if tile is in invalid state. we do this in the outer thread
-/// loop instead of inside the draw routine itself mainly for performance,
-/// to avoid unnecessary setup every triangle
-/// @todo support deferred clear
-/// @param pCreateInfo - pointer to creation info.
-void HotTileMgr::InitializeHotTiles(SWR_CONTEXT*  pContext,
-                                    DRAW_CONTEXT* pDC,
-                                    uint32_t      workerId,
-                                    uint32_t      macroID)
-{
-    const API_STATE& state    = GetApiState(pDC);
-    HANDLE hWorkerPrivateData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
-
-    uint32_t x, y;
-    MacroTileMgr::getTileIndices(macroID, x, y);
-    x *= KNOB_MACROTILE_X_DIM;
-    y *= KNOB_MACROTILE_Y_DIM;
-
-    uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
-
-    // check RT if enabled
-    unsigned long rtSlot                 = 0;
-    uint32_t      colorHottileEnableMask = state.colorHottileEnable;
-    while (_BitScanForward(&rtSlot, colorHottileEnableMask))
-    {
-        HOTTILE* pHotTile =
-            GetHotTile(pContext,
-                       pDC,
-                       hWorkerPrivateData,
-                       macroID,
-                       (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
-                       true,
-                       numSamples);
-
-        if (pHotTile->state == HOTTILE_INVALID)
-        {
-            RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
-            // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(pDC,
-                                  hWorkerPrivateData,
-                                  KNOB_COLOR_HOT_TILE_FORMAT,
-                                  (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
-                                  x,
-                                  y,
-                                  pHotTile->renderTargetArrayIndex,
-                                  pHotTile->pBuffer);
-            pHotTile->state = HOTTILE_RESOLVED;
-            RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
-        }
-        else if (pHotTile->state == HOTTILE_CLEAR)
-        {
-            RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
-            // Clear the tile.
-            ClearColorHotTile(pHotTile);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
-        }
-        colorHottileEnableMask &= ~(1 << rtSlot);
-    }
-
-    // check depth if enabled
-    if (state.depthHottileEnable)
-    {
-        HOTTILE* pHotTile = GetHotTile(
-            pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
-        if (pHotTile->state == HOTTILE_INVALID)
-        {
-            RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
-            // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(pDC,
-                                  hWorkerPrivateData,
-                                  KNOB_DEPTH_HOT_TILE_FORMAT,
-                                  SWR_ATTACHMENT_DEPTH,
-                                  x,
-                                  y,
-                                  pHotTile->renderTargetArrayIndex,
-                                  pHotTile->pBuffer);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
-        }
-        else if (pHotTile->state == HOTTILE_CLEAR)
-        {
-            RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
-            // Clear the tile.
-            ClearDepthHotTile(pHotTile);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
-        }
-    }
-
-    // check stencil if enabled
-    if (state.stencilHottileEnable)
-    {
-        HOTTILE* pHotTile = GetHotTile(
-            pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
-        if (pHotTile->state == HOTTILE_INVALID)
-        {
-            RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
-            // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(pDC,
-                                  hWorkerPrivateData,
-                                  KNOB_STENCIL_HOT_TILE_FORMAT,
-                                  SWR_ATTACHMENT_STENCIL,
-                                  x,
-                                  y,
-                                  pHotTile->renderTargetArrayIndex,
-                                  pHotTile->pBuffer);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
-        }
-        else if (pHotTile->state == HOTTILE_CLEAR)
-        {
-            RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
-            // Clear the tile.
-            ClearStencilHotTile(pHotTile);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
-        }
-    }
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
deleted file mode 100644
index fb8a4a14881..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ /dev/null
@@ -1,354 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file tilemgr.h
- *
- * @brief Definitions for Macro Tile Manager which provides the facilities
- *        for threads to work on an macro tile.
- *
- ******************************************************************************/
-#pragma once
-
-#include <set>
-#include <unordered_map>
-#include "common/formats.h"
-#include "common/intrin.h"
-#include "fifo.hpp"
-#include "context.h"
-#include "format_traits.h"
-
-//////////////////////////////////////////////////////////////////////////
-/// MacroTile - work queue for a tile.
-//////////////////////////////////////////////////////////////////////////
-struct MacroTileQueue
-{
-    MacroTileQueue() {}
-    ~MacroTileQueue() { destroy(); }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Returns number of work items queued for this tile.
-    uint32_t getNumQueued() { return mFifo.getNumQueued(); }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Attempt to lock the work fifo. If already locked then return false.
-    bool tryLock() { return mFifo.tryLock(); }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Clear fifo and unlock it.
-    template <typename ArenaT>
-    void clear(ArenaT& arena)
-    {
-        mFifo.clear(arena);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Peek at work sitting at the front of the fifo.
-    BE_WORK* peek() { return mFifo.peek(); }
-
-    template <typename ArenaT>
-    bool enqueue_try_nosync(ArenaT& arena, const BE_WORK* entry)
-    {
-        return mFifo.enqueue_try_nosync(arena, entry);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Move to next work item
-    void dequeue() { mFifo.dequeue_noinc(); }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Destroy fifo
-    void destroy() { mFifo.destroy(); }
-
-    ///@todo This will all be private.
-    uint32_t mWorkItemsFE = 0;
-    uint32_t mWorkItemsBE = 0;
-    uint32_t mId          = 0;
-
-private:
-    QUEUE<BE_WORK> mFifo;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// MacroTileMgr - Manages macrotiles for a draw.
-//////////////////////////////////////////////////////////////////////////
-class MacroTileMgr
-{
-public:
-    MacroTileMgr(CachingArena& arena);
-    ~MacroTileMgr()
-    {
-        for (auto* pTile : mTiles)
-        {
-            delete pTile;
-        }
-    }
-
-    INLINE void initialize()
-    {
-        mWorkItemsProduced = 0;
-        mWorkItemsConsumed = 0;
-
-        mDirtyTiles.clear();
-    }
-
-    INLINE std::vector<MacroTileQueue*>& getDirtyTiles() { return mDirtyTiles; }
-    void                                 markTileComplete(uint32_t id);
-
-    INLINE bool isWorkComplete() { return mWorkItemsProduced == mWorkItemsConsumed; }
-
-    void enqueue(uint32_t x, uint32_t y, BE_WORK* pWork);
-
-    static INLINE void getTileIndices(uint32_t tileID, uint32_t& x, uint32_t& y)
-    {
-        // Morton / Z order of tiles
-        x = pext_u32(tileID, 0x55555555);
-        y = pext_u32(tileID, 0xAAAAAAAA);
-    }
-
-    static INLINE uint32_t getTileId(uint32_t x, uint32_t y)
-    {
-        // Morton / Z order of tiles
-        return pdep_u32(x, 0x55555555) | pdep_u32(y, 0xAAAAAAAA);
-    }
-
-private:
-    CachingArena&                mArena;
-    std::vector<MacroTileQueue*> mTiles;
-
-    // Any tile that has work queued to it is a dirty tile.
-    std::vector<MacroTileQueue*> mDirtyTiles;
-
-    OSALIGNLINE(long) mWorkItemsProduced{0};
-    OSALIGNLINE(volatile long) mWorkItemsConsumed{0};
-};
-
-typedef void (*PFN_DISPATCH)(DRAW_CONTEXT* pDC,
-                             uint32_t      workerId,
-                             uint32_t      threadGroupId,
-                             void*&        pSpillFillBuffer,
-                             void*&        pScratchSpace);
-
-//////////////////////////////////////////////////////////////////////////
-/// DispatchQueue - work queue for dispatch
-//////////////////////////////////////////////////////////////////////////
-class DispatchQueue
-{
-public:
-    DispatchQueue() {}
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Setup the producer consumer counts.
-    void initialize(uint32_t totalTasks, void* pTaskData, PFN_DISPATCH pfnDispatch)
-    {
-        // The available and outstanding counts start with total tasks.
-        // At the start there are N tasks available and outstanding.
-        // When both the available and outstanding counts have reached 0 then all work has
-        // completed. When a worker starts on a threadgroup then it decrements the available count.
-        // When a worker completes a threadgroup then it decrements the outstanding count.
-
-        mTasksAvailable   = totalTasks;
-        mTasksOutstanding = totalTasks;
-
-        mpTaskData   = pTaskData;
-        mPfnDispatch = pfnDispatch;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Returns number of tasks available for this dispatch.
-    uint32_t getNumQueued() { return (mTasksAvailable > 0) ? mTasksAvailable : 0; }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Atomically decrement the work available count. If the result
-    //         is greater than 0 then we can on the associated thread group.
-    //         Otherwise, there is no more work to do.
-    bool getWork(uint32_t& groupId)
-    {
-        long result = InterlockedDecrement(&mTasksAvailable);
-
-        if (result >= 0)
-        {
-            groupId = result;
-            return true;
-        }
-
-        return false;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Atomically decrement the outstanding count. A worker is notifying
-    ///        us that he just finished some work. Also, return true if we're
-    ///        the last worker to complete this dispatch.
-    bool finishedWork()
-    {
-        long result = InterlockedDecrement(&mTasksOutstanding);
-        SWR_ASSERT(result >= 0, "Should never oversubscribe work");
-
-        return (result == 0) ? true : false;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Work is complete once both the available/outstanding counts have reached 0.
-    bool isWorkComplete() { return ((mTasksAvailable <= 0) && (mTasksOutstanding <= 0)); }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Return pointer to task data.
-    const void* GetTasksData() { return mpTaskData; }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Dispatches a unit of work
-    void dispatch(DRAW_CONTEXT* pDC,
-                  uint32_t      workerId,
-                  uint32_t      threadGroupId,
-                  void*&        pSpillFillBuffer,
-                  void*&        pScratchSpace)
-    {
-        SWR_ASSERT(mPfnDispatch != nullptr);
-        mPfnDispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
-    }
-
-    void* mpTaskData{nullptr}; // The API thread will set this up and the callback task function
-                               // will interpet this.
-    PFN_DISPATCH mPfnDispatch{nullptr}; // Function to call per dispatch
-
-    OSALIGNLINE(volatile long) mTasksAvailable{0};
-    OSALIGNLINE(volatile long) mTasksOutstanding{0};
-};
-
-/// @note this enum needs to be kept in sync with SWR_TILE_STATE!
-enum HOTTILE_STATE
-{
-    HOTTILE_INVALID,  // tile is in uninitialized state and should be loaded with surface contents
-                      // before rendering
-    HOTTILE_CLEAR,    // tile should be cleared
-    HOTTILE_DIRTY,    // tile has been rendered to
-    HOTTILE_RESOLVED, // tile is consistent with memory (either loaded or stored)
-};
-
-struct HOTTILE
-{
-    uint8_t*      pBuffer;
-    HOTTILE_STATE state;
-    uint32_t clearData[4]; // May need to change based on pfnClearTile implementation.  Reorder for
-                        // alignment?
-    uint32_t numSamples;
-    uint32_t renderTargetArrayIndex; // current render target array index loaded
-};
-
-union HotTileSet
-{
-    struct
-    {
-        HOTTILE Color[SWR_NUM_RENDERTARGETS];
-        HOTTILE Depth;
-        HOTTILE Stencil;
-    };
-    HOTTILE Attachment[SWR_NUM_ATTACHMENTS];
-};
-
-class HotTileMgr
-{
-public:
-    HotTileMgr()
-    {
-        memset(mHotTiles, 0, sizeof(mHotTiles));
-
-        // cache hottile size
-        for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i)
-        {
-            mHotTileSize[i] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM *
-                              FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
-        }
-        mHotTileSize[SWR_ATTACHMENT_DEPTH] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM *
-                                             FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
-        mHotTileSize[SWR_ATTACHMENT_STENCIL] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM *
-                                               FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
-    }
-
-    ~HotTileMgr()
-    {
-        for (int x = 0; x < KNOB_NUM_HOT_TILES_X; ++x)
-        {
-            for (int y = 0; y < KNOB_NUM_HOT_TILES_Y; ++y)
-            {
-                for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a)
-                {
-                    FreeHotTileMem(mHotTiles[x][y].Attachment[a].pBuffer);
-                }
-            }
-        }
-    }
-
-    void InitializeHotTiles(SWR_CONTEXT*  pContext,
-                            DRAW_CONTEXT* pDC,
-                            uint32_t      workerId,
-                            uint32_t      macroID);
-
-    HOTTILE* GetHotTile(SWR_CONTEXT*                pContext,
-                        DRAW_CONTEXT*               pDC,
-                        HANDLE                      hWorkerData,
-                        uint32_t                    macroID,
-                        SWR_RENDERTARGET_ATTACHMENT attachment,
-                        bool                        create,
-                        uint32_t                    numSamples             = 1,
-                        uint32_t                    renderTargetArrayIndex = 0);
-
-    HOTTILE* GetHotTileNoLoad(SWR_CONTEXT*                pContext,
-                              DRAW_CONTEXT*               pDC,
-                              uint32_t                    macroID,
-                              SWR_RENDERTARGET_ATTACHMENT attachment,
-                              bool                        create,
-                              uint32_t                    numSamples = 1);
-
-    static void ClearColorHotTile(const HOTTILE* pHotTile);
-    static void ClearDepthHotTile(const HOTTILE* pHotTile);
-    static void ClearStencilHotTile(const HOTTILE* pHotTile);
-
-private:
-    HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
-    uint32_t   mHotTileSize[SWR_NUM_ATTACHMENTS];
-
-    void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode)
-    {
-        void* p = nullptr;
-#if defined(_WIN32)
-        HANDLE hProcess = GetCurrentProcess();
-        p               = VirtualAllocExNuma(
-            hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode);
-#else
-        p = AlignedMalloc(size, align);
-#endif
-
-        return p;
-    }
-
-    void FreeHotTileMem(void* pBuffer)
-    {
-        if (pBuffer)
-        {
-#if defined(_WIN32)
-            VirtualFree(pBuffer, 0, MEM_RELEASE);
-#else
-            AlignedFree(pBuffer);
-#endif
-        }
-    }
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/tileset.h b/src/gallium/drivers/swr/rasterizer/core/tileset.h
deleted file mode 100644
index e28c84d789f..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/tileset.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file tileset.h
- *
- * @brief Custom bitset class for managing locked tiles
- *
- ******************************************************************************/
-#pragma once
-
-struct TileSet
-{
-    ~TileSet()
-    {
-        if (m_bits)
-        {
-            AlignedFree(m_bits);
-        }
-    }
-    INLINE void set(size_t idx)
-    {
-        _grow(idx);
-        size_t& word = _get_word(idx);
-        word |= (size_t(1) << (idx & BITS_OFFSET));
-        m_maxSet = std::max(m_maxSet, idx + 1);
-    }
-    INLINE bool get(size_t idx)
-    {
-        if (idx >= m_size)
-        {
-            return false;
-        }
-        size_t word = _get_word(idx);
-        return 0 != (word & (size_t(1) << (idx & BITS_OFFSET)));
-    }
-
-    INLINE void clear()
-    {
-        if (m_maxSet)
-        {
-            size_t num_words = (m_maxSet + BITS_OFFSET) / BITS_PER_WORD;
-            memset(m_bits, 0, sizeof(size_t) * num_words);
-            m_maxSet = 0;
-        }
-    }
-
-private:
-    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
-    static const size_t BITS_OFFSET   = BITS_PER_WORD - 1;
-
-    size_t  m_size   = 0;
-    size_t  m_maxSet = 0;
-    size_t* m_bits   = nullptr;
-
-    INLINE size_t& _get_word(size_t idx) { return m_bits[idx / BITS_PER_WORD]; }
-
-    void _grow(size_t idx)
-    {
-        if (idx < m_size)
-        {
-            return;
-        }
-
-        size_t  new_size   = (1 + idx + BITS_OFFSET) & ~BITS_OFFSET;
-        size_t  num_words  = new_size / BITS_PER_WORD;
-        size_t* newBits    = (size_t*)AlignedMalloc(sizeof(size_t) * num_words, 64);
-        size_t  copy_words = 0;
-
-        if (m_bits)
-        {
-            copy_words = (m_size + BITS_OFFSET) / BITS_PER_WORD;
-            num_words -= copy_words;
-            memcpy(newBits, m_bits, copy_words * sizeof(size_t));
-
-            AlignedFree(m_bits);
-        }
-
-        m_bits = newBits;
-        m_size = new_size;
-
-        memset(&m_bits[copy_words], 0, sizeof(size_t) * num_words);
-    }
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
deleted file mode 100644
index 9b483776be9..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ /dev/null
@@ -1,392 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file utils.h
- *
- * @brief Utilities used by SWR core.
- *
- ******************************************************************************/
-#pragma once
-
-#include <string.h>
-#include <type_traits>
-#include <algorithm>
-#include <array>
-#include "common/os.h"
-#include "common/intrin.h"
-#include "common/swr_assert.h"
-#include "core/api.h"
-
-struct simdBBox
-{
-    simdscalari ymin;
-    simdscalari ymax;
-    simdscalari xmin;
-    simdscalari xmax;
-};
-
-struct simd16BBox
-{
-    simd16scalari ymin;
-    simd16scalari ymax;
-    simd16scalari xmin;
-    simd16scalari xmax;
-};
-
-template <typename SIMD_T>
-struct SIMDBBOX_T
-{
-    typename SIMD_T::Integer ymin;
-    typename SIMD_T::Integer ymax;
-    typename SIMD_T::Integer xmin;
-    typename SIMD_T::Integer xmax;
-};
-
-// helper function to unroll loops
-template <int Begin, int End, int Step = 1>
-struct UnrollerL
-{
-    template <typename Lambda>
-    INLINE static void step(Lambda& func)
-    {
-        func(Begin);
-        UnrollerL<Begin + Step, End, Step>::step(func);
-    }
-};
-
-template <int End, int Step>
-struct UnrollerL<End, End, Step>
-{
-    template <typename Lambda>
-    static void step(Lambda& func)
-    {
-    }
-};
-
-// helper function to unroll loops, with mask to skip specific iterations
-template <int Begin, int End, int Step = 1, int Mask = 0x7f>
-struct UnrollerLMask
-{
-    template <typename Lambda>
-    INLINE static void step(Lambda& func)
-    {
-        if (Mask & (1 << Begin))
-        {
-            func(Begin);
-        }
-        UnrollerL<Begin + Step, End, Step>::step(func);
-    }
-};
-
-template <int End, int Step, int Mask>
-struct UnrollerLMask<End, End, Step, Mask>
-{
-    template <typename Lambda>
-    static void step(Lambda& func)
-    {
-    }
-};
-
-// general CRC compute
-INLINE
-uint32_t ComputeCRC(uint32_t crc, const void* pData, uint32_t size)
-{
-#if defined(_WIN64) || defined(__x86_64__)
-    uint32_t  sizeInQwords       = size / sizeof(uint64_t);
-    uint32_t  sizeRemainderBytes = size % sizeof(uint64_t);
-    uint64_t* pDataWords         = (uint64_t*)pData;
-    for (uint32_t i = 0; i < sizeInQwords; ++i)
-    {
-        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
-    }
-#else
-    uint32_t  sizeInDwords       = size / sizeof(uint32_t);
-    uint32_t  sizeRemainderBytes = size % sizeof(uint32_t);
-    uint32_t* pDataWords         = (uint32_t*)pData;
-    for (uint32_t i = 0; i < sizeInDwords; ++i)
-    {
-        crc = _mm_crc32_u32(crc, *pDataWords++);
-    }
-#endif
-
-    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
-    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
-    {
-        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
-    }
-
-    return crc;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Check specified bit within a data word
-//////////////////////////////////////////////////////////////////////////
-template <typename T>
-INLINE static bool CheckBit(T word, uint32_t bit)
-{
-    return 0 != (word & (T(1) << bit));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Add byte offset to any-type pointer
-//////////////////////////////////////////////////////////////////////////
-template <typename T>
-INLINE static T* PtrAdd(T* p, intptr_t offset)
-{
-    intptr_t intp = reinterpret_cast<intptr_t>(p);
-    return reinterpret_cast<T*>(intp + offset);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Is a power-of-2?
-//////////////////////////////////////////////////////////////////////////
-template <typename T>
-INLINE static bool IsPow2(T value)
-{
-    return value == (value & (T(0) - value));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Align down to specified alignment
-/// Note: IsPow2(alignment) MUST be true
-//////////////////////////////////////////////////////////////////////////
-template <typename T1, typename T2>
-INLINE static T1 AlignDownPow2(T1 value, T2 alignment)
-{
-    SWR_ASSERT(IsPow2(alignment));
-    return value & ~T1(alignment - 1);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Align up to specified alignment
-/// Note: IsPow2(alignment) MUST be true
-//////////////////////////////////////////////////////////////////////////
-template <typename T1, typename T2>
-INLINE static T1 AlignUpPow2(T1 value, T2 alignment)
-{
-    return AlignDownPow2(value + T1(alignment - 1), alignment);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Round a pointer up to a power-of-2 alignment
-/// Note: IsPow2(alignment) MUST be true
-//////////////////////////////////////////////////////////////////////////
-template <typename T1, typename T2>
-INLINE static T1* AlignUpPow2(T1* value, T2 alignment)
-{
-    // Work on the integer form of the address, then convert back.
-    const uintptr_t address = reinterpret_cast<uintptr_t>(value);
-    const uintptr_t aligned = AlignDownPow2(address + uintptr_t(alignment - 1), alignment);
-    return reinterpret_cast<T1*>(aligned);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Round a value down to a multiple of an arbitrary alignment
-/// Power-of-2 alignments take the mask-based path, everything else
-/// subtracts the remainder.
-//////////////////////////////////////////////////////////////////////////
-template <typename T1, typename T2>
-INLINE static T1 AlignDown(T1 value, T2 alignment)
-{
-    if (!IsPow2(alignment))
-    {
-        return value - T1(value % alignment);
-    }
-
-    return AlignDownPow2(value, alignment);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Round a pointer down to a multiple of an arbitrary alignment
-//////////////////////////////////////////////////////////////////////////
-template <typename T1, typename T2>
-INLINE static T1* AlignDown(T1* value, T2 alignment)
-{
-    // Defer to the integer overload on the address, then convert back.
-    const uintptr_t aligned = AlignDown(reinterpret_cast<uintptr_t>(value), alignment);
-    return reinterpret_cast<T1*>(aligned);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Round a value up to a multiple of the specified alignment
-/// Note: unlike AlignUpPow2, the alignment does not have to be a power
-///       of 2; the rounding is delegated to AlignDown, which handles
-///       other alignments through its remainder path.
-//////////////////////////////////////////////////////////////////////////
-template <typename T1, typename T2>
-INLINE static T1 AlignUp(T1 value, T2 alignment)
-{
-    // Bias by (alignment - 1), then round down.
-    return AlignDown(value + T1(alignment - 1), alignment);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Round a pointer up to a multiple of the specified alignment
-/// Note: unlike AlignUpPow2, the alignment does not have to be a power
-///       of 2; the rounding is delegated to the pointer overload of
-///       AlignDown.
-//////////////////////////////////////////////////////////////////////////
-template <typename T1, typename T2>
-INLINE static T1* AlignUp(T1* value, T2 alignment)
-{
-    // Advance by (alignment - 1) bytes, then round the address down.
-    return AlignDown(PtrAdd(value, alignment - 1), alignment);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// Helper structure used to access an array of elements that don't
-/// correspond to a typical word size.
-///
-/// ArrayLenT elements of BitsPerElementT bits each are packed into an
-/// array of size_t words, element 0 of each word in the least
-/// significant bits.  BitsPerElementT must divide the word size evenly
-/// (enforced by the static_assert below).
-//////////////////////////////////////////////////////////////////////////
-template <typename T, size_t BitsPerElementT, size_t ArrayLenT>
-class BitsArray
-{
-private:
-    static const size_t BITS_PER_WORD     = sizeof(size_t) * 8;
-    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
-    static const size_t NUM_WORDS         = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
-
-    // Mask with the low BitsPerElementT bits set.  It is built by shifting
-    // an all-ones word right instead of shifting 1 left, because the left
-    // shift would be by the full word width (undefined) when one element
-    // occupies a whole word.
-    static const size_t ELEMENT_MASK = ~size_t(0) >> (BITS_PER_WORD - BitsPerElementT);
-
-    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
-                  "Element size must an integral fraction of pointer size");
-
-    // Backing storage, zero-initialized.
-    size_t m_words[NUM_WORDS] = {};
-
-public:
-    /// Read the element at elementIndex (no bounds check is performed).
-    T operator[](size_t elementIndex) const
-    {
-        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
-        // Move the requested element down to bit 0, then strip its neighbours.
-        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
-        return T(word & ELEMENT_MASK);
-    }
-};
-
-// Ranged integer argument for TemplateArgUnroller
-// The bounds are carried in the type (TMin / TMax template parameters); only
-// the runtime value is stored.  Nothing in this struct checks that val lies
-// within the bounds.
-template <typename T, T TMin, T TMax>
-struct RangedArg
-{
-    T val;
-};
-
-// Shorthand for a RangedArg over uint32_t
-template <uint32_t TMin, uint32_t TMax>
-using IntArg = RangedArg<uint32_t, TMin, TMax>;
-
-// Recursive template used to auto-nest conditionals.  Converts dynamic boolean function
-// arguments to static template arguments.
-//
-// TermT must provide a FuncType typedef and a GetFunc<...>() template; ArgsB accumulates
-// the compile-time types chosen so far.  Every runtime argument consumed appends one type
-// to ArgsB: std::true_type / std::false_type for a bool, std::integral_constant<T, N> for
-// a RangedArg.  Once the last argument is consumed, TermT::GetFunc<ArgsB...>() is returned.
-template <typename TermT, typename... ArgsB>
-struct TemplateArgUnroller
-{
-    //-----------------------------------------
-    // Boolean value
-    //-----------------------------------------
-
-    // Last Arg Terminator
-    static typename TermT::FuncType GetFunc(bool bArg)
-    {
-        if (bArg)
-        {
-            return TermT::template GetFunc<ArgsB..., std::true_type>();
-        }
-
-        return TermT::template GetFunc<ArgsB..., std::false_type>();
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
-    {
-        if (bArg)
-        {
-            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
-        }
-
-        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
-    }
-
-    //-----------------------------------------
-    // Ranged value (within specified range)
-    //-----------------------------------------
-
-    // Last Arg Terminator
-    // Compares the runtime value against TMax; on a mismatch it retries with the upper
-    // bound lowered by one, so the search walks from TMax down towards TMin.
-    template <typename T, T TMin, T TMax>
-    static typename TermT::FuncType GetFunc(RangedArg<T, TMin, TMax> iArg)
-    {
-        if (iArg.val == TMax)
-        {
-            return TermT::template GetFunc<ArgsB..., std::integral_constant<T, TMax>>();
-        }
-        if (TMax > TMin)
-        {
-            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(
-                RangedArg<T, TMin, (T)(int(TMax) - 1)>{iArg.val});
-        }
-        SWR_ASSUME(false);
-        return nullptr;
-    }
-    // Single-value range (TMin == TMax).
-    // NOTE(review): presumably this overload is what ends the TMax - 1 recursion above,
-    // being preferred once both bounds are equal - confirm against overload resolution.
-    template <typename T, T TVal>
-    static typename TermT::FuncType GetFunc(RangedArg<T, TVal, TVal> iArg)
-    {
-        SWR_ASSERT(iArg.val == TVal);
-        return TermT::template GetFunc<ArgsB..., std::integral_constant<T, TVal>>();
-    }
-
-    // Recursively parse args
-    // Same downward search as the terminator above, but once the value is matched the
-    // remaining arguments are handed to the next unroller level.
-    template <typename T, T TMin, T TMax, typename... TArgsT>
-    static typename TermT::FuncType GetFunc(RangedArg<T, TMin, TMax> iArg, TArgsT... remainingArgs)
-    {
-        if (iArg.val == TMax)
-        {
-            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<T, TMax>>::GetFunc(
-                remainingArgs...);
-        }
-        if (TMax > TMin)
-        {
-            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(
-                RangedArg<T, TMin, (T)(int(TMax) - 1)>{iArg.val}, remainingArgs...);
-        }
-        SWR_ASSUME(false);
-        return nullptr;
-    }
-    // Single-value range (TMin == TMax) followed by more arguments.
-    template <typename T, T TVal, typename... TArgsT>
-    static typename TermT::FuncType GetFunc(RangedArg<T, TVal, TVal> iArg, TArgsT... remainingArgs)
-    {
-        SWR_ASSERT(iArg.val == TVal);
-        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<T, TVal>>::GetFunc(
-            remainingArgs...);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Read an environment variable
-/// @param variableName - name of the variable to look up
-/// @return the variable's value, or an empty string when it is not set
-//////////////////////////////////////////////////////////////////////////
-static INLINE std::string GetEnv(const std::string& variableName)
-{
-#if defined(_WIN32)
-    std::string result;
-
-    // First call only queries the required size (terminating null included).
-    const uint32_t sizeWithNull = GetEnvironmentVariableA(variableName.c_str(), nullptr, 0);
-    if (sizeWithNull)
-    {
-        result.resize(sizeWithNull - 1); // std::string length excludes the null
-        GetEnvironmentVariableA(variableName.c_str(), &result[0], sizeWithNull);
-    }
-
-    return result;
-#else
-    const char* pValue = getenv(variableName.c_str());
-    if (pValue == nullptr)
-    {
-        return std::string();
-    }
-
-    return std::string(pValue);
-#endif
-}
-
-/// Set an environment variable for the current process, replacing any
-/// existing value.
-static INLINE void SetEnv(const std::string& variableName, const std::string& value)
-{
-    const char* pName  = variableName.c_str();
-    const char* pValue = value.c_str();
-
-#if defined(_WIN32)
-    SetEnvironmentVariableA(pName, pValue);
-#else
-    setenv(pName, pValue, true);
-#endif
-}
-
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
deleted file mode 100644
index 44482939c76..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ /dev/null
@@ -1,853 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file JitManager.cpp
- *
- * @brief Implementation of the Jit Manager.
- *
- * Notes:
- *
- ******************************************************************************/
-#include "jit_pch.hpp"
-
-#include "JitManager.h"
-#include "jit_api.h"
-#include "fetch_jit.h"
-
-#include "core/state.h"
-
-#include "gen_state_llvm.h"
-
-#include <sstream>
-#if defined(_WIN32)
-#include <psapi.h>
-#include <cstring>
-
-#define INTEL_OUTPUT_DIR "c:\\Intel"
-#define SWR_OUTPUT_DIR INTEL_OUTPUT_DIR "\\SWR"
-#define JITTER_OUTPUT_DIR SWR_OUTPUT_DIR "\\Jitter"
-#endif // _WIN32
-
-#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
-#include <pwd.h>
-#include <sys/stat.h>
-#endif
-
-
-using namespace llvm;
-using namespace SwrJit;
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Constructor for JitManager.
-/// @param simdWidth - SIMD width to be used in generated program.  A value
-///                    of 0 is replaced by 8 on every ISA handled below.
-/// @param arch - forwarded to mArch, which is then queried for ISA support.
-/// @param core - not referenced anywhere in this constructor body.
-JitManager::JitManager(uint32_t simdWidth, const char* arch, const char* core) :
-    mContext(), mBuilder(mContext), mIsModuleFinalized(true), mJitNumber(0), mVWidth(simdWidth),
-    mArch(arch)
-{
-    mpCurrentModule = nullptr;
-    mpExec          = nullptr;
-
-    InitializeNativeTarget();
-    InitializeNativeTargetAsmPrinter();
-    InitializeNativeTargetDisassembler();
-
-
-    // force JIT to use the same CPU arch as the rest of swr
-    // (highest supported ISA first: AVX512F, then AVX2, then AVX)
-    if (mArch.AVX512F())
-    {
-#if USE_SIMD16_SHADERS
-        if (mArch.AVX512ER())
-        {
-            mHostCpuName = StringRef("knl");
-        }
-        else
-        {
-            mHostCpuName = StringRef("skylake-avx512");
-        }
-        mUsingAVX512 = true;
-#else
-        // Without SIMD16 shaders an AVX512 host is targeted as an AVX2 CPU.
-        mHostCpuName = StringRef("core-avx2");
-#endif
-        if (mVWidth == 0)
-        {
-            mVWidth = 8;
-        }
-    }
-    else if (mArch.AVX2())
-    {
-        mHostCpuName = StringRef("core-avx2");
-        if (mVWidth == 0)
-        {
-            mVWidth = 8;
-        }
-    }
-    else if (mArch.AVX())
-    {
-        if (mArch.F16C())
-        {
-            mHostCpuName = StringRef("core-avx-i");
-        }
-        else
-        {
-            mHostCpuName = StringRef("corei7-avx");
-        }
-        if (mVWidth == 0)
-        {
-            mVWidth = 8;
-        }
-    }
-    else
-    {
-        // Neither mHostCpuName nor a default mVWidth is assigned on this path.
-        SWR_INVALID("Jitting requires at least AVX ISA support");
-    }
-
-
-    // Default to the most aggressive level; the knob overrides it only when
-    // its value lies inside the CodeGenOpt range.
-    mOptLevel = CodeGenOpt::Aggressive;
-
-    if (KNOB_JIT_OPTIMIZATION_LEVEL >= CodeGenOpt::None &&
-        KNOB_JIT_OPTIMIZATION_LEVEL <= CodeGenOpt::Aggressive)
-    {
-        mOptLevel = CodeGenOpt::Level(KNOB_JIT_OPTIMIZATION_LEVEL);
-    }
-
-    if (KNOB_JIT_ENABLE_CACHE)
-    {
-        mCache.Init(this, mHostCpuName, mOptLevel);
-    }
-
-    // SetupNewModule() clears mIsModuleFinalized; the flag is set back to
-    // true immediately afterwards.
-    SetupNewModule();
-    mIsModuleFinalized = true;
-
-    // fetch function signature
-#if USE_SIMD16_SHADERS
-    // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simd16vertex& out);
-#else
-    // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out);
-#endif
-    // NOTE(review): four parameter types are pushed below although the typedef
-    // quoted above lists two - confirm against the fetch jit.
-    std::vector<Type*> fsArgs;
-
-    // llvm5 is picky and does not take a void * type
-    fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0));
-
-    fsArgs.push_back(Type::getInt8PtrTy(mContext));
-
-    fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0));
-#if USE_SIMD16_SHADERS
-    fsArgs.push_back(PointerType::get(Gen_simd16vertex(this), 0));
-#else
-    fsArgs.push_back(PointerType::get(Gen_simdvertex(this), 0));
-#endif
-
-    mFetchShaderTy = FunctionType::get(Type::getVoidTy(mContext), fsArgs, false);
-
-#if defined(_MSC_VER)
-    // explicitly instantiate used symbols from potentially staticly linked libs
-    sys::DynamicLibrary::AddSymbol("exp2f", &exp2f);
-    sys::DynamicLibrary::AddSymbol("log2f", &log2f);
-    sys::DynamicLibrary::AddSymbol("sinf", &sinf);
-    sys::DynamicLibrary::AddSymbol("cosf", &cosf);
-    sys::DynamicLibrary::AddSymbol("powf", &powf);
-#endif
-
-#if defined(_WIN32)
-    // Create the dump directory chain one level at a time.
-    if (KNOB_DUMP_SHADER_IR)
-    {
-        CreateDirectoryPath(INTEL_OUTPUT_DIR);
-        CreateDirectoryPath(SWR_OUTPUT_DIR);
-        CreateDirectoryPath(JITTER_OUTPUT_DIR);
-    }
-#endif
-}
-
-/// @brief Create an execution engine that takes ownership of pModule.
-/// The engine becomes the current one (mpExec) and is also appended to
-/// mvExecEngines.
-void JitManager::CreateExecEngine(std::unique_ptr<Module> pModule)
-{
-    // Floating point settings: fused ops allowed, Inf/NaN handling kept,
-    // no unsafe math.
-    TargetOptions targetOptions;
-    targetOptions.AllowFPOpFusion = FPOpFusion::Fast;
-    targetOptions.NoInfsFPMath    = false;
-    targetOptions.NoNaNsFPMath    = false;
-    targetOptions.UnsafeFPMath    = false;
-
-    EngineBuilder engineBuilder(std::move(pModule));
-    engineBuilder.setTargetOptions(targetOptions);
-    engineBuilder.setOptLevel(mOptLevel);
-    engineBuilder.setMCPU(mHostCpuName);
-    mpExec = engineBuilder.create();
-
-    if (KNOB_JIT_ENABLE_CACHE)
-    {
-        mpExec->setObjectCache(&mCache);
-    }
-
-#if LLVM_USE_INTEL_JITEVENTS
-    JITEventListener* pVTuneListener = JITEventListener::createIntelJITEventListener();
-    mpExec->RegisterJITEventListener(pVTuneListener);
-#endif
-
-    mvExecEngines.push_back(mpExec);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Create new LLVM module and an execution engine that owns it.
-/// @note The previous module must already be finalized (asserted).
-void JitManager::SetupNewModule()
-{
-    SWR_ASSERT(mIsModuleFinalized == true && "Current module is not finalized!");
-
-    std::unique_ptr<Module> pFreshModule(new Module("", mContext));
-    pFreshModule->setTargetTriple(sys::getProcessTriple());
-
-    // Keep a raw pointer for later use; ownership moves to the engine.
-    mpCurrentModule = pFreshModule.get();
-    CreateExecEngine(std::move(pFreshModule));
-
-    mIsModuleFinalized = false;
-}
-
-
-/// @brief Build debug info for an LLVM struct type.
-/// @param pType   - struct type to describe
-/// @param name    - name given to the debug struct
-/// @param pFile   - file used as both scope and location of the struct
-/// @param lineNum - line of the struct declaration
-/// @param members - (name, line) pair for each struct element, indexed in
-///                  element order
-/// @return the debug struct type, also recorded in mDebugStructMap
-DIType*
-JitManager::CreateDebugStructType(StructType*                                          pType,
-                                  const std::string&                                   name,
-                                  DIFile*                                              pFile,
-                                  uint32_t                                             lineNum,
-                                  const std::vector<std::pair<std::string, uint32_t>>& members)
-{
-    DIBuilder                 builder(*mpCurrentModule);
-    DataLayout                DL(mpCurrentModule);
-    SmallVector<Metadata*, 8> memberTypes;
-    const DINode::DIFlags     flags = DINode::DIFlags::FlagPublic;
-
-    const uint32_t structSize      = DL.getTypeAllocSizeInBits(pType);
-    const uint32_t structAlignment = DL.getABITypeAlignment(pType);
-
-    // Start with an empty member list; it is filled in at the end.
-    DICompositeType* pDIStructTy = builder.createStructType(pFile,
-                                                            name,
-                                                            pFile,
-                                                            lineNum,
-                                                            structSize,
-                                                            structAlignment,
-                                                            flags,
-                                                            nullptr,
-                                                            builder.getOrCreateArray(memberTypes));
-
-    // Register mapping now to break loops (in case struct contains itself or pointers to itself)
-    mDebugStructMap[pType] = pDIStructTy;
-
-    const uint32_t numElements = pType->getNumElements();
-    for (uint32_t idx = 0; idx < numElements; ++idx)
-    {
-        Type*              pElemTy    = pType->getElementType(idx);
-        const std::string& memberName = members[idx].first;
-        const uint32_t     memberLine = members[idx].second;
-
-        const uint32_t memberSize      = DL.getTypeAllocSizeInBits(pElemTy);
-        const uint32_t memberAlignment = DL.getABITypeAlignment(pElemTy);
-        const uint32_t memberOffset    = DL.getStructLayout(pType)->getElementOffsetInBits(idx);
-
-        llvm::DIType* pMemberDebugTy = GetDebugType(pElemTy);
-        memberTypes.push_back(builder.createMemberType(pDIStructTy,
-                                                       memberName,
-                                                       pFile,
-                                                       memberLine,
-                                                       memberSize,
-                                                       memberAlignment,
-                                                       memberOffset,
-                                                       flags,
-                                                       pMemberDebugTy));
-    }
-
-    pDIStructTy->replaceElements(builder.getOrCreateArray(memberTypes));
-    return pDIStructTy;
-}
-
-/// @brief Create a debug array type from an llvm ArrayType.
-/// The array gets a single subrange built from 0 and its element count.
-DIType* JitManager::GetDebugArrayType(Type* pTy)
-{
-    DIBuilder  builder(*mpCurrentModule);
-    DataLayout DL(mpCurrentModule);
-    ArrayType* pArrayTy = cast<ArrayType>(pTy);
-
-    const uint32_t arraySize      = DL.getTypeAllocSizeInBits(pArrayTy);
-    const uint32_t arrayAlignment = DL.getABITypeAlignment(pArrayTy);
-
-    SmallVector<Metadata*, 8> subscripts;
-    subscripts.push_back(builder.getOrCreateSubrange(0, pArrayTy->getNumElements()));
-
-    DIType* pElemDebugTy = GetDebugType(pArrayTy->getElementType());
-    return builder.createArrayType(
-        arraySize, arrayAlignment, pElemDebugTy, builder.getOrCreateArray(subscripts));
-}
-
-// Create a DIType from llvm Type
-// Basic types are created directly; aggregate, vector and function types are
-// forwarded to the dedicated GetDebug*Type helpers.  Returns nullptr (after
-// asserting) for type IDs that are not handled.
-DIType* JitManager::GetDebugType(Type* pTy)
-{
-    DIBuilder builder(*mpCurrentModule);
-
-    switch (pTy->getTypeID())
-    {
-    case Type::VoidTyID:
-        return builder.createUnspecifiedType("void");
-    case Type::HalfTyID:
-        return builder.createBasicType("float16", 16, dwarf::DW_ATE_float);
-    case Type::FloatTyID:
-        return builder.createBasicType("float", 32, dwarf::DW_ATE_float);
-    case Type::DoubleTyID:
-        return builder.createBasicType("double", 64, dwarf::DW_ATE_float);
-    case Type::IntegerTyID:
-        return GetDebugIntegerType(pTy);
-    case Type::StructTyID:
-        return GetDebugStructType(pTy);
-    case Type::ArrayTyID:
-        return GetDebugArrayType(pTy);
-    case Type::PointerTyID:
-        // Pointers are described as 64 bits wide with 64-bit alignment.
-        return builder.createPointerType(GetDebugType(pTy->getPointerElementType()), 64, 64);
-#if LLVM_VERSION_MAJOR >= 11
-    case Type::FixedVectorTyID:
-#else
-    case Type::VectorTyID:
-#endif
-        return GetDebugVectorType(pTy);
-    case Type::FunctionTyID:
-        return GetDebugFunctionType(pTy);
-    default:
-        SWR_ASSERT(false, "Unimplemented llvm type");
-        break;
-    }
-
-    return nullptr;
-}
-
-// Create a DISubroutineType from an llvm FunctionType
-// The type array holds the return type first, followed by the parameter
-// types in declaration order.
-DIType* JitManager::GetDebugFunctionType(Type* pTy)
-{
-    DIBuilder     builder(*mpCurrentModule);
-    FunctionType* pFuncTy = cast<FunctionType>(pTy);
-
-    SmallVector<Metadata*, 8> signatureTypes;
-    signatureTypes.push_back(GetDebugType(pFuncTy->getReturnType()));
-
-    const unsigned numParams = pFuncTy->getNumParams();
-    for (unsigned i = 0; i < numParams; ++i)
-    {
-        signatureTypes.push_back(GetDebugType(pFuncTy->getParamType(i)));
-    }
-
-    return builder.createSubroutineType(builder.getOrCreateTypeArray(signatureTypes));
-}
-
-/// @brief Create a debug basic type for an llvm IntegerType.
-/// Widths 1, 8, 16, 32, 64 and 128 are supported; any other width asserts
-/// and yields nullptr.
-DIType* JitManager::GetDebugIntegerType(Type* pTy)
-{
-    DIBuilder      builder(*mpCurrentModule);
-    IntegerType*   pIntTy   = cast<IntegerType>(pTy);
-    const unsigned bitWidth = pIntTy->getBitWidth();
-
-    // Every width is described as signed except the 1-bit type.
-    const char* pTypeName = nullptr;
-    unsigned    encoding  = dwarf::DW_ATE_signed;
-
-    switch (bitWidth)
-    {
-    case 1:
-        pTypeName = "int1";
-        encoding  = dwarf::DW_ATE_unsigned;
-        break;
-    case 8:
-        pTypeName = "int8";
-        break;
-    case 16:
-        pTypeName = "int16";
-        break;
-    case 32:
-        pTypeName = "int";
-        break;
-    case 64:
-        pTypeName = "int64";
-        break;
-    case 128:
-        pTypeName = "int128";
-        break;
-    default:
-        SWR_ASSERT(false, "Unimplemented integer bit width");
-        return nullptr;
-    }
-
-    return builder.createBasicType(pTypeName, bitWidth, encoding);
-}
-
-/// @brief Create a debug vector type from an llvm vector type.
-/// The LLVM_VERSION_MAJOR branches select the vector class and accessor names
-/// available in each LLVM release; every branch performs the same steps.
-DIType* JitManager::GetDebugVectorType(Type* pTy)
-{
-    DIBuilder                 builder(*mpCurrentModule);
-#if LLVM_VERSION_MAJOR >= 12
-    FixedVectorType*          pVecTy    = cast<FixedVectorType>(pTy);
-#elif LLVM_VERSION_MAJOR >= 11
-    VectorType*               pVecTy    = cast<VectorType>(pTy);
-#else
-    // Older releases query the vector properties through Type itself.
-    auto                      pVecTy    = pTy;
-#endif
-    DataLayout                DL        = DataLayout(mpCurrentModule);
-    uint32_t                  size      = DL.getTypeAllocSizeInBits(pVecTy);
-    uint32_t                  alignment = DL.getABITypeAlignment(pVecTy);
-    SmallVector<Metadata*, 1> Elems;
-
-    // Single subrange built from 0 and the element count.
-#if LLVM_VERSION_MAJOR >= 11
-    Elems.push_back(builder.getOrCreateSubrange(0, pVecTy->getNumElements()));
-#else
-    Elems.push_back(builder.getOrCreateSubrange(0, pVecTy->getVectorNumElements()));
-#endif
-
-    return builder.createVectorType(size,
-                                    alignment,
-#if LLVM_VERSION_MAJOR >= 11
-                                    GetDebugType(pVecTy->getElementType()),
-#else
-                                    GetDebugType(pVecTy->getVectorElementType()),
-#endif
-                                    builder.getOrCreateArray(Elems));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Dump function x86 assembly to file.
-/// @param pFunction - function whose parent module is emitted
-/// @param fileName  - tag inserted into the output name:
-///                    "<function name>.<fileName>.asm"
-/// @note This should only be called after the module has been jitted to x86 and the
-///       module will not be further accessed.
-/// @note Does nothing unless KNOB_DUMP_SHADER_IR is set, or when the output
-///       file cannot be opened.
-void JitManager::DumpAsm(Function* pFunction, const char* fileName)
-{
-    if (!KNOB_DUMP_SHADER_IR)
-    {
-        return;
-    }
-
-    // The path is assembled in a std::string: function names have no fixed
-    // upper length, so a fixed-size buffer filled by sprintf could overflow.
-    // getName().str() is used because StringRef::data() is not guaranteed
-    // to be null terminated.
-    std::string path;
-#if defined(_WIN32)
-    // Same per-process directory that DumpToFile uses.
-    path = GetOutputDir() + "\\";
-#endif
-    path += pFunction->getName().str();
-    path += ".";
-    path += fileName;
-    path += ".asm";
-
-    std::error_code EC;
-    raw_fd_ostream  filestream(path, EC, llvm::sys::fs::F_None);
-    if (EC)
-    {
-        // Nothing can be written; leave the target machine options untouched.
-        return;
-    }
-
-    Module* pModule = pFunction->getParent();
-    auto*   pTarget = mpExec->getTargetMachine();
-
-    // Verbose asm is enabled only for the duration of this dump.
-    pTarget->Options.MCOptions.AsmVerbose = true;
-
-    legacy::PassManager passManager;
-#if LLVM_VERSION_MAJOR >= 10
-    pTarget->addPassesToEmitFile(passManager, filestream, nullptr, CGFT_AssemblyFile);
-#elif LLVM_VERSION_MAJOR >= 7
-    pTarget->addPassesToEmitFile(
-        passManager, filestream, nullptr, TargetMachine::CGFT_AssemblyFile);
-#else
-    pTarget->addPassesToEmitFile(passManager, filestream, TargetMachine::CGFT_AssemblyFile);
-#endif
-    passManager.run(*pModule);
-
-    pTarget->Options.MCOptions.AsmVerbose = false;
-}
-
-/// @brief Directory that receives this process's dump files.
-/// @return On Windows: JITTER_OUTPUT_DIR followed by the executable's base
-///         name and "_<pid>"; the directory is created as a side effect.
-///         On every other platform: an empty string.
-std::string JitManager::GetOutputDir()
-{
-#if defined(_WIN32)
-    DWORD pid = GetCurrentProcessId();
-    char  procname[MAX_PATH];
-    if (GetModuleFileNameA(NULL, procname, MAX_PATH) == 0)
-    {
-        // The query failed and left the buffer unwritten.
-        procname[0] = '\0';
-    }
-    procname[MAX_PATH - 1] = '\0';
-
-    std::stringstream outDir;
-    outDir << JITTER_OUTPUT_DIR;
-
-    const char* pBaseName = strrchr(procname, '\\');
-    if (pBaseName)
-    {
-        // pBaseName still starts with the separator.
-        outDir << pBaseName;
-    }
-    else
-    {
-        // No separator found: streaming a null pointer would be undefined,
-        // so use the whole name and supply the separator here.
-        outDir << "\\" << procname;
-    }
-    outDir << "_" << pid;
-
-    CreateDirectoryPath(outDir.str().c_str());
-    return outDir.str();
-#else
-    return "";
-#endif
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Dump module IR to file.
-/// @param M         - module to print
-/// @param fileName  - tag inserted into the output name:
-///                    "<module name>.<fileName>.ll"
-/// @param annotater - passed through to Module::print
-/// @note Does nothing unless KNOB_DUMP_SHADER_IR is set, or when the output
-///       file cannot be opened.
-void JitManager::DumpToFile(Module*                         M,
-                            const char*                     fileName,
-                            llvm::AssemblyAnnotationWriter* annotater)
-{
-    if (!KNOB_DUMP_SHADER_IR)
-    {
-        return;
-    }
-
-    // Assembled in a std::string so that long module names cannot overflow
-    // a fixed-size buffer.
-    std::string path;
-#if defined(_WIN32)
-    path = GetOutputDir() + "\\";
-#endif
-    path += M->getName().str();
-    path += ".";
-    path += fileName;
-    path += ".ll";
-
-    std::error_code EC;
-    raw_fd_ostream  fd(path, EC, llvm::sys::fs::F_None);
-    if (EC)
-    {
-        return;
-    }
-
-    M->print(fd, annotater);
-    fd.flush();
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Dump function IR and its control flow graph to file.
-/// @param f        - function to print
-/// @param fileName - tag inserted into the output names:
-///                   "<function name>.<fileName>.ll" and
-///                   "cfg.<function name>.<fileName>.dot"
-/// @note Does nothing unless KNOB_DUMP_SHADER_IR is set.  A file that cannot
-///       be opened is skipped.
-void JitManager::DumpToFile(Function* f, const char* fileName)
-{
-    if (!KNOB_DUMP_SHADER_IR)
-    {
-        return;
-    }
-
-    // Both paths are assembled in std::string objects so that long function
-    // names cannot overflow a fixed-size buffer.
-    std::string dirPrefix;
-#if defined(_WIN32)
-    dirPrefix = GetOutputDir() + "\\";
-#endif
-    // "<function name>.<fileName>" part shared by both files.
-    const std::string stem = f->getName().str() + "." + fileName;
-
-    std::error_code EC;
-
-    raw_fd_ostream fd(dirPrefix + stem + ".ll", EC, llvm::sys::fs::F_None);
-    if (!EC)
-    {
-        f->print(fd, nullptr);
-        fd.flush();
-    }
-
-    raw_fd_ostream fd_cfg(dirPrefix + "cfg." + stem + ".dot", EC, llvm::sys::fs::F_Text);
-    if (!EC)
-    {
-        WriteGraph(fd_cfg, (const Function*)f);
-        fd_cfg.flush();
-    }
-}
-
-extern "C" {
-// While false, JitDestroyContext leaves the context allocated.
-bool g_DllActive = true;
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Create JIT context.
-/// @param targetSimdWidth - SIMD width to be used in generated program.
-/// @param arch - passed through to the JitManager constructor.
-/// @param core - passed through to the JitManager constructor.
-/// @return opaque handle to a newly allocated JitManager.
-HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch, const char* core)
-{
-    JitManager* pJitMgr = new JitManager(targetSimdWidth, arch, core);
-    return pJitMgr;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Destroy JIT context.
-/// @param hJitContext - handle returned by JitCreateContext.
-void JITCALL JitDestroyContext(HANDLE hJitContext)
-{
-    if (!g_DllActive)
-    {
-        return;
-    }
-
-    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitContext);
-    delete pJitMgr;
-}
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// JitCache
-//////////////////////////////////////////////////////////////////////////
-
-//////////////////////////////////////////////////////////////////////////
-/// JitCacheFileHeader
-///
-/// Fixed-size record describing one cached object file.  JitCache writes
-/// and reads it as raw bytes (sizeof(JitCacheFileHeader)), so the order and
-/// types of the data members below define the on-disk layout.
-//////////////////////////////////////////////////////////////////////////
-struct JitCacheFileHeader
-{
-    /// Fill in the per-module fields.  moduleID and cpu are truncated to
-    /// JC_STR_MAX_LEN - 1 characters and always null terminated.  The magic
-    /// number and platform key keep their default member initializers.
-    void Init(uint32_t           llCRC,
-              uint32_t           objCRC,
-              const std::string& moduleID,
-              const std::string& cpu,
-              uint32_t           optLevel,
-              uint64_t           objSize)
-    {
-        m_objSize = objSize;
-        m_llCRC   = llCRC;
-        m_objCRC  = objCRC;
-        strncpy(m_ModuleID, moduleID.c_str(), JC_STR_MAX_LEN - 1);
-        m_ModuleID[JC_STR_MAX_LEN - 1] = 0;
-        strncpy(m_Cpu, cpu.c_str(), JC_STR_MAX_LEN - 1);
-        m_Cpu[JC_STR_MAX_LEN - 1] = 0;
-        m_optLevel                = optLevel;
-    }
-
-
-    /// Check that this header matches the requested module: magic number,
-    /// IR CRC, platform key and optimization level must be equal, and the
-    /// module ID and cpu strings must match over their first
-    /// JC_STR_MAX_LEN - 1 characters.
-    /// Not const: the stored strings are force-terminated before comparing.
-    bool
-    IsValid(uint32_t llCRC, const std::string& moduleID, const std::string& cpu, uint32_t optLevel)
-    {
-        if ((m_MagicNumber != JC_MAGIC_NUMBER) || (m_llCRC != llCRC) ||
-            (m_platformKey != JC_PLATFORM_KEY) || (m_optLevel != optLevel))
-        {
-            return false;
-        }
-
-        m_ModuleID[JC_STR_MAX_LEN - 1] = 0;
-        if (strncmp(moduleID.c_str(), m_ModuleID, JC_STR_MAX_LEN - 1))
-        {
-            return false;
-        }
-
-        m_Cpu[JC_STR_MAX_LEN - 1] = 0;
-        if (strncmp(cpu.c_str(), m_Cpu, JC_STR_MAX_LEN - 1))
-        {
-            return false;
-        }
-
-        return true;
-    }
-
-    uint64_t GetObjectSize() const { return m_objSize; }
-    // Returned as uint64_t although the stored CRC is 32 bits wide.
-    uint64_t GetObjectCRC() const { return m_objCRC; }
-
-private:
-    static const uint64_t JC_MAGIC_NUMBER = 0xfedcba9876543210ULL + 7;
-    static const size_t   JC_STR_MAX_LEN  = 32;
-    // Combines the LLVM major/minor/patch version with a flag that is 1
-    // when pointers are wider than 32 bits.
-    static const uint32_t JC_PLATFORM_KEY = (LLVM_VERSION_MAJOR << 24) |
-                                            (LLVM_VERSION_MINOR << 16) | (LLVM_VERSION_PATCH << 8) |
-                                            ((sizeof(void*) > sizeof(uint32_t)) ? 1 : 0);
-
-    uint64_t m_MagicNumber              = JC_MAGIC_NUMBER;
-    uint64_t m_objSize                  = 0;
-    uint32_t m_llCRC                    = 0;
-    uint32_t m_platformKey              = JC_PLATFORM_KEY;
-    uint32_t m_objCRC                   = 0;
-    uint32_t m_optLevel                 = 0;
-    char     m_ModuleID[JC_STR_MAX_LEN] = {};
-    char     m_Cpu[JC_STR_MAX_LEN]      = {};
-};
-
-/// Compute a CRC over the bitcode serialization of a module.
-static inline uint32_t ComputeModuleCRC(const llvm::Module* M)
-{
-    std::string serialized;
-
-    {
-        raw_string_ostream bitcodeOut(serialized);
-#if LLVM_VERSION_MAJOR >= 7
-        llvm::WriteBitcodeToFile(*M, bitcodeOut);
-#else
-        llvm::WriteBitcodeToFile(M, bitcodeOut);
-#endif
-        // Push everything into the backing string before it is read.
-        bitcodeOut.flush();
-    }
-
-    return ComputeCRC(0, serialized.data(), serialized.size());
-}
-
-/// constructor
-/// Resolves the cache directory from KNOB_JIT_CACHE_DIR and creates it if
-/// it does not exist yet.  On the POSIX-like platforms listed below a
-/// leading "~/" is replaced by the home directory, taken from $HOME or,
-/// failing that, from the password database.  When no home directory can
-/// be determined the knob value is used unexpanded.
-JitCache::JitCache()
-{
-    mCacheDir = KNOB_JIT_CACHE_DIR;
-
-#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
-    if (strncmp(KNOB_JIT_CACHE_DIR.c_str(), "~/", 2) == 0)
-    {
-        const char* homedir = getenv("HOME");
-        if (!homedir)
-        {
-            // getpwuid() returns NULL when the user has no passwd entry,
-            // so its result must be checked before it is dereferenced.
-            const struct passwd* pPasswd = getpwuid(getuid());
-            homedir                      = pPasswd ? pPasswd->pw_dir : nullptr;
-        }
-
-        if (homedir)
-        {
-            mCacheDir = homedir;
-            // Skip the '~' but keep the '/' that follows it.
-            mCacheDir += (KNOB_JIT_CACHE_DIR.c_str() + 1);
-        }
-    }
-#endif
-
-    // Create cache dir at startup to allow jitter to write debug.ll files
-    // to that directory.
-    if (!llvm::sys::fs::exists(mCacheDir.str()) &&
-        llvm::sys::fs::create_directories(mCacheDir.str()))
-    {
-        SWR_INVALID("Unable to create directory: %s", mCacheDir.c_str());
-    }
-
-}
-
-/// Run a command line through ExecCmd and return ExecCmd's result.
-/// @param CmdLine - command line, forwarded unchanged
-/// @param pStdOut - forwarded unchanged to ExecCmd
-/// @param pStdErr - forwarded unchanged to ExecCmd
-/// ExecCmd's second argument is always passed as nullptr.
-int ExecUnhookedProcess(const std::string& CmdLine, std::string* pStdOut, std::string* pStdErr)
-{
-
-    return ExecCmd(CmdLine, nullptr, pStdOut, pStdErr);
-}
-
-/// Calculate actual directory where module will be cached.
-/// This is always a subdirectory of mCacheDir.  Full path name
-/// will be stored in mModuleCacheDir
-void JitCache::CalcModuleCacheDir()
-{
-    mModuleCacheDir.clear();
-
-    llvm::SmallString<MAX_PATH> modulePath = mCacheDir;
-
-    // Create 4 levels of directory hierarchy based on CRC, 256 entries each:
-    // one path component per CRC byte, taken in memory order and written
-    // as a decimal number.
-    const uint8_t* pFirstByte = reinterpret_cast<const uint8_t*>(&mCurrentModuleCRC);
-    for (const uint8_t* pByte = pFirstByte; pByte != pFirstByte + 4; ++pByte)
-    {
-        llvm::sys::path::append(modulePath, std::to_string(int(*pByte)));
-    }
-
-    mModuleCacheDir = modulePath;
-}
-
-/// notifyObjectCompiled - Provides a pointer to compiled code for Module M.
-///
-/// Stores the object in the module cache directory as two files: the raw
-/// object code (module ID + JIT_OBJ_EXT) and a JitCacheFileHeader (module
-/// ID) describing it.  Modules with an empty identifier are not cached.
-/// The header is written only after the object file was written without
-/// error; a failed write leaves the cache entry without a matching header.
-void JitCache::notifyObjectCompiled(const llvm::Module* M, llvm::MemoryBufferRef Obj)
-{
-    const std::string& moduleID = M->getModuleIdentifier();
-    if (!moduleID.length())
-    {
-        return;
-    }
-
-    if (!mModuleCacheDir.size())
-    {
-        SWR_INVALID("Unset module cache directory");
-        return;
-    }
-
-    if (!llvm::sys::fs::exists(mModuleCacheDir.str()) &&
-        llvm::sys::fs::create_directories(mModuleCacheDir.str()))
-    {
-        SWR_INVALID("Unable to create directory: %s", mModuleCacheDir.c_str());
-        return;
-    }
-
-    llvm::SmallString<MAX_PATH> filePath = mModuleCacheDir;
-    llvm::sys::path::append(filePath, moduleID);
-
-    llvm::SmallString<MAX_PATH> objPath = filePath;
-    objPath += JIT_OBJ_EXT;
-
-    // Object code
-    {
-        std::error_code      err;
-        llvm::raw_fd_ostream fileObj(objPath.c_str(), err, llvm::sys::fs::F_None);
-        if (err)
-        {
-            SWR_TRACE("Unable to open object cache file for writing: %s", objPath.c_str());
-            return;
-        }
-
-        fileObj << Obj.getBuffer();
-        fileObj.flush();
-
-        // A stream destroyed with a pending error reports it as fatal, so
-        // the error is cleared here and the entry is abandoned instead.
-        if (fileObj.has_error())
-        {
-            fileObj.clear_error();
-            SWR_TRACE("Unable to write object cache file: %s", objPath.c_str());
-            return;
-        }
-    }
-
-    // Header describing the object
-    {
-        std::error_code      err;
-        llvm::raw_fd_ostream fileObj(filePath.c_str(), err, llvm::sys::fs::F_None);
-        if (err)
-        {
-            SWR_TRACE("Unable to open cache header file for writing: %s", filePath.c_str());
-            return;
-        }
-
-        uint32_t objcrc = ComputeCRC(0, Obj.getBufferStart(), Obj.getBufferSize());
-
-        JitCacheFileHeader header;
-        header.Init(mCurrentModuleCRC, objcrc, moduleID, mCpu, mOptLevel, Obj.getBufferSize());
-
-        fileObj.write((const char*)&header, sizeof(header));
-        fileObj.flush();
-
-        if (fileObj.has_error())
-        {
-            fileObj.clear_error();
-            SWR_TRACE("Unable to write cache header file: %s", filePath.c_str());
-        }
-    }
-}
-
-/// Returns a pointer to a newly allocated MemoryBuffer that contains the
-/// object which corresponds with Module M, or nullptr if an object is not
-/// available.
-///
-/// As a side effect the module CRC and the module cache directory are
-/// recomputed for M.  A cache entry is used only when its header matches
-/// the module (see JitCacheFileHeader::IsValid) and the CRC of the object
-/// data read from disk equals the CRC recorded in the header.
-std::unique_ptr<llvm::MemoryBuffer> JitCache::getObject(const llvm::Module* M)
-{
-    const std::string& moduleID = M->getModuleIdentifier();
-    mCurrentModuleCRC           = ComputeModuleCRC(M);
-
-    if (!moduleID.length())
-    {
-        return nullptr;
-    }
-
-    CalcModuleCacheDir();
-
-    if (!llvm::sys::fs::exists(mModuleCacheDir))
-    {
-        return nullptr;
-    }
-
-    llvm::SmallString<MAX_PATH> filePath = mModuleCacheDir;
-    llvm::sys::path::append(filePath, moduleID);
-
-    llvm::SmallString<MAX_PATH> objFilePath = filePath;
-    objFilePath += JIT_OBJ_EXT;
-
-    FILE* fpObjIn = nullptr;
-    FILE* fpIn    = fopen(filePath.c_str(), "rb");
-    if (!fpIn)
-    {
-        return nullptr;
-    }
-
-    std::unique_ptr<llvm::MemoryBuffer> pBuf = nullptr;
-    // Single-pass loop: every failure breaks out to the shared cleanup below.
-    do
-    {
-        JitCacheFileHeader header;
-        if (!fread(&header, sizeof(header), 1, fpIn))
-        {
-            break;
-        }
-
-        if (!header.IsValid(mCurrentModuleCRC, moduleID, mCpu, mOptLevel))
-        {
-            break;
-        }
-
-        fpObjIn = fopen(objFilePath.c_str(), "rb");
-        if (!fpObjIn)
-        {
-            break;
-        }
-
-#if LLVM_VERSION_MAJOR < 6
-        pBuf = llvm::MemoryBuffer::getNewUninitMemBuffer(size_t(header.GetObjectSize()));
-#else
-        pBuf = llvm::WritableMemoryBuffer::getNewUninitMemBuffer(size_t(header.GetObjectSize()));
-#endif
-        // The size comes from a file on disk; if the buffer could not be
-        // allocated, treat the entry as unavailable instead of
-        // dereferencing a null buffer.
-        if (!pBuf)
-        {
-            break;
-        }
-
-        if (!fread(const_cast<char*>(pBuf->getBufferStart()), header.GetObjectSize(), 1, fpObjIn))
-        {
-            pBuf = nullptr;
-            break;
-        }
-
-        if (header.GetObjectCRC() != ComputeCRC(0, pBuf->getBufferStart(), pBuf->getBufferSize()))
-        {
-            SWR_TRACE("Invalid object cache file, ignoring: %s", filePath.c_str());
-            pBuf = nullptr;
-            break;
-        }
-
-    } while (0);
-
-    fclose(fpIn);
-
-    if (fpObjIn)
-    {
-        fclose(fpObjIn);
-    }
-
-
-    return pBuf;
-}
-
-/// Emit the stored assembly line for an instruction's debug location.
-/// Each time the line number differs from the one seen last, the matching
-/// entry of mAssembly (line numbers are 1-based) is written to OS as a
-/// "; <line>: <text>" comment.  Instructions without a debug location are
-/// ignored.
-void InterleaveAssemblyAnnotater::emitInstructionAnnot(const llvm::Instruction*     pInst,
-                                                       llvm::formatted_raw_ostream& OS)
-{
-    const DebugLoc& debugLoc = pInst->getDebugLoc();
-    if (!debugLoc)
-    {
-        return;
-    }
-
-    const unsigned int sourceLine = debugLoc.getLine();
-    if (sourceLine == mCurrentLineNo)
-    {
-        return;
-    }
-
-    // Only lines that exist in mAssembly can be printed.
-    if (sourceLine > 0 && sourceLine <= mAssembly.size())
-    {
-        OS << "\n; " << sourceLine << ": " << mAssembly[sourceLine - 1] << "\n";
-    }
-
-    // The line is remembered even when it was out of range.
-    mCurrentLineNo = sourceLine;
-}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
deleted file mode 100644
index d96d22e1b95..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file JitManager.h
- *
- * @brief JitManager contains the LLVM data structures used for JIT generation
- *
- * Notes:
- *
- ******************************************************************************/
-#pragma once
-
-#include "jit_pch.hpp"
-#include "common/isa.hpp"
-#include <llvm/IR/AssemblyAnnotationWriter.h>
-
-
-//////////////////////////////////////////////////////////////////////////
-/// JitInstructionSet
-/// @brief Subclass of InstructionSet that allows users to override
-/// the reporting of support for certain ISA features.  This allows capping
-/// the jitted code to a certain feature level, e.g. jit AVX level code on
-/// a platform that supports AVX2.
-//////////////////////////////////////////////////////////////////////////
-class JitInstructionSet : public InstructionSet
-{
-public:
-    JitInstructionSet(const char* requestedIsa) : isaRequest(requestedIsa)
-    {
-        std::transform(isaRequest.begin(), isaRequest.end(), isaRequest.begin(), ::tolower);
-
-        if (isaRequest == "avx")
-        {
-            bForceAVX    = true;
-            bForceAVX2   = false;
-            bForceAVX512 = false;
-        }
-        else if (isaRequest == "avx2")
-        {
-            bForceAVX    = false;
-            bForceAVX2   = true;
-            bForceAVX512 = false;
-        }
-        else if (isaRequest == "avx512")
-        {
-            bForceAVX    = false;
-            bForceAVX2   = false;
-            bForceAVX512 = true;
-        }
-    };
-
-    bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); }
-    bool AVX512F(void) { return (bForceAVX | bForceAVX2) ? 0 : InstructionSet::AVX512F(); }
-    bool AVX512ER(void) { return (bForceAVX | bForceAVX2) ? 0 : InstructionSet::AVX512ER(); }
-    bool BMI2(void) { return bForceAVX ? 0 : InstructionSet::BMI2(); }
-
-private:
-    bool        bForceAVX    = false;
-    bool        bForceAVX2   = false;
-    bool        bForceAVX512 = false;
-    std::string isaRequest;
-};
-
-struct JitLLVMContext : llvm::LLVMContext
-{
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// JitCache
-//////////////////////////////////////////////////////////////////////////
-struct JitManager; // Forward Decl
-class JitCache : public llvm::ObjectCache
-{
-public:
-    /// constructor
-    JitCache();
-    virtual ~JitCache() {}
-
-    void Init(JitManager* pJitMgr, const llvm::StringRef& cpu, llvm::CodeGenOpt::Level level)
-    {
-        mCpu      = cpu.str();
-        mpJitMgr  = pJitMgr;
-        mOptLevel = level;
-    }
-
-    /// notifyObjectCompiled - Provides a pointer to compiled code for Module M.
-    void notifyObjectCompiled(const llvm::Module* M, llvm::MemoryBufferRef Obj) override;
-
-    /// Returns a pointer to a newly allocated MemoryBuffer that contains the
-    /// object which corresponds with Module M, or 0 if an object is not
-    /// available.
-    std::unique_ptr<llvm::MemoryBuffer> getObject(const llvm::Module* M) override;
-
-    const char* GetModuleCacheDir() { return mModuleCacheDir.c_str(); }
-
-private:
-    std::string                 mCpu;
-    llvm::SmallString<MAX_PATH> mCacheDir;
-    llvm::SmallString<MAX_PATH> mModuleCacheDir;
-    uint32_t                    mCurrentModuleCRC = 0;
-    JitManager*                 mpJitMgr          = nullptr;
-    llvm::CodeGenOpt::Level     mOptLevel         = llvm::CodeGenOpt::None;
-
-    /// Calculate actual directory where module will be cached.
-    /// This is always a subdirectory of mCacheDir.  Full absolute
-    /// path name will be stored in mCurrentModuleCacheDir
-    void CalcModuleCacheDir();
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// JitManager
-//////////////////////////////////////////////////////////////////////////
-struct JitManager
-{
-    JitManager(uint32_t w, const char* arch, const char* core);
-    ~JitManager()
-    {
-        for (auto* pExec : mvExecEngines)
-        {
-            delete pExec;
-        }
-    }
-
-    JitLLVMContext                      mContext; ///< LLVM compiler
-    llvm::IRBuilder<>                   mBuilder; ///< LLVM IR Builder
-    llvm::ExecutionEngine*              mpExec;
-    std::vector<llvm::ExecutionEngine*> mvExecEngines;
-    JitCache                            mCache;
-    llvm::StringRef                     mHostCpuName;
-    llvm::CodeGenOpt::Level             mOptLevel;
-
-    // Need to be rebuilt after a JIT and before building new IR
-    llvm::Module* mpCurrentModule;
-    bool          mIsModuleFinalized;
-    uint32_t      mJitNumber;
-
-    uint32_t mVWidth;
-
-    bool mUsingAVX512 = false;
-
-    // fetch shader types
-    llvm::FunctionType* mFetchShaderTy;
-
-    JitInstructionSet mArch;
-
-    // Debugging support
-    std::unordered_map<llvm::StructType*, llvm::DIType*> mDebugStructMap;
-
-    void CreateExecEngine(std::unique_ptr<llvm::Module> M);
-    void SetupNewModule();
-
-    void               DumpAsm(llvm::Function* pFunction, const char* fileName);
-    static void        DumpToFile(llvm::Function* f, const char* fileName);
-    static void        DumpToFile(llvm::Module*                   M,
-                                  const char*                     fileName,
-                                  llvm::AssemblyAnnotationWriter* annotater = nullptr);
-    static std::string GetOutputDir();
-
-    // Debugging support methods
-    llvm::DIType* GetDebugType(llvm::Type* pTy);
-    llvm::DIType* GetDebugIntegerType(llvm::Type* pTy);
-    llvm::DIType* GetDebugArrayType(llvm::Type* pTy);
-    llvm::DIType* GetDebugVectorType(llvm::Type* pTy);
-    llvm::DIType* GetDebugFunctionType(llvm::Type* pTy);
-
-    llvm::DIType* GetDebugStructType(llvm::Type* pType)
-    {
-        llvm::StructType* pStructTy = llvm::cast<llvm::StructType>(pType);
-        if (mDebugStructMap.find(pStructTy) == mDebugStructMap.end())
-        {
-            return nullptr;
-        }
-        return mDebugStructMap[pStructTy];
-    }
-
-    llvm::DIType*
-    CreateDebugStructType(llvm::StructType*                                    pType,
-                          const std::string&                                   name,
-                          llvm::DIFile*                                        pFile,
-                          uint32_t                                             lineNum,
-                          const std::vector<std::pair<std::string, uint32_t>>& members);
-};
-
-class InterleaveAssemblyAnnotater : public llvm::AssemblyAnnotationWriter
-{
-public:
-    void                     emitInstructionAnnot(const llvm::Instruction*     pInst,
-                                                  llvm::formatted_raw_ostream& OS) override;
-    std::vector<std::string> mAssembly;
-
-private:
-    uint32_t mCurrentLineNo = 0;
-};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
deleted file mode 100644
index 80959809806..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ /dev/null
@@ -1,924 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file blend_jit.cpp
- *
- * @brief Implementation of the blend jitter
- *
- * Notes:
- *
- ******************************************************************************/
-#include "jit_pch.hpp"
-#include "builder.h"
-#include "jit_api.h"
-#include "blend_jit.h"
-#include "gen_state_llvm.h"
-#include "functionpasses/passes.h"
-
-#include "util/compiler.h"
-
-// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
-#define QUANTIZE_THRESHOLD 2
-
-using namespace llvm;
-using namespace SwrJit;
-
-//////////////////////////////////////////////////////////////////////////
-/// Interface to Jitting a blend shader
-//////////////////////////////////////////////////////////////////////////
-struct BlendJit : public Builder
-{
-    BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
-
-    template <bool Color, bool Alpha>
-    void GenerateBlendFactor(SWR_BLEND_FACTOR factor,
-                             Value*           constColor[4],
-                             Value*           src[4],
-                             Value*           src1[4],
-                             Value*           dst[4],
-                             Value*           result[4])
-    {
-        Value* out[4];
-
-        switch (factor)
-        {
-        case BLENDFACTOR_ONE:
-            out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
-            break;
-        case BLENDFACTOR_SRC_COLOR:
-            out[0] = src[0];
-            out[1] = src[1];
-            out[2] = src[2];
-            out[3] = src[3];
-            break;
-        case BLENDFACTOR_SRC_ALPHA:
-            out[0] = out[1] = out[2] = out[3] = src[3];
-            break;
-        case BLENDFACTOR_DST_ALPHA:
-            out[0] = out[1] = out[2] = out[3] = dst[3];
-            break;
-        case BLENDFACTOR_DST_COLOR:
-            out[0] = dst[0];
-            out[1] = dst[1];
-            out[2] = dst[2];
-            out[3] = dst[3];
-            break;
-        case BLENDFACTOR_SRC_ALPHA_SATURATE:
-            out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
-            out[3]                   = VIMMED1(1.0f);
-            break;
-        case BLENDFACTOR_CONST_COLOR:
-            out[0] = constColor[0];
-            out[1] = constColor[1];
-            out[2] = constColor[2];
-            out[3] = constColor[3];
-            break;
-        case BLENDFACTOR_CONST_ALPHA:
-            out[0] = out[1] = out[2] = out[3] = constColor[3];
-            break;
-        case BLENDFACTOR_SRC1_COLOR:
-            out[0] = src1[0];
-            out[1] = src1[1];
-            out[2] = src1[2];
-            out[3] = src1[3];
-            break;
-        case BLENDFACTOR_SRC1_ALPHA:
-            out[0] = out[1] = out[2] = out[3] = src1[3];
-            break;
-        case BLENDFACTOR_ZERO:
-            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
-            break;
-        case BLENDFACTOR_INV_SRC_COLOR:
-            out[0] = FSUB(VIMMED1(1.0f), src[0]);
-            out[1] = FSUB(VIMMED1(1.0f), src[1]);
-            out[2] = FSUB(VIMMED1(1.0f), src[2]);
-            out[3] = FSUB(VIMMED1(1.0f), src[3]);
-            break;
-        case BLENDFACTOR_INV_SRC_ALPHA:
-            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
-            break;
-        case BLENDFACTOR_INV_DST_ALPHA:
-            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
-            break;
-        case BLENDFACTOR_INV_DST_COLOR:
-            out[0] = FSUB(VIMMED1(1.0f), dst[0]);
-            out[1] = FSUB(VIMMED1(1.0f), dst[1]);
-            out[2] = FSUB(VIMMED1(1.0f), dst[2]);
-            out[3] = FSUB(VIMMED1(1.0f), dst[3]);
-            break;
-        case BLENDFACTOR_INV_CONST_COLOR:
-            out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
-            out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
-            out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
-            out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
-            break;
-        case BLENDFACTOR_INV_CONST_ALPHA:
-            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
-            break;
-        case BLENDFACTOR_INV_SRC1_COLOR:
-            out[0] = FSUB(VIMMED1(1.0f), src1[0]);
-            out[1] = FSUB(VIMMED1(1.0f), src1[1]);
-            out[2] = FSUB(VIMMED1(1.0f), src1[2]);
-            out[3] = FSUB(VIMMED1(1.0f), src1[3]);
-            break;
-        case BLENDFACTOR_INV_SRC1_ALPHA:
-            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
-            break;
-        default:
-            SWR_INVALID("Unsupported blend factor: %d", factor);
-            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
-            break;
-        }
-
-        if (Color)
-        {
-            result[0] = out[0];
-            result[1] = out[1];
-            result[2] = out[2];
-        }
-
-        if (Alpha)
-        {
-            result[3] = out[3];
-        }
-    }
-
-    void Clamp(SWR_FORMAT format, Value* src[4])
-    {
-        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
-        SWR_TYPE               type = info.type[0];
-
-        switch (type)
-        {
-        default:
-            break;
-
-        case SWR_TYPE_UNORM:
-            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
-            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
-            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
-            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
-            break;
-
-        case SWR_TYPE_SNORM:
-            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
-            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
-            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
-            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
-            break;
-
-        case SWR_TYPE_UNKNOWN:
-            SWR_INVALID("Unsupported format type: %d", type);
-        }
-    }
-
-    void ApplyDefaults(SWR_FORMAT format, Value* src[4])
-    {
-        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
-
-        bool valid[] = {false, false, false, false};
-        for (uint32_t c = 0; c < info.numComps; ++c)
-        {
-            valid[info.swizzle[c]] = true;
-        }
-
-        for (uint32_t c = 0; c < 4; ++c)
-        {
-            if (!valid[c])
-            {
-                src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
-            }
-        }
-    }
-
-    void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
-    {
-        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
-
-        for (uint32_t c = 0; c < info.numComps; ++c)
-        {
-            if (info.type[c] == SWR_TYPE_UNUSED)
-            {
-                src[info.swizzle[c]] =
-                    BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
-            }
-        }
-    }
-
-    void Quantize(SWR_FORMAT format, Value* src[4])
-    {
-        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
-        for (uint32_t c = 0; c < info.numComps; ++c)
-        {
-            if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)
-            {
-                uint32_t swizComp = info.swizzle[c];
-                float    factor   = (float)((1 << info.bpc[c]) - 1);
-                switch (info.type[c])
-                {
-                case SWR_TYPE_UNORM:
-                    src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
-                    src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
-                    src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor));
-                    break;
-                default:
-                    SWR_INVALID("Unsupported format type: %d", info.type[c]);
-                }
-            }
-        }
-    }
-
-    template <bool Color, bool Alpha>
-    void BlendFunc(SWR_BLEND_OP blendOp,
-                   Value*       src[4],
-                   Value*       srcFactor[4],
-                   Value*       dst[4],
-                   Value*       dstFactor[4],
-                   Value*       result[4])
-    {
-        Value* out[4];
-        Value* srcBlend[4];
-        Value* dstBlend[4];
-        for (uint32_t i = 0; i < 4; ++i)
-        {
-            srcBlend[i] = FMUL(src[i], srcFactor[i]);
-            dstBlend[i] = FMUL(dst[i], dstFactor[i]);
-        }
-
-        switch (blendOp)
-        {
-        case BLENDOP_ADD:
-            out[0] = FADD(srcBlend[0], dstBlend[0]);
-            out[1] = FADD(srcBlend[1], dstBlend[1]);
-            out[2] = FADD(srcBlend[2], dstBlend[2]);
-            out[3] = FADD(srcBlend[3], dstBlend[3]);
-            break;
-
-        case BLENDOP_SUBTRACT:
-            out[0] = FSUB(srcBlend[0], dstBlend[0]);
-            out[1] = FSUB(srcBlend[1], dstBlend[1]);
-            out[2] = FSUB(srcBlend[2], dstBlend[2]);
-            out[3] = FSUB(srcBlend[3], dstBlend[3]);
-            break;
-
-        case BLENDOP_REVSUBTRACT:
-            out[0] = FSUB(dstBlend[0], srcBlend[0]);
-            out[1] = FSUB(dstBlend[1], srcBlend[1]);
-            out[2] = FSUB(dstBlend[2], srcBlend[2]);
-            out[3] = FSUB(dstBlend[3], srcBlend[3]);
-            break;
-
-        case BLENDOP_MIN:
-            out[0] = VMINPS(src[0], dst[0]);
-            out[1] = VMINPS(src[1], dst[1]);
-            out[2] = VMINPS(src[2], dst[2]);
-            out[3] = VMINPS(src[3], dst[3]);
-            break;
-
-        case BLENDOP_MAX:
-            out[0] = VMAXPS(src[0], dst[0]);
-            out[1] = VMAXPS(src[1], dst[1]);
-            out[2] = VMAXPS(src[2], dst[2]);
-            out[3] = VMAXPS(src[3], dst[3]);
-            break;
-
-        default:
-            SWR_INVALID("Unsupported blend operation: %d", blendOp);
-            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
-            break;
-        }
-
-        if (Color)
-        {
-            result[0] = out[0];
-            result[1] = out[1];
-            result[2] = out[2];
-        }
-
-        if (Alpha)
-        {
-            result[3] = out[3];
-        }
-    }
-
-    void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
-    {
-        // Op: (s == PS output, d = RT contents)
-        switch (logicOp)
-        {
-        case LOGICOP_CLEAR:
-            result[0] = VIMMED1(0);
-            result[1] = VIMMED1(0);
-            result[2] = VIMMED1(0);
-            result[3] = VIMMED1(0);
-            break;
-
-        case LOGICOP_NOR:
-            // ~(s | d)
-            result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
-            result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
-            result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
-            result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
-            break;
-
-        case LOGICOP_AND_INVERTED:
-            // ~s & d
-            // todo: use avx andnot instr when I can find the intrinsic to call
-            result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
-            result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
-            result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
-            result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
-            break;
-
-        case LOGICOP_COPY_INVERTED:
-            // ~s
-            result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
-            result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
-            result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
-            result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
-            break;
-
-        case LOGICOP_AND_REVERSE:
-            // s & ~d
-            // todo: use avx andnot instr when I can find the intrinsic to call
-            result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
-            result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
-            result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
-            result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
-            break;
-
-        case LOGICOP_INVERT:
-            // ~d
-            result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
-            result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
-            result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
-            result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
-            break;
-
-        case LOGICOP_XOR:
-            // s ^ d
-            result[0] = XOR(src[0], dst[0]);
-            result[1] = XOR(src[1], dst[1]);
-            result[2] = XOR(src[2], dst[2]);
-            result[3] = XOR(src[3], dst[3]);
-            break;
-
-        case LOGICOP_NAND:
-            // ~(s & d)
-            result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
-            result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
-            result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
-            result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
-            break;
-
-        case LOGICOP_AND:
-            // s & d
-            result[0] = AND(src[0], dst[0]);
-            result[1] = AND(src[1], dst[1]);
-            result[2] = AND(src[2], dst[2]);
-            result[3] = AND(src[3], dst[3]);
-            break;
-
-        case LOGICOP_EQUIV:
-            // ~(s ^ d)
-            result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
-            result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
-            result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
-            result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
-            break;
-
-        case LOGICOP_NOOP:
-            result[0] = dst[0];
-            result[1] = dst[1];
-            result[2] = dst[2];
-            result[3] = dst[3];
-            break;
-
-        case LOGICOP_OR_INVERTED:
-            // ~s | d
-            result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
-            result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
-            result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
-            result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
-            break;
-
-        case LOGICOP_COPY:
-            result[0] = src[0];
-            result[1] = src[1];
-            result[2] = src[2];
-            result[3] = src[3];
-            break;
-
-        case LOGICOP_OR_REVERSE:
-            // s | ~d
-            result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
-            result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
-            result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
-            result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
-            break;
-
-        case LOGICOP_OR:
-            // s | d
-            result[0] = OR(src[0], dst[0]);
-            result[1] = OR(src[1], dst[1]);
-            result[2] = OR(src[2], dst[2]);
-            result[3] = OR(src[3], dst[3]);
-            break;
-
-        case LOGICOP_SET:
-            result[0] = VIMMED1(0xFFFFFFFF);
-            result[1] = VIMMED1(0xFFFFFFFF);
-            result[2] = VIMMED1(0xFFFFFFFF);
-            result[3] = VIMMED1(0xFFFFFFFF);
-            break;
-
-        default:
-            SWR_INVALID("Unsupported logic operation: %d", logicOp);
-            result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
-            break;
-        }
-    }
-
-    void
-    AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
-    {
-        // load uint32_t reference
-        Value* pRef = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_alphaTestReference}));
-
-        // load alpha
-        Value* pAlpha = LOAD(ppAlpha, {0, 0});
-
-        Value* pTest = nullptr;
-        if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
-        {
-            // convert float alpha to unorm8
-            Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
-            pAlphaU8        = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
-
-            // compare
-            switch (state.alphaTestFunction)
-            {
-            case ZFUNC_ALWAYS:
-                pTest = VIMMED1(true);
-                break;
-            case ZFUNC_NEVER:
-                pTest = VIMMED1(false);
-                break;
-            case ZFUNC_LT:
-                pTest = ICMP_ULT(pAlphaU8, pRef);
-                break;
-            case ZFUNC_EQ:
-                pTest = ICMP_EQ(pAlphaU8, pRef);
-                break;
-            case ZFUNC_LE:
-                pTest = ICMP_ULE(pAlphaU8, pRef);
-                break;
-            case ZFUNC_GT:
-                pTest = ICMP_UGT(pAlphaU8, pRef);
-                break;
-            case ZFUNC_NE:
-                pTest = ICMP_NE(pAlphaU8, pRef);
-                break;
-            case ZFUNC_GE:
-                pTest = ICMP_UGE(pAlphaU8, pRef);
-                break;
-            default:
-                SWR_INVALID("Invalid alpha test function");
-                break;
-            }
-        }
-        else
-        {
-            // cast ref to float
-            pRef = BITCAST(pRef, mSimdFP32Ty);
-
-            // compare
-            switch (state.alphaTestFunction)
-            {
-            case ZFUNC_ALWAYS:
-                pTest = VIMMED1(true);
-                break;
-            case ZFUNC_NEVER:
-                pTest = VIMMED1(false);
-                break;
-            case ZFUNC_LT:
-                pTest = FCMP_OLT(pAlpha, pRef);
-                break;
-            case ZFUNC_EQ:
-                pTest = FCMP_OEQ(pAlpha, pRef);
-                break;
-            case ZFUNC_LE:
-                pTest = FCMP_OLE(pAlpha, pRef);
-                break;
-            case ZFUNC_GT:
-                pTest = FCMP_OGT(pAlpha, pRef);
-                break;
-            case ZFUNC_NE:
-                pTest = FCMP_ONE(pAlpha, pRef);
-                break;
-            case ZFUNC_GE:
-                pTest = FCMP_OGE(pAlpha, pRef);
-                break;
-            default:
-                SWR_INVALID("Invalid alpha test function");
-                break;
-            }
-        }
-
-        // load current mask
-        Value* pMask = LOAD(ppMask);
-
-        // convert to int1 mask
-        pMask = MASK(pMask);
-
-        // and with alpha test result
-        pMask = AND(pMask, pTest);
-
-        // convert back to vector mask
-        pMask = VMASK(pMask);
-
-        // store new mask
-        STORE(pMask, ppMask);
-    }
-
-    Function* Create(const BLEND_COMPILE_STATE& state)
-    {
-        std::stringstream fnName("BLND_",
-                                 std::ios_base::in | std::ios_base::out | std::ios_base::ate);
-        fnName << ComputeCRC(0, &state, sizeof(state));
-
-        // blend function signature
-        // typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_CONTEXT*);
-
-        std::vector<Type*> args{
-            PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0) // SWR_BLEND_CONTEXT*
-        };
-
-        // std::vector<Type*> args{
-        //    PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0), // SWR_BLEND_CONTEXT*
-        //};
-
-        FunctionType* fTy       = FunctionType::get(IRB()->getVoidTy(), args, false);
-        Function*     blendFunc = Function::Create(
-            fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
-        blendFunc->getParent()->setModuleIdentifier(blendFunc->getName());
-
-        BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
-
-        IRB()->SetInsertPoint(entry);
-
-        // arguments
-        auto   argitr        = blendFunc->arg_begin();
-        Value* pBlendContext = &*argitr++;
-        pBlendContext->setName("pBlendContext");
-        Value* pBlendState = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pBlendState});
-        pBlendState->setName("pBlendState");
-        Value* pSrc = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src});
-        pSrc->setName("src");
-        Value* pSrc1 = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src1});
-        pSrc1->setName("src1");
-        Value* pSrc0Alpha = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src0alpha});
-        pSrc0Alpha->setName("src0alpha");
-        Value* sampleNum = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_sampleNum});
-        sampleNum->setName("sampleNum");
-        Value* pDst = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pDst});
-        pDst->setName("pDst");
-        Value* pResult = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_result});
-        pResult->setName("result");
-        Value* ppoMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_oMask});
-        ppoMask->setName("ppoMask");
-        Value* ppMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pMask});
-        ppMask->setName("pMask");
-
-        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
-                      "Unsupported hot tile format");
-        Value* dst[4];
-        Value* constantColor[4];
-        Value* src[4];
-        Value* src1[4];
-        Value* result[4];
-        for (uint32_t i = 0; i < 4; ++i)
-        {
-            // load hot tile
-            dst[i] = LOAD(pDst, {0, i});
-
-            // load constant color
-            constantColor[i] = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_constantColor, i}));
-
-            // load src
-            src[i] = LOAD(pSrc, {0, i});
-
-            // load src1
-            src1[i] = LOAD(pSrc1, {0, i});
-        }
-        Value* currentSampleMask = VIMMED1(-1);
-        if (state.desc.alphaToCoverageEnable)
-        {
-            Value*   pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
-            uint32_t bits        = (1 << state.desc.numSamples) - 1;
-            currentSampleMask    = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
-            currentSampleMask    = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty);
-        }
-
-        // alpha test
-        if (state.desc.alphaTestEnable)
-        {
-            // Gather for archrast stats
-            STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
-            AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
-        }
-        else
-        {
-            // Gather for archrast stats
-            STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
-        }
-
-        // color blend
-        if (state.blendState.blendEnable)
-        {
-            // Gather for archrast stats
-            STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});
-
-            // clamp sources
-            Clamp(state.format, src);
-            Clamp(state.format, src1);
-            Clamp(state.format, dst);
-            Clamp(state.format, constantColor);
-
-            // apply defaults to hottile contents to take into account missing components
-            ApplyDefaults(state.format, dst);
-
-            // Force defaults for unused 'X' components
-            ApplyUnusedDefaults(state.format, dst);
-
-            // Quantize low precision components
-            Quantize(state.format, dst);
-
-            // special case clamping for R11G11B10_float which has no sign bit
-            if (state.format == R11G11B10_FLOAT)
-            {
-                dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
-                dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
-                dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
-                dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
-            }
-
-            Value* srcFactor[4];
-            Value* dstFactor[4];
-            if (state.desc.independentAlphaBlendEnable)
-            {
-                GenerateBlendFactor<true, false>(
-                    state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
-                GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor,
-                                                 constantColor,
-                                                 src,
-                                                 src1,
-                                                 dst,
-                                                 srcFactor);
-
-                GenerateBlendFactor<true, false>(
-                    state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
-                GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor,
-                                                 constantColor,
-                                                 src,
-                                                 src1,
-                                                 dst,
-                                                 dstFactor);
-
-                BlendFunc<true, false>(
-                    state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
-                BlendFunc<false, true>(
-                    state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
-            }
-            else
-            {
-                GenerateBlendFactor<true, true>(
-                    state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
-                GenerateBlendFactor<true, true>(
-                    state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
-
-                BlendFunc<true, true>(
-                    state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
-            }
-
-            // store results out
-            for (uint32_t i = 0; i < 4; ++i)
-            {
-                STORE(result[i], pResult, {0, i});
-            }
-        }
-        else
-        {
-            // Gather for archrast stats
-            STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});
-        }
-
-        if (state.blendState.logicOpEnable)
-        {
-            const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
-            Value*                 vMask[4];
-            float                  scale[4];
-
-            if (!state.blendState.blendEnable)
-            {
-                Clamp(state.format, src);
-                Clamp(state.format, dst);
-            }
-
-            for (uint32_t i = 0; i < 4; i++)
-            {
-                if (info.type[i] == SWR_TYPE_UNUSED)
-                {
-                    continue;
-                }
-
-                if (info.bpc[i] >= 32)
-                {
-                    vMask[i] = VIMMED1(0xFFFFFFFF);
-                    scale[i] = 0xFFFFFFFF;
-                }
-                else
-                {
-                    vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
-                    if (info.type[i] == SWR_TYPE_SNORM)
-                        scale[i] = (1 << (info.bpc[i] - 1)) - 1;
-                    else
-                        scale[i] = (1 << info.bpc[i]) - 1;
-                }
-
-                switch (info.type[i])
-                {
-                default:
-                    SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
-                    break;
-
-                case SWR_TYPE_UNKNOWN:
-                case SWR_TYPE_UNUSED:
-                    FALLTHROUGH;
-
-                case SWR_TYPE_UINT:
-                case SWR_TYPE_SINT:
-                    src[i] = BITCAST(src[i], mSimdInt32Ty);
-                    dst[i] = BITCAST(dst[i], mSimdInt32Ty);
-                    break;
-                case SWR_TYPE_SNORM:
-                    src[i] = FP_TO_SI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
-                    dst[i] = FP_TO_SI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
-                    break;
-                case SWR_TYPE_UNORM:
-                    src[i] = FP_TO_UI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
-                    dst[i] = FP_TO_UI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
-                    break;
-                }
-            }
-
-            LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
-
-            // store results out
-            for (uint32_t i = 0; i < 4; ++i)
-            {
-                if (info.type[i] == SWR_TYPE_UNUSED)
-                {
-                    continue;
-                }
-
-                // clear upper bits from PS output not in RT format after doing logic op
-                result[i] = AND(result[i], vMask[i]);
-
-                switch (info.type[i])
-                {
-                default:
-                    SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
-                    break;
-
-                case SWR_TYPE_UNKNOWN:
-                case SWR_TYPE_UNUSED:
-                    FALLTHROUGH;
-
-                case SWR_TYPE_UINT:
-                case SWR_TYPE_SINT:
-                    result[i] = BITCAST(result[i], mSimdFP32Ty);
-                    break;
-                case SWR_TYPE_SNORM:
-                    result[i] = SHL(result[i], C(32 - info.bpc[i]));
-                    result[i] = ASHR(result[i], C(32 - info.bpc[i]));
-                    result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
-                    break;
-                case SWR_TYPE_UNORM:
-                    result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
-                    break;
-                }
-
-                STORE(result[i], pResult, {0, i});
-            }
-        }
-
-        if (state.desc.oMaskEnable)
-        {
-            assert(!(state.desc.alphaToCoverageEnable));
-            // load current mask
-            Value* oMask      = LOAD(ppoMask);
-            currentSampleMask = AND(oMask, currentSampleMask);
-        }
-
-        if (state.desc.sampleMaskEnable)
-        {
-            Value* sampleMask = LOAD(pBlendState, {0, SWR_BLEND_STATE_sampleMask});
-            currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask);
-        }
-
-        if (state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
-            state.desc.oMaskEnable)
-        {
-            // load coverage mask and mask off any lanes with no samples
-            Value* pMask        = LOAD(ppMask);
-            Value* sampleMasked = SHL(C(1), sampleNum);
-            currentSampleMask   = AND(currentSampleMask, VBROADCAST(sampleMasked));
-            currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty);
-            Value* outputMask = AND(pMask, currentSampleMask);
-            // store new mask
-            STORE(outputMask, GEP(ppMask, C(0)));
-        }
-
-        RET_VOID();
-
-        JitManager::DumpToFile(blendFunc, "");
-
-        ::FunctionPassManager passes(JM()->mpCurrentModule);
-
-        passes.add(createBreakCriticalEdgesPass());
-        passes.add(createCFGSimplificationPass());
-        passes.add(createEarlyCSEPass());
-        passes.add(createPromoteMemoryToRegisterPass());
-        passes.add(createCFGSimplificationPass());
-        passes.add(createEarlyCSEPass());
-        passes.add(createInstructionCombiningPass());
-#if LLVM_VERSION_MAJOR <= 11
-        passes.add(createConstantPropagationPass());
-#endif
-        passes.add(createSCCPPass());
-        passes.add(createAggressiveDCEPass());
-
-        passes.add(createLowerX86Pass(this));
-
-        passes.run(*blendFunc);
-
-        JitManager::DumpToFile(blendFunc, "optimized");
-
-        return blendFunc;
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief JITs from fetch shader IR
-/// @param hJitMgr - JitManager handle
-/// @param func   - LLVM function IR
-/// @return PFN_FETCH_FUNC - pointer to fetch code
-PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
-{
-    const llvm::Function* func    = (const llvm::Function*)hFunc;
-    JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
-    PFN_BLEND_JIT_FUNC    pfnBlend;
-    pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
-    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
-    // add new IR to the module
-    pJitMgr->mIsModuleFinalized = true;
-
-    return pfnBlend;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief JIT compiles blend shader
-/// @param hJitMgr - JitManager handle
-/// @param state   - blend state to build function from
-extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE                     hJitMgr,
-                                                      const BLEND_COMPILE_STATE& state)
-{
-    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
-
-    pJitMgr->SetupNewModule();
-
-    BlendJit theJit(pJitMgr);
-    HANDLE   hFunc = theJit.Create(state);
-
-    return JitBlendFunc(hJitMgr, hFunc);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h
deleted file mode 100644
index 3e78054eced..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file blend_jit.h
- *
- * @brief Definition of the blend jitter
- *
- * Notes:
- *
- ******************************************************************************/
-#pragma once
-
-#include "common/formats.h"
-#include "core/state.h"
-
-struct RENDER_TARGET_BLEND_COMPILE_STATE
-{
-    bool             blendEnable;
-    bool             logicOpEnable;
-    SWR_BLEND_FACTOR sourceAlphaBlendFactor;
-    SWR_BLEND_FACTOR destAlphaBlendFactor;
-    SWR_BLEND_FACTOR sourceBlendFactor;
-    SWR_BLEND_FACTOR destBlendFactor;
-    SWR_BLEND_OP     colorBlendFunc;
-    SWR_BLEND_OP     alphaBlendFunc;
-    SWR_LOGIC_OP     logicOpFunc;
-};
-
-enum ALPHA_TEST_FORMAT
-{
-    ALPHA_TEST_UNORM8,
-    ALPHA_TEST_FLOAT32
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// BLEND_DESC
-//////////////////////////////////////////////////////////////////////////
-struct BLEND_DESC
-{
-    union
-    {
-        struct
-        {
-            uint32_t alphaTestEnable : 1;
-            uint32_t independentAlphaBlendEnable : 1;
-            uint32_t alphaToCoverageEnable : 1;
-            uint32_t oMaskEnable : 1;
-            uint32_t inputCoverageEnable : 1;
-            uint32_t sampleMaskEnable : 1;
-            uint32_t numSamples : 5;
-            uint32_t _reserved : 21;
-        };
-        uint32_t bits;
-    };
-};
-#define BLEND_ENABLE_MASK 0x3D // a2c | oMaskEnable | inputCoverageEnable | sampleMaskEnable
-//////////////////////////////////////////////////////////////////////////
-/// State required for blend jit
-//////////////////////////////////////////////////////////////////////////
-struct BLEND_COMPILE_STATE
-{
-    SWR_FORMAT                        format; // format of render target being blended
-    RENDER_TARGET_BLEND_COMPILE_STATE blendState;
-    BLEND_DESC                        desc;
-
-    SWR_ZFUNCTION     alphaTestFunction;
-    ALPHA_TEST_FORMAT alphaTestFormat;
-
-    bool operator==(const BLEND_COMPILE_STATE& other) const
-    {
-        return memcmp(this, &other, sizeof(BLEND_COMPILE_STATE)) == 0;
-    }
-
-    // Canonicalize state to reduce unnecessary JIT compiles
-    void Canonicalize()
-    {
-        if (!desc.alphaTestEnable)
-        {
-            alphaTestFormat   = (ALPHA_TEST_FORMAT)0;
-            alphaTestFunction = (SWR_ZFUNCTION)0;
-        }
-
-        if (!blendState.blendEnable)
-        {
-            blendState.sourceAlphaBlendFactor = (SWR_BLEND_FACTOR)0;
-            blendState.destAlphaBlendFactor   = (SWR_BLEND_FACTOR)0;
-            blendState.sourceBlendFactor      = (SWR_BLEND_FACTOR)0;
-            blendState.destBlendFactor        = (SWR_BLEND_FACTOR)0;
-            blendState.colorBlendFunc         = (SWR_BLEND_OP)0;
-            blendState.alphaBlendFunc         = (SWR_BLEND_OP)0;
-        }
-
-        if (!blendState.logicOpEnable)
-        {
-            blendState.logicOpFunc = (SWR_LOGIC_OP)0;
-        }
-
-        if (!blendState.blendEnable && !blendState.logicOpEnable)
-        {
-            format = (SWR_FORMAT)0;
-        }
-
-        if (!desc.independentAlphaBlendEnable)
-        {
-            blendState.sourceAlphaBlendFactor = (SWR_BLEND_FACTOR)0;
-            blendState.destAlphaBlendFactor   = (SWR_BLEND_FACTOR)0;
-            blendState.alphaBlendFunc         = (SWR_BLEND_OP)0;
-        }
-    }
-};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
deleted file mode 100644
index cd4b5f31ea3..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file builder.h
- *
- * @brief Includes all the builder related functionality
- *
- * Notes:
- *
- ******************************************************************************/
-
-#include "jit_pch.hpp"
-#include "builder.h"
-
-namespace SwrJit
-{
-    using namespace llvm;
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Contructor for Builder.
-    /// @param pJitMgr - JitManager which contains modules, function passes, etc.
-    Builder::Builder(JitManager* pJitMgr) : mpJitMgr(pJitMgr), mpPrivateContext(nullptr)
-    {
-        mVWidth   = pJitMgr->mVWidth;
-        mVWidth16 = 16;
-
-        mpIRBuilder = &pJitMgr->mBuilder;
-
-        // Built in types: scalar
-
-        mVoidTy     = Type::getVoidTy(pJitMgr->mContext);
-        mFP16Ty     = Type::getHalfTy(pJitMgr->mContext);
-        mFP32Ty     = Type::getFloatTy(pJitMgr->mContext);
-        mFP32PtrTy  = PointerType::get(mFP32Ty, 0);
-        mDoubleTy   = Type::getDoubleTy(pJitMgr->mContext);
-        mInt1Ty     = Type::getInt1Ty(pJitMgr->mContext);
-        mInt8Ty     = Type::getInt8Ty(pJitMgr->mContext);
-        mInt16Ty    = Type::getInt16Ty(pJitMgr->mContext);
-        mInt32Ty    = Type::getInt32Ty(pJitMgr->mContext);
-        mInt64Ty    = Type::getInt64Ty(pJitMgr->mContext);
-        mInt8PtrTy  = PointerType::get(mInt8Ty, 0);
-        mInt16PtrTy = PointerType::get(mInt16Ty, 0);
-        mInt32PtrTy = PointerType::get(mInt32Ty, 0);
-        mInt64PtrTy = PointerType::get(mInt64Ty, 0);
-        mHandleTy   = mInt8PtrTy;
-
-        mSimd4FP64Ty = getVectorType(mDoubleTy, 4);
-
-        // Built in types: target simd
-        SetTargetWidth(pJitMgr->mVWidth);
-
-        // Built in types: simd16
-
-        mSimd16Int1Ty     = getVectorType(mInt1Ty, mVWidth16);
-        mSimd16Int16Ty    = getVectorType(mInt16Ty, mVWidth16);
-        mSimd16Int32Ty    = getVectorType(mInt32Ty, mVWidth16);
-        mSimd16Int64Ty    = getVectorType(mInt64Ty, mVWidth16);
-        mSimd16FP16Ty     = getVectorType(mFP16Ty, mVWidth16);
-        mSimd16FP32Ty     = getVectorType(mFP32Ty, mVWidth16);
-        mSimd16VectorTy   = ArrayType::get(mSimd16FP32Ty, 4);
-        mSimd16VectorTRTy = ArrayType::get(mSimd16FP32Ty, 5);
-
-        mSimd32Int8Ty = getVectorType(mInt8Ty, 32);
-
-        if (sizeof(uint32_t*) == 4)
-        {
-            mIntPtrTy       = mInt32Ty;
-            mSimdIntPtrTy   = mSimdInt32Ty;
-            mSimd16IntPtrTy = mSimd16Int32Ty;
-        }
-        else
-        {
-            SWR_ASSERT(sizeof(uint32_t*) == 8);
-
-            mIntPtrTy       = mInt64Ty;
-            mSimdIntPtrTy   = mSimdInt64Ty;
-            mSimd16IntPtrTy = mSimd16Int64Ty;
-        }
-    }
-
-    void Builder::SetTargetWidth(uint32_t width)
-    {
-        mVWidth = width;
-
-        mSimdInt1Ty      = getVectorType(mInt1Ty, mVWidth);
-        mSimdInt16Ty     = getVectorType(mInt16Ty, mVWidth);
-        mSimdInt32Ty     = getVectorType(mInt32Ty, mVWidth);
-        mSimdInt64Ty     = getVectorType(mInt64Ty, mVWidth);
-        mSimdFP16Ty      = getVectorType(mFP16Ty, mVWidth);
-        mSimdFP32Ty      = getVectorType(mFP32Ty, mVWidth);
-        mSimdVectorTy    = ArrayType::get(mSimdFP32Ty, 4);
-        mSimdVectorIntTy = ArrayType::get(mSimdInt32Ty, 4);
-        mSimdVectorTRTy  = ArrayType::get(mSimdFP32Ty, 5);
-        mSimdVectorTRIntTy  = ArrayType::get(mSimdInt32Ty, 5);
-    }
-
-    /// @brief Mark this alloca as temporary to avoid hoisting later on
-    void Builder::SetTempAlloca(Value* inst)
-    {
-        AllocaInst* pAlloca = dyn_cast<AllocaInst>(inst);
-        SWR_ASSERT(pAlloca, "Unexpected non-alloca instruction");
-        MDNode* N = MDNode::get(JM()->mContext, MDString::get(JM()->mContext, "is_temp_alloca"));
-        pAlloca->setMetadata("is_temp_alloca", N);
-    }
-
-    bool Builder::IsTempAlloca(Value* inst)
-    {
-        AllocaInst* pAlloca = dyn_cast<AllocaInst>(inst);
-        SWR_ASSERT(pAlloca, "Unexpected non-alloca instruction");
-
-        return (pAlloca->getMetadata("is_temp_alloca") != nullptr);
-    }
-
-    // Returns true if able to find a call instruction to mark
-    bool Builder::SetNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName)
-    {
-        CallInst* pCallInstr = dyn_cast<CallInst>(inst);
-        if (pCallInstr)
-        {
-            MDNode* N = MDNode::get(JM()->mContext, MDString::get(JM()->mContext, mdName));
-            pCallInstr->setMetadata(mdName, N);
-            return true;
-        }
-        else
-        {
-            // Follow use def chain back up
-            for (Use& u : inst->operands())
-            {
-                Instruction* srcInst = dyn_cast<Instruction>(u.get());
-                if (srcInst)
-                {
-                    if (SetNamedMetaDataOnCallInstr(srcInst, mdName))
-                    {
-                        return true;
-                    }
-                }
-            }
-        }
-
-        return false;
-    }
-
-    bool Builder::HasNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName)
-    {
-        CallInst* pCallInstr = dyn_cast<CallInst>(inst);
-
-        if (!pCallInstr)
-        {
-            return false;
-        }
-
-        return (pCallInstr->getMetadata(mdName) != nullptr);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Packetizes the type. Assumes SOA conversion.
-    Type* Builder::GetVectorType(Type* pType)
-    {
-        if (pType->isVectorTy())
-        {
-            return pType;
-        }
-
-        // [N x float] should packetize to [N x <8 x float>]
-        if (pType->isArrayTy())
-        {
-            uint32_t arraySize     = pType->getArrayNumElements();
-            Type*    pArrayType    = pType->getArrayElementType();
-            Type*    pVecArrayType = GetVectorType(pArrayType);
-            Type*    pVecType      = ArrayType::get(pVecArrayType, arraySize);
-            return pVecType;
-        }
-
-        // {float,int} should packetize to {<8 x float>, <8 x int>}
-        if (pType->isAggregateType())
-        {
-            uint32_t              numElems = pType->getStructNumElements();
-            SmallVector<Type*, 8> vecTypes;
-            for (uint32_t i = 0; i < numElems; ++i)
-            {
-                Type* pElemType    = pType->getStructElementType(i);
-                Type* pVecElemType = GetVectorType(pElemType);
-                vecTypes.push_back(pVecElemType);
-            }
-            Type* pVecType = StructType::get(JM()->mContext, vecTypes);
-            return pVecType;
-        }
-
-        // [N x float]* should packetize to [N x <8 x float>]*
-        if (pType->isPointerTy() && pType->getPointerElementType()->isArrayTy())
-        {
-            return PointerType::get(GetVectorType(pType->getPointerElementType()),
-                                    pType->getPointerAddressSpace());
-        }
-
-        // <ty> should packetize to <8 x <ty>>
-        Type* vecType = getVectorType(pType, JM()->mVWidth);
-        return vecType;
-    }
-} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
deleted file mode 100644
index 9f2c199464d..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ /dev/null
@@ -1,181 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file builder.h
- *
- * @brief Includes all the builder related functionality
- *
- * Notes:
- *
- ******************************************************************************/
-#pragma once
-
-#include "JitManager.h"
-#include "common/formats.h"
-
-namespace SwrJit
-{
-    ///@todo Move this to better place
-    enum SHADER_STATS_COUNTER_TYPE
-    {
-        STATS_INST_EXECUTED           = 0,
-        STATS_SAMPLE_EXECUTED         = 1,
-        STATS_SAMPLE_L_EXECUTED       = 2,
-        STATS_SAMPLE_B_EXECUTED       = 3,
-        STATS_SAMPLE_C_EXECUTED       = 4,
-        STATS_SAMPLE_C_LZ_EXECUTED    = 5,
-        STATS_SAMPLE_C_D_EXECUTED     = 6,
-        STATS_LOD_EXECUTED            = 7,
-        STATS_GATHER4_EXECUTED        = 8,
-        STATS_GATHER4_C_EXECUTED      = 9,
-        STATS_GATHER4_C_PO_EXECUTED   = 10,
-        STATS_GATHER4_C_PO_C_EXECUTED = 11,
-        STATS_LOAD_RAW_UAV            = 12,
-        STATS_LOAD_RAW_RESOURCE       = 13,
-        STATS_STORE_RAW_UAV           = 14,
-        STATS_STORE_TGSM              = 15,
-        STATS_DISCARD                 = 16,
-        STATS_BARRIER                 = 17,
-
-        // ------------------
-        STATS_TOTAL_COUNTERS
-    };
-
-    using namespace llvm;
-    struct Builder
-    {
-        Builder(JitManager* pJitMgr);
-        virtual ~Builder() {}
-
-        IRBuilder<>* IRB() { return mpIRBuilder; };
-        JitManager*  JM() { return mpJitMgr; }
-
-        JitManager*  mpJitMgr;
-        IRBuilder<>* mpIRBuilder;
-
-        uint32_t mVWidth;   // vector width target simd
-        uint32_t mVWidth16; // vector width simd16
-
-        // Built in types: scalar
-
-        Type* mVoidTy;
-        Type* mHandleTy;
-        Type* mInt1Ty;
-        Type* mInt8Ty;
-        Type* mInt16Ty;
-        Type* mInt32Ty;
-        Type* mInt64Ty;
-        Type* mIntPtrTy;
-        Type* mFP16Ty;
-        Type* mFP32Ty;
-        Type* mFP32PtrTy;
-        Type* mDoubleTy;
-        Type* mInt8PtrTy;
-        Type* mInt16PtrTy;
-        Type* mInt32PtrTy;
-        Type* mInt64PtrTy;
-
-        Type* mSimd4FP64Ty;
-
-        // Built in types: target SIMD
-
-        Type* mSimdFP16Ty;
-        Type* mSimdFP32Ty;
-        Type* mSimdInt1Ty;
-        Type* mSimdInt16Ty;
-        Type* mSimdInt32Ty;
-        Type* mSimdInt64Ty;
-        Type* mSimdIntPtrTy;
-        Type* mSimdVectorTy;
-        Type* mSimdVectorTRTy;
-        Type* mSimdVectorIntTy;
-        Type* mSimdVectorTRIntTy;
-
-        // Built in types: simd16
-
-        Type* mSimd16FP16Ty;
-        Type* mSimd16FP32Ty;
-        Type* mSimd16Int1Ty;
-        Type* mSimd16Int16Ty;
-        Type* mSimd16Int32Ty;
-        Type* mSimd16Int64Ty;
-        Type* mSimd16IntPtrTy;
-        Type* mSimd16VectorTy;
-        Type* mSimd16VectorTRTy;
-
-        Type* mSimd32Int8Ty;
-
-        void  SetTargetWidth(uint32_t width);
-        void  SetTempAlloca(Value* inst);
-        bool  IsTempAlloca(Value* inst);
-        bool  SetNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName);
-        bool  HasNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName);
-        Type* GetVectorType(Type* pType);
-        void  SetMetadata(StringRef s, uint32_t val)
-        {
-            llvm::NamedMDNode* metaData = mpJitMgr->mpCurrentModule->getOrInsertNamedMetadata(s);
-            Constant*          cval     = mpIRBuilder->getInt32(val);
-            llvm::MDNode*      mdNode   = llvm::MDNode::get(mpJitMgr->mpCurrentModule->getContext(),
-                                                     llvm::ConstantAsMetadata::get(cval));
-            if (metaData->getNumOperands())
-            {
-                metaData->setOperand(0, mdNode);
-            }
-            else
-            {
-                metaData->addOperand(mdNode);
-            }
-        }
-        uint32_t GetMetadata(StringRef s)
-        {
-            NamedMDNode* metaData = mpJitMgr->mpCurrentModule->getNamedMetadata(s);
-            if (metaData)
-            {
-                MDNode*   mdNode = metaData->getOperand(0);
-                Metadata* val    = mdNode->getOperand(0);
-                return mdconst::dyn_extract<ConstantInt>(val)->getZExtValue();
-            }
-            else
-            {
-                return 0;
-            }
-        }
-
-#include "gen_builder.hpp"
-#include "gen_builder_meta.hpp"
-#include "gen_builder_intrin.hpp"
-#include "builder_misc.h"
-#include "builder_math.h"
-#include "builder_mem.h"
-
-        void SetPrivateContext(Value* pPrivateContext)
-        {
-            mpPrivateContext = pPrivateContext;
-            NotifyPrivateContextSet();
-        }
-        virtual void  NotifyPrivateContextSet() {}
-        inline Value* GetPrivateContext() { return mpPrivateContext; }
-
-    private:
-        Value* mpPrivateContext;
-    };
-} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
deleted file mode 100644
index b67ffbfa7aa..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
+++ /dev/null
@@ -1,396 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file builder_gfx_mem.cpp
- *
- * @brief Definition of the gfx mem builder
- *
- * Notes:
- *
- ******************************************************************************/
-#include "jit_pch.hpp"
-#include "builder.h"
-#include "common/rdtsc_buckets.h"
-#include "builder_gfx_mem.h"
-
-namespace SwrJit
-{
-    using namespace llvm;
-
-    /// Construct on top of the base Builder; every translation, tracking and
-    /// worker-data handle starts out null.
-    BuilderGfxMem::BuilderGfxMem(JitManager* pJitMgr) :
-        Builder(pJitMgr),
-        mpWorkerData(nullptr),
-        mpTranslationFuncTy(nullptr),
-        mpfnTranslateGfxAddressForRead(nullptr),
-        mpfnTranslateGfxAddressForWrite(nullptr),
-        mpParamSimDC(nullptr),
-        mpfnTrackMemAccess(nullptr)
-    {
-    }
-
-    /// Override of the private-context hook; intentionally empty.
-    void BuilderGfxMem::NotifyPrivateContextSet() {}
-
-    /// Debug check: asserts when `ptr` has type mInt64Ty while `usage` is
-    /// MEM_CLIENT_INTERNAL.
-    /// @param ptr - address value about to be accessed
-    /// @param usage - memory client tag supplied by the caller
-    void BuilderGfxMem::AssertGFXMemoryParams(Value* ptr, MEM_CLIENT usage)
-    {
-        SWR_ASSERT(!(ptr->getType() == mInt64Ty && usage == MEM_CLIENT::MEM_CLIENT_INTERNAL),
-                   "Internal memory should not be gfxptr_t.");
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Masked gather that accepts a base address given either as a
-    /// pointer or as a 64-bit integer.  An integer base is cast to Int8* and
-    /// the gather is then delegated to Builder::GATHERPS
-    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
-    /// @param pBase - base VB address, Int8* or 64-bit integer
-    /// @param vIndices - SIMD wide value of VB byte offsets
-    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
-    /// @param scale - value to scale indices by
-    /// @param usage - memory client tag; not forwarded to the base implementation
-    Value* BuilderGfxMem::GATHERPS(Value*     vSrc,
-                                   Value*     pBase,
-                                   Value*     vIndices,
-                                   Value*     vMask,
-                                   uint8_t    scale,
-                                   MEM_CLIENT usage)
-    {
-        // Recover an Int8* when the base arrives as a 64-bit integer address.
-        const bool baseIsInt64 = (pBase->getType() == mInt64Ty);
-        if (baseIsInt64)
-        {
-            pBase = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0));
-        }
-
-        return Builder::GATHERPS(vSrc, pBase, vIndices, vMask, scale);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Masked gather that accepts a base address given either as a
-    /// pointer or as a 64-bit integer.  An integer base is cast to Int8* and
-    /// the gather is then delegated to Builder::GATHERDD
-    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
-    /// @param pBase - base VB address, Int8* or 64-bit integer
-    /// @param vIndices - SIMD wide value of VB byte offsets
-    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
-    /// @param scale - value to scale indices by
-    /// @param usage - memory client tag; not forwarded to the base implementation
-    Value* BuilderGfxMem::GATHERDD(Value*     vSrc,
-                                   Value*     pBase,
-                                   Value*     vIndices,
-                                   Value*     vMask,
-                                   uint8_t    scale,
-                                   MEM_CLIENT usage)
-    {
-        // Recover an Int8* when the base arrives as a 64-bit integer address.
-        const bool baseIsInt64 = (pBase->getType() == mInt64Ty);
-        if (baseIsInt64)
-        {
-            pBase = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0));
-        }
-
-        return Builder::GATHERDD(vSrc, pBase, vIndices, vMask, scale);
-    }
-
-    /// Scatter that accepts a destination given either as a pointer or as a
-    /// 64-bit integer.  An integer destination is cast to Int8*, the source is
-    /// bitcast to mSimdFP32Ty, and the store is delegated to Builder::SCATTERPS.
-    void BuilderGfxMem::SCATTERPS(
-        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, MEM_CLIENT usage)
-    {
-        const bool dstIsInt64 = (pDst->getType() == mInt64Ty);
-        if (dstIsInt64)
-        {
-            pDst = INT_TO_PTR(pDst, PointerType::get(mInt8Ty, 0));
-        }
-
-        Builder::SCATTERPS(pDst, BITCAST(vSrc, mSimdFP32Ty), vOffsets, vMask, usage);
-    }
-
-    /// Advance `base` by `offset` using an integer ADD (the base class uses a
-    /// GEP instead).
-    Value* BuilderGfxMem::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
-    {
-        Value* pNext = ADD(base, offset);
-        return pNext;
-    }
-
-    /// Single-index GEP.  A pointer-typed `Ptr` goes straight to Builder::GEP.
-    /// A 64-bit integer `Ptr` is cast to `Ty`, indexed, converted back to a
-    /// 64-bit integer and passed through TranslationHelper with the read or
-    /// write translation handle selected by `isReadOnly`.
-    Value* BuilderGfxMem::GEP(Value* Ptr, Value* Idx, Type* Ty, bool isReadOnly, const Twine& Name)
-    {
-        if (Ptr->getType() != mInt64Ty)
-        {
-            return Builder::GEP(Ptr, Idx, nullptr, isReadOnly, Name);
-        }
-
-        Value* pTyped   = INT_TO_PTR(Ptr, Ty);
-        Value* pElement = Builder::GEP(pTyped, Idx, nullptr, isReadOnly, Name);
-        Value* xpAddr   = PTR_TO_INT(pElement, mInt64Ty);
-
-        Value* pfnTranslate =
-            isReadOnly ? mpfnTranslateGfxAddressForRead : mpfnTranslateGfxAddressForWrite;
-        return TranslationHelper(xpAddr, Ty, pfnTranslate);
-    }
-
-    /// Typed single-index GEP.  A pointer-typed `Ptr` goes straight to
-    /// Builder::GEP.  A 64-bit integer `Ptr` is cast to `Ty`, indexed,
-    /// converted back to a 64-bit integer and passed through
-    /// TranslationHelper with the read translation handle.
-    Value* BuilderGfxMem::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
-    {
-        if (Ptr->getType() != mInt64Ty)
-        {
-            return Builder::GEP(Ty, Ptr, Idx, Name);
-        }
-
-        Value* pTyped   = INT_TO_PTR(Ptr, Ty);
-        Value* pElement = Builder::GEP(Ty, pTyped, Idx, Name);
-        Value* xpAddr   = PTR_TO_INT(pElement, mInt64Ty);
-        return TranslationHelper(xpAddr, Ty, mpfnTranslateGfxAddressForRead);
-    }
-
-    /// GEP over a list of Value* indices.  A pointer-typed `Ptr` goes straight
-    /// to Builder::GEP.  A 64-bit integer `Ptr` is cast to `Ty`, indexed,
-    /// converted back to a 64-bit integer and passed through
-    /// TranslationHelper with the read translation handle.
-    Value* BuilderGfxMem::GEP(Value* Ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
-    {
-        if (Ptr->getType() != mInt64Ty)
-        {
-            return Builder::GEP(Ptr, indexList);
-        }
-
-        Value* pTyped   = INT_TO_PTR(Ptr, Ty);
-        Value* pElement = Builder::GEP(pTyped, indexList);
-        Value* xpAddr   = PTR_TO_INT(pElement, mInt64Ty);
-        return TranslationHelper(xpAddr, Ty, mpfnTranslateGfxAddressForRead);
-    }
-
-    /// GEP over a list of constant uint32_t indices.  A pointer-typed `Ptr`
-    /// goes straight to Builder::GEP.  A 64-bit integer `Ptr` is cast to `Ty`,
-    /// indexed, converted back to a 64-bit integer and passed through
-    /// TranslationHelper with the read translation handle.
-    Value*
-    BuilderGfxMem::GEP(Value* Ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
-    {
-        if (Ptr->getType() != mInt64Ty)
-        {
-            return Builder::GEP(Ptr, indexList);
-        }
-
-        Value* pTyped   = INT_TO_PTR(Ptr, Ty);
-        Value* pElement = Builder::GEP(pTyped, indexList);
-        Value* xpAddr   = PTR_TO_INT(pElement, mInt64Ty);
-        return TranslationHelper(xpAddr, Ty, mpfnTranslateGfxAddressForRead);
-    }
-
-    /// Turn a 64-bit integer address into a pointer of type `Ty`; any other
-    /// input is returned unchanged.  `pfnTranslateGfxAddress` is not used by
-    /// this implementation.
-    Value* BuilderGfxMem::TranslationHelper(Value* Ptr, Type* Ty, Value* pfnTranslateGfxAddress)
-    {
-        SWR_ASSERT(!(Ptr->getType() == mInt64Ty && Ty == nullptr),
-                   "Access of GFX pointers must have non-null type specified.");
-
-        // Already a pointer: nothing to convert.
-        if (Ptr->getType() != mInt64Ty)
-        {
-            return Ptr;
-        }
-
-        return INT_TO_PTR(Ptr, Ty);
-    }
-
-    /// Emit a call to the memory-access tracking function, when one is set.
-    /// Compiles to an empty function unless KNOB_ENABLE_AR is defined, and
-    /// returns early when KNOB_AR_ENABLE_MEMORY_EVENTS is false.
-    /// @param Ptr - accessed address, pointer or 64-bit integer
-    /// @param Ty - type used to size the access when Ptr is a 64-bit integer
-    /// @param usage - memory client tag passed on to the tracker
-    /// @param isRead - true for loads, false for stores
-    void BuilderGfxMem::TrackerHelper(Value* Ptr, Type* Ty, MEM_CLIENT usage, bool isRead)
-    {
-#if defined(KNOB_ENABLE_AR)
-        if (!KNOB_AR_ENABLE_MEMORY_EVENTS)
-        {
-            return;
-        }
-
-        const bool isGfxAddr = (Ptr->getType() == mInt64Ty);
-
-        // NOTE(review): for non-integer Ptr the size is taken from Ptr's own type
-        // rather than from what it points at - confirm this is intended.
-        DataLayout dataLayout(JM()->mpCurrentModule);
-        Type*      pSizedTy = isGfxAddr ? Ty : Ptr->getType();
-        uint32_t   size     = (uint32_t)dataLayout.getTypeAllocSize(pSizedTy);
-
-        // The tracker takes the address as a 64-bit integer.
-        Value* tmpPtr = isGfxAddr ? Ptr : PTR_TO_INT(Ptr, mInt64Ty);
-
-        // There are some shader compile setups where there's no translation functions set up.
-        // This would be a situation where the accesses are to internal rasterizer memory and won't
-        // be logged.
-        // TODO:  we may wish to revisit this for URB reads/writes, though.
-        if (mpfnTrackMemAccess)
-        {
-            SWR_ASSERT(mpWorkerData != nullptr);
-            CALL(mpfnTrackMemAccess,
-                 {mpParamSimDC,
-                  mpWorkerData,
-                  tmpPtr,
-                  C((uint32_t)size),
-                  C((uint8_t)isRead),
-                  C((uint32_t)usage)});
-        }
-#endif
-    }
-
-    /// Load (C-string name): validate the parameters, report the read to the
-    /// tracker, convert the address via TranslationHelper, then delegate to
-    /// Builder::LOAD.
-    LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const char* Name, Type* Ty, MEM_CLIENT usage)
-    {
-        AssertGFXMemoryParams(Ptr, usage);
-        TrackerHelper(Ptr, Ty, usage, true);
-
-        Value* pTranslated = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead);
-        return Builder::LOAD(pTranslated, Name);
-    }
-
-    /// Load (Twine name): validate the parameters, report the read to the
-    /// tracker, convert the address via TranslationHelper, then delegate to
-    /// Builder::LOAD.
-    LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const Twine& Name, Type* Ty, MEM_CLIENT usage)
-    {
-        AssertGFXMemoryParams(Ptr, usage);
-        TrackerHelper(Ptr, Ty, usage, true);
-
-        Value* pTranslated = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead);
-        return Builder::LOAD(pTranslated, Name);
-    }
-
-    /// Load with explicit volatility: validate the parameters, report the read
-    /// to the tracker, convert the address via TranslationHelper, then delegate
-    /// to Builder::LOAD.
-    LoadInst* BuilderGfxMem::LOAD(
-        Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, MEM_CLIENT usage)
-    {
-        AssertGFXMemoryParams(Ptr, usage);
-        TrackerHelper(Ptr, Ty, usage, true);
-
-        Value* pTranslated = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead);
-        return Builder::LOAD(pTranslated, isVolatile, Name);
-    }
-
-    /// Load from `BasePtr` indexed by a list of constant offsets.  A 64-bit
-    /// integer base is cast to `Ty` before indexing and converted back to a
-    /// 64-bit integer afterwards, so the single-address LOAD overload performs
-    /// the tracking and translation.
-    LoadInst* BuilderGfxMem::LOAD(Value*                                 BasePtr,
-                                  const std::initializer_list<uint32_t>& offset,
-                                  const llvm::Twine&                     name,
-                                  Type*                                  Ty,
-                                  MEM_CLIENT                             usage)
-    {
-        AssertGFXMemoryParams(BasePtr, usage);
-
-        const bool bNeedTranslation = (BasePtr->getType() == mInt64Ty);
-        if (bNeedTranslation)
-        {
-            SWR_ASSERT(Ty);
-            BasePtr = INT_TO_PTR(BasePtr, Ty, name);
-        }
-
-        // Materialize each constant offset as an IR constant for the GEP.
-        std::vector<Value*> valIndices;
-        for (uint32_t index : offset)
-        {
-            valIndices.push_back(C(index));
-        }
-
-        Value* pElement = Builder::GEPA(BasePtr, valIndices, name);
-        if (bNeedTranslation)
-        {
-            pElement = PTR_TO_INT(pElement, mInt64Ty, name);
-        }
-
-        return LOAD(pElement, name, Ty, usage);
-    }
-
-    /// Masked load: validate the parameters, report the read to the tracker,
-    /// convert the address via TranslationHelper, then delegate to
-    /// Builder::MASKED_LOAD.
-    CallInst* BuilderGfxMem::MASKED_LOAD(Value*       Ptr,
-                                         unsigned     Align,
-                                         Value*       Mask,
-                                         Value*       PassThru,
-                                         const Twine& Name,
-                                         Type*        Ty,
-                                         MEM_CLIENT   usage)
-    {
-        AssertGFXMemoryParams(Ptr, usage);
-        TrackerHelper(Ptr, Ty, usage, true);
-
-        Value* pTranslated = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForRead);
-        return Builder::MASKED_LOAD(pTranslated, Align, Mask, PassThru, Name, Ty, usage);
-    }
-
-    /// Store: validate the parameters, report the write to the tracker,
-    /// convert the address via TranslationHelper, then delegate to
-    /// Builder::STORE.
-    /// @param Val - value to store
-    /// @param Ptr - destination, pointer or 64-bit integer address
-    /// @param isVolatile - forwarded to Builder::STORE
-    /// @param Ty - pointer type used when Ptr is a 64-bit integer
-    /// @param usage - memory client tag
-    StoreInst*
-    BuilderGfxMem::STORE(Value* Val, Value* Ptr, bool isVolatile, Type* Ty, MEM_CLIENT usage)
-    {
-        AssertGFXMemoryParams(Ptr, usage);
-        TrackerHelper(Ptr, Ty, usage, false);
-
-        // This is a write, so pass the write translation handle (as GEP does
-        // for non-read-only accesses) rather than the read one.
-        Ptr = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForWrite);
-        return Builder::STORE(Val, Ptr, isVolatile, Ty, usage);
-    }
-
-    /// Store to `BasePtr` indexed by a list of constant offsets: validate the
-    /// parameters, report the write to the tracker, convert the base address
-    /// via TranslationHelper, then delegate to Builder::STORE.
-    /// @param Val - value to store
-    /// @param BasePtr - base address, pointer or 64-bit integer
-    /// @param offset - constant indices applied by the base implementation
-    /// @param Ty - pointer type used when BasePtr is a 64-bit integer
-    /// @param usage - memory client tag
-    StoreInst* BuilderGfxMem::STORE(Value*                                 Val,
-                                    Value*                                 BasePtr,
-                                    const std::initializer_list<uint32_t>& offset,
-                                    Type*                                  Ty,
-                                    MEM_CLIENT                             usage)
-    {
-        AssertGFXMemoryParams(BasePtr, usage);
-        TrackerHelper(BasePtr, Ty, usage, false);
-
-        // This is a write, so pass the write translation handle (as GEP does
-        // for non-read-only accesses) rather than the read one.
-        BasePtr = TranslationHelper(BasePtr, Ty, mpfnTranslateGfxAddressForWrite);
-        return Builder::STORE(Val, BasePtr, offset, Ty, usage);
-    }
-
-    /// Masked store: validate the parameters, report the write to the tracker,
-    /// convert the address via TranslationHelper, then delegate to
-    /// Builder::MASKED_STORE.
-    /// @param Val - value to store
-    /// @param Ptr - destination, pointer or 64-bit integer address
-    /// @param Align - alignment forwarded to the base implementation
-    /// @param Mask - lane mask forwarded to the base implementation
-    /// @param Ty - pointer type used when Ptr is a 64-bit integer
-    /// @param usage - memory client tag
-    CallInst* BuilderGfxMem::MASKED_STORE(
-        Value* Val, Value* Ptr, unsigned Align, Value* Mask, Type* Ty, MEM_CLIENT usage)
-    {
-        AssertGFXMemoryParams(Ptr, usage);
-
-        TrackerHelper(Ptr, Ty, usage, false);
-
-        // This is a write, so pass the write translation handle (as GEP does
-        // for non-read-only accesses) rather than the read one.
-        Ptr = TranslationHelper(Ptr, Ty, mpfnTranslateGfxAddressForWrite);
-        return Builder::MASKED_STORE(Val, Ptr, Align, Mask, Ty, usage);
-    }
-
-    /// Cast a gfx address to a pointer for reading.
-    /// @param xpGfxAddress - address value to convert
-    /// @param PtrTy - destination pointer type; mInt8PtrTy is used when null
-    /// @param Name - name given to the resulting value
-    Value* BuilderGfxMem::TranslateGfxAddressForRead(Value*       xpGfxAddress,
-                                                     Type*        PtrTy,
-                                                     const Twine& Name,
-                                                     MEM_CLIENT /* usage */)
-    {
-        Type* pDstTy = PtrTy;
-        if (!pDstTy)
-        {
-            pDstTy = mInt8PtrTy;
-        }
-        return INT_TO_PTR(xpGfxAddress, pDstTy, Name);
-    }
-
-    /// Cast a gfx address to a pointer for writing.
-    /// @param xpGfxAddress - address value to convert
-    /// @param PtrTy - destination pointer type; mInt8PtrTy is used when null
-    /// @param Name - name given to the resulting value
-    Value* BuilderGfxMem::TranslateGfxAddressForWrite(Value*       xpGfxAddress,
-                                                      Type*        PtrTy,
-                                                      const Twine& Name,
-                                                      MEM_CLIENT /* usage */)
-    {
-        Type* pDstTy = PtrTy;
-        if (!pDstTy)
-        {
-            pDstTy = mInt8PtrTy;
-        }
-        return INT_TO_PTR(xpGfxAddress, pDstTy, Name);
-    }
-
-} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
deleted file mode 100644
index c361959b76f..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file builder_gfx_mem.h
- *
- * @brief Definition of the builder to support different translation types for gfx memory access
- *
- * Notes:
- *
- ******************************************************************************/
-#pragma once
-
-#include "builder.h"
-
-namespace SwrJit
-{
-    using namespace llvm;
-
-    /// Builder variant whose memory operations take an extra type and memory
-    /// client tag and hold handles for gfx address translation and tracking.
-    class BuilderGfxMem : public Builder
-    {
-    public:
-        BuilderGfxMem(JitManager* pJitMgr);
-        virtual ~BuilderGfxMem() {}
-
-        // --- address computation -------------------------------------------
-        virtual Value* GEP(Value*       Ptr,
-                           Value*       Idx,
-                           Type*        Ty         = nullptr,
-                           bool         isReadOnly = true,
-                           const Twine& Name       = "");
-        virtual Value* GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name = "");
-        virtual Value* GEP(Value*                               Ptr,
-                           const std::initializer_list<Value*>& indexList,
-                           Type*                                Ty = nullptr);
-        virtual Value* GEP(Value*                                 Ptr,
-                           const std::initializer_list<uint32_t>& indexList,
-                           Type*                                  Ty = nullptr);
-
-        // --- loads -----------------------------------------------------------
-        virtual LoadInst* LOAD(Value*      Ptr,
-                               const char* Name,
-                               Type*       Ty    = nullptr,
-                               MEM_CLIENT  usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-        virtual LoadInst* LOAD(Value*       Ptr,
-                               const Twine& Name  = "",
-                               Type*        Ty    = nullptr,
-                               MEM_CLIENT   usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-        virtual LoadInst* LOAD(Value*       Ptr,
-                               bool         isVolatile,
-                               const Twine& Name  = "",
-                               Type*        Ty    = nullptr,
-                               MEM_CLIENT   usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-        virtual LoadInst* LOAD(Value*                                 BasePtr,
-                               const std::initializer_list<uint32_t>& offset,
-                               const llvm::Twine&                     Name  = "",
-                               Type*                                  Ty    = nullptr,
-                               MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-        virtual CallInst* MASKED_LOAD(Value*       Ptr,
-                                      unsigned     Align,
-                                      Value*       Mask,
-                                      Value*       PassThru = nullptr,
-                                      const Twine& Name     = "",
-                                      Type*        Ty       = nullptr,
-                                      MEM_CLIENT   usage    = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-        // --- stores ----------------------------------------------------------
-        virtual StoreInst* STORE(Value*     Val,
-                                 Value*     Ptr,
-                                 bool       isVolatile = false,
-                                 Type*      Ty         = nullptr,
-                                 MEM_CLIENT usage      = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-        virtual StoreInst* STORE(Value*                                 Val,
-                                 Value*                                 BasePtr,
-                                 const std::initializer_list<uint32_t>& offset,
-                                 Type*                                  Ty    = nullptr,
-                                 MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-        virtual CallInst* MASKED_STORE(Value*     Val,
-                                       Value*     Ptr,
-                                       unsigned   Align,
-                                       Value*     Mask,
-                                       Type*      Ty    = nullptr,
-                                       MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-        // --- gather / scatter ------------------------------------------------
-        virtual Value* GATHERPS(Value*     src,
-                                Value*     pBase,
-                                Value*     indices,
-                                Value*     mask,
-                                uint8_t    scale = 1,
-                                MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-        virtual Value* GATHERDD(Value*     src,
-                                Value*     pBase,
-                                Value*     indices,
-                                Value*     mask,
-                                uint8_t    scale = 1,
-                                MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-        virtual void SCATTERPS(Value*     pDst,
-                               Value*     vSrc,
-                               Value*     vOffsets,
-                               Value*     vMask,
-                               MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-        // --- explicit address translation ------------------------------------
-        Value* TranslateGfxAddressForRead(Value*       xpGfxAddress,
-                                          Type*        PtrTy = nullptr,
-                                          const Twine& Name  = "",
-                                          MEM_CLIENT   usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-        Value* TranslateGfxAddressForWrite(Value*       xpGfxAddress,
-                                           Type*        PtrTy = nullptr,
-                                           const Twine& Name  = "",
-                                           MEM_CLIENT   usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-    protected:
-        void AssertGFXMemoryParams(Value* ptr, MEM_CLIENT usage);
-
-        virtual void NotifyPrivateContextSet();
-
-        virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset);
-
-        Value* TranslationHelper(Value* Ptr, Type* Ty, Value* pfnTranslateGfxAddress);
-        void   TrackerHelper(Value* Ptr, Type* Ty, MEM_CLIENT usage, bool isRead);
-
-        // Read-only accessors for the private translation state.
-        FunctionType* GetTranslationFunctionType() { return mpTranslationFuncTy; }
-        Value*        GetTranslationFunctionForRead() { return mpfnTranslateGfxAddressForRead; }
-        Value*        GetTranslationFunctionForWrite() { return mpfnTranslateGfxAddressForWrite; }
-        Value*        GetParamSimDC() { return mpParamSimDC; }
-
-        Value* mpWorkerData;
-
-    private:
-        FunctionType* mpTranslationFuncTy;
-        Value*        mpfnTranslateGfxAddressForRead;
-        Value*        mpfnTranslateGfxAddressForWrite;
-        Value*        mpParamSimDC;
-        Value*        mpfnTrackMemAccess;
-    };
-} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h
deleted file mode 100644
index 02aa6f97cdf..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file builder_math.h
- *
- * @brief math/alu builder functions
- *
- * Notes:
- *
- ******************************************************************************/
-#pragma once
-
-// NOTE(review): this header opens no class or namespace of its own; it appears
-// to be included from inside a class body, which would make these member
-// function declarations - confirm against the including file.
-// NOTE(review): the names suggest log2 / pow / exp2 helpers on packed floats;
-// the semantics are not visible here - confirm against the definitions.
-Value* VLOG2PS(Value* src);
-Value* VPOW24PS(Value* src);
-Value* VEXP2PS(Value* src);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
deleted file mode 100644
index b5eb0a782b1..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ /dev/null
@@ -1,767 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file builder_mem.cpp
- *
- * @brief Implementation for memory builder functions
- *
- * Notes:
- *
- ******************************************************************************/
-#include "jit_pch.hpp"
-#include "builder.h"
-
-#include <cstdarg>
-
-namespace SwrJit
-{
-    /// Debug check: asserts when `ptr` has type mInt64Ty.  `usage` is not
-    /// consulted by this implementation.
-    void Builder::AssertMemoryUsageParams(Value* ptr, MEM_CLIENT usage)
-    {
-        SWR_ASSERT(
-            ptr->getType() != mInt64Ty,
-            "Address appears to be GFX access.  Requires translation through BuilderGfxMem.");
-    }
-
-    /// Single-index GEP; `Ty` and `isReadOnly` are accepted but not used here.
-    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, bool isReadOnly, const Twine& Name)
-    {
-        Value* pElement = IRB()->CreateGEP(Ptr, Idx, Name);
-        return pElement;
-    }
-
-    /// Single-index GEP with an explicit type, forwarded to the IR builder.
-    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
-    {
-        Value* pElement = IRB()->CreateGEP(Ty, Ptr, Idx, Name);
-        return pElement;
-    }
-
-    /// GEP over a list of Value* indices; `Ty` is accepted but not used here.
-    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
-    {
-        // Copy the list into a vector so it can be handed to GEPA.
-        std::vector<Value*> indices(indexList.begin(), indexList.end());
-        return GEPA(ptr, indices);
-    }
-
-    /// GEP over a list of constant uint32_t indices; `Ty` is accepted but not
-    /// used here.
-    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
-    {
-        std::vector<Value*> indices;
-        for (uint32_t index : indexList)
-        {
-            // Each literal becomes an IR constant.
-            indices.push_back(C(index));
-        }
-        return GEPA(ptr, indices);
-    }
-
-    /// Multi-index GEP forwarded to the IR builder.
-    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
-    {
-        Value* pElement = IRB()->CreateGEP(Ptr, IdxList, Name);
-        return pElement;
-    }
-
-    /// Multi-index GEP with an explicit type, forwarded to the IR builder.
-    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
-    {
-        Value* pElement = IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
-        return pElement;
-    }
-
-    /// In-bounds GEP over a list of Value* indices.
-    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
-    {
-        // Copy the list into a vector so the vector-taking overload is selected.
-        std::vector<Value*> indices(indexList.begin(), indexList.end());
-        return IN_BOUNDS_GEP(ptr, indices);
-    }
-
-    /// In-bounds GEP over a list of constant uint32_t indices.
-    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
-    {
-        std::vector<Value*> indices;
-        for (uint32_t index : indexList)
-        {
-            // Each literal becomes an IR constant.
-            indices.push_back(C(index));
-        }
-        return IN_BOUNDS_GEP(ptr, indices);
-    }
-
-    /// Load (C-string name) after checking the address is not a 64-bit
-    /// integer; `Ty` is accepted but not used here.
-    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, MEM_CLIENT usage)
-    {
-        AssertMemoryUsageParams(Ptr, usage);
-
-        LoadInst* pLoad = IRB()->CreateLoad(Ptr, Name);
-        return pLoad;
-    }
-
-    /// Load (Twine name) after checking the address is not a 64-bit integer;
-    /// `Ty` is accepted but not used here.
-    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, MEM_CLIENT usage)
-    {
-        AssertMemoryUsageParams(Ptr, usage);
-
-        LoadInst* pLoad = IRB()->CreateLoad(Ptr, Name);
-        return pLoad;
-    }
-
-    /// Load with an explicit result type after checking the address is not a
-    /// 64-bit integer.
-    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, MEM_CLIENT usage)
-    {
-        AssertMemoryUsageParams(Ptr, usage);
-
-        LoadInst* pLoad = IRB()->CreateLoad(Ty, Ptr, Name);
-        return pLoad;
-    }
-
-    /// Load with explicit volatility after checking the address is not a
-    /// 64-bit integer; `Ty` is accepted but not used here.
-    LoadInst*
-    Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, MEM_CLIENT usage)
-    {
-        AssertMemoryUsageParams(Ptr, usage);
-
-        LoadInst* pLoad = IRB()->CreateLoad(Ptr, isVolatile, Name);
-        return pLoad;
-    }
-
-    /// Load from `basePtr` indexed by a list of constant indices.  `Ty` and
-    /// `usage` are accepted but not forwarded to the inner load.
-    LoadInst* Builder::LOAD(Value*                                 basePtr,
-                            const std::initializer_list<uint32_t>& indices,
-                            const llvm::Twine&                     name,
-                            Type*                                  Ty,
-                            MEM_CLIENT                             usage)
-    {
-        std::vector<Value*> valIndices;
-        for (uint32_t index : indices)
-        {
-            valIndices.push_back(C(index));
-        }
-
-        Value* pElement = GEPA(basePtr, valIndices);
-        return Builder::LOAD(pElement, name);
-    }
-
-    /// Load from `basePtr` indexed by a list of Value* indices.
-    LoadInst* Builder::LOADV(Value*                               basePtr,
-                             const std::initializer_list<Value*>& indices,
-                             const llvm::Twine&                   name)
-    {
-        std::vector<Value*> valIndices(indices.begin(), indices.end());
-
-        Value* pElement = GEPA(basePtr, valIndices);
-        return LOAD(pElement, name);
-    }
-
-    /// Store `val` to `basePtr` indexed by a list of constant indices.  `Ty`
-    /// and `usage` are accepted but not forwarded to the inner store.
-    StoreInst*
-    Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices, Type* Ty, MEM_CLIENT usage)
-    {
-        std::vector<Value*> valIndices;
-        for (uint32_t index : indices)
-        {
-            valIndices.push_back(C(index));
-        }
-
-        Value* pElement = GEPA(basePtr, valIndices);
-        return STORE(val, pElement);
-    }
-
-    /// Store `val` to `basePtr` indexed by a list of Value* indices.
-    StoreInst*
-    Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
-    {
-        std::vector<Value*> valIndices(indices.begin(), indices.end());
-
-        Value* pElement = GEPA(basePtr, valIndices);
-        return STORE(val, pElement);
-    }
-
-    /// Advance `base` by `offset` using a GEP.
-    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
-    {
-        Value* pNext = GEP(base, offset);
-        return pNext;
-    }
-
-    /// Emit a load / add / store sequence on the element of `basePtr` selected
-    /// by `indices`, adding `i32Incr` to the stored value.
-    /// @return the store instruction
-    Value* Builder::MEM_ADD(Value*                                 i32Incr,
-                            Value*                                 basePtr,
-                            const std::initializer_list<uint32_t>& indices,
-                            const llvm::Twine&                     name)
-    {
-        // The address is computed separately for the load and for the store,
-        // matching the emitted instruction sequence of the original code.
-        Value* pLoadAddr = GEP(basePtr, indices);
-        Value* i32Value  = LOAD(pLoadAddr, name);
-        Value* i32Result = ADD(i32Value, i32Incr);
-
-        Value* pStoreAddr = GEP(basePtr, indices);
-        return STORE(i32Result, pStoreAddr);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a masked gather operation in LLVM IR.  If not
-    /// supported on the underlying platform, emulate it with loads
-    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
-    /// @param pBase - Int8* base VB address pointer value
-    /// @param vIndices - SIMD wide value of VB byte offsets
-    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
-    /// @param scale - value to scale indices by
-    /// @param usage - memory client tag, passed to the parameter assertion only
-    Value* Builder::GATHERPS(Value*     vSrc,
-                             Value*     pBase,
-                             Value*     vIndices,
-                             Value*     vMask,
-                             uint8_t    scale,
-                             MEM_CLIENT usage)
-    {
-        AssertMemoryUsageParams(pBase, usage);
-
-        Value* vGather = VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
-        return vGather;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a masked gather operation in LLVM IR.  If not
-    /// supported on the underlying platform, emulate it with loads
-    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
-    /// @param pBase - Int8* base VB address pointer value
-    /// @param vIndices - SIMD wide value of VB byte offsets
-    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
-    /// @param scale - value to scale indices by
-    Value* Builder::GATHERDD(Value*         vSrc,
-                             Value*         pBase,
-                             Value*         vIndices,
-                             Value*         vMask,
-                             uint8_t        scale,
-                             MEM_CLIENT     usage)
-    {
-        AssertMemoryUsageParams(pBase, usage);
-
-        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a masked gather operation in LLVM IR.  If not
-    /// supported on the underlying platform, emulate it with loads
-    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
-    /// @param pBase - Int8* base VB address pointer value
-    /// @param vIndices - SIMD wide value of VB byte offsets
-    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
-    /// @param scale - value to scale indices by
-    Value*
-    Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
-    {
-        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Alternative masked gather where source is a vector of pointers
-    /// @param pVecSrcPtr   - SIMD wide vector of pointers
-    /// @param pVecMask     - SIMD active lanes
-    /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
-    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
-    {
-        return MASKED_GATHER(pVecSrcPtr, AlignType(4), pVecMask, pVecPassthru);
-    }
-
-    void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask)
-    {
-        MASKED_SCATTER(pVecSrc, pVecDstPtr, AlignType(4), pVecMask);
-    }
-
-    void Builder::Gather4(const SWR_FORMAT format,
-                          Value*           pSrcBase,
-                          Value*           byteOffsets,
-                          Value*           mask,
-                          Value*           vGatherComponents[],
-                          bool             bPackedOutput,
-                          MEM_CLIENT       usage)
-    {
-        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
-        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
-        {
-            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
-        }
-        else
-        {
-            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
-        }
-    }
-
-    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
-                            Value*                 pSrcBase,
-                            Value*                 byteOffsets,
-                            Value*                 vMask,
-                            Value*                 vGatherComponents[],
-                            bool                   bPackedOutput,
-                            MEM_CLIENT             usage)
-    {
-        switch (info.bpp / info.numComps)
-        {
-        case 16:
-        {
-            Value* vGatherResult[2];
-
-            // TODO: vGatherMaskedVal
-            Value* vGatherMaskedVal = VIMMED1((float)0);
-
-            // always have at least one component out of x or y to fetch
-
-            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
-            // e.g. result of first 8x32bit integer gather for 16bit components
-            // 256i - 0    1    2    3    4    5    6    7
-            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
-            //
-
-            // if we also have at least one component out of z or w to fetch
-            if (info.numComps > 2)
-            {
-                // offset base to the next components(zw) in the vertex to gather
-                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
-
-                vGatherResult[1] =
-                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
-                // e.g. result of second 8x32bit integer gather for 16bit components
-                // 256i - 0    1    2    3    4    5    6    7
-                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
-                //
-            }
-            else
-            {
-                vGatherResult[1] = vGatherMaskedVal;
-            }
-
-            // Shuffle gathered components into place, each row is a component
-            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
-        }
-        break;
-        case 32:
-        {
-            // apply defaults
-            for (uint32_t i = 0; i < 4; ++i)
-            {
-                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
-            }
-
-            for (uint32_t i = 0; i < info.numComps; i++)
-            {
-                uint32_t swizzleIndex = info.swizzle[i];
-
-                // Gather a SIMD of components
-                vGatherComponents[swizzleIndex] = GATHERPS(
-                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
-
-                // offset base to the next component to gather
-                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
-            }
-        }
-        break;
-        default:
-            SWR_INVALID("Invalid float format");
-            break;
-        }
-    }
-
-    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
-                            Value*                 pSrcBase,
-                            Value*                 byteOffsets,
-                            Value*                 vMask,
-                            Value*                 vGatherComponents[],
-                            bool                   bPackedOutput,
-                            MEM_CLIENT             usage)
-    {
-        switch (info.bpp / info.numComps)
-        {
-        case 8:
-        {
-            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
-            Value* vGatherResult =
-                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
-            // e.g. result of an 8x32bit integer gather for 8bit components
-            // 256i - 0    1    2    3    4    5    6    7
-            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
-
-            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
-        }
-        break;
-        case 16:
-        {
-            Value* vGatherResult[2];
-
-            // TODO: vGatherMaskedVal
-            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
-
-            // always have at least one component out of x or y to fetch
-
-            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
-            // e.g. result of first 8x32bit integer gather for 16bit components
-            // 256i - 0    1    2    3    4    5    6    7
-            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
-            //
-
-            // if we also have at least one component out of z or w to fetch
-            if (info.numComps > 2)
-            {
-                // offset base to the next components(zw) in the vertex to gather
-                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
-
-                vGatherResult[1] =
-                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
-                // e.g. result of second 8x32bit integer gather for 16bit components
-                // 256i - 0    1    2    3    4    5    6    7
-                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
-                //
-            }
-            else
-            {
-                vGatherResult[1] = vGatherMaskedVal;
-            }
-
-            // Shuffle gathered components into place, each row is a component
-            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
-        }
-        break;
-        case 32:
-        {
-            // apply defaults
-            for (uint32_t i = 0; i < 4; ++i)
-            {
-                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
-            }
-
-            for (uint32_t i = 0; i < info.numComps; i++)
-            {
-                uint32_t swizzleIndex = info.swizzle[i];
-
-                // Gather a SIMD of components
-                vGatherComponents[swizzleIndex] = GATHERDD(
-                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
-
-                // offset base to the next component to gather
-                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
-            }
-        }
-        break;
-        default:
-            SWR_INVALID("unsupported format");
-            break;
-        }
-    }
-
-    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
-                                      Value*                 vGatherInput[2],
-                                      Value*                 vGatherOutput[4],
-                                      bool                   bPackedOutput)
-    {
-        // cast types
-        Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
-        Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
-
-        // input could either be float or int vector; do shuffle work in int
-        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
-        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
-
-        if (bPackedOutput)
-        {
-            Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
-                                              mVWidth / 4); // vwidth is units of 32 bits
-
-            // shuffle mask
-            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
-                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
-            Value* vShufResult =
-                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
-            // after pshufb: group components together in each 128bit lane
-            // 256i - 0    1    2    3    4    5    6    7
-            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
-
-            Value* vi128XY =
-                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
-            // after PERMD: move and pack xy components into each 128bit lane
-            // 256i - 0    1    2    3    4    5    6    7
-            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
-
-            // do the same for zw components
-            Value* vi128ZW = nullptr;
-            if (info.numComps > 2)
-            {
-                Value* vShufResult =
-                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
-                vi128ZW =
-                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
-            }
-
-            for (uint32_t i = 0; i < 4; i++)
-            {
-                uint32_t swizzleIndex = info.swizzle[i];
-                // todo: fix for packed
-                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
-                if (i >= info.numComps)
-                {
-                    // set the default component val
-                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
-                    continue;
-                }
-
-                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
-                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
-                // if x or y, use vi128XY permute result, else use vi128ZW
-                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
-
-                // extract packed component 128 bit lanes
-                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
-            }
-        }
-        else
-        {
-            // pshufb masks for each component
-            Value* vConstMask[2];
-            // x/z shuffle mask
-            vConstMask[0] = C<char>({
-                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
-                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
-            });
-
-            // y/w shuffle mask
-            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
-                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
-
-            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
-            // apply defaults
-            for (uint32_t i = 0; i < 4; ++i)
-            {
-                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
-            }
-
-            for (uint32_t i = 0; i < info.numComps; i++)
-            {
-                uint32_t swizzleIndex = info.swizzle[i];
-
-                // select correct constMask for x/z or y/w pshufb
-                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
-                // if x or y, use the first (xy) gather result, else use the second (zw)
-                uint32_t selectedGather = (i < 2) ? 0 : 1;
-
-                vGatherOutput[swizzleIndex] =
-                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
-                                   vConstMask[selectedMask]),
-                            vGatherTy);
-                // after pshufb mask for x channel; z uses the same shuffle from the second gather
-                // 256i - 0    1    2    3    4    5    6    7
-                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
-            }
-        }
-    }
-
-    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
-                                     Value*                 vGatherInput,
-                                     Value*                 vGatherOutput[],
-                                     bool                   bPackedOutput)
-    {
-        // cast types
-        Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
-        Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
-
-        if (bPackedOutput)
-        {
-            Type* v128Ty = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
-                                           mVWidth / 4); // vwidth is units of 32 bits
-                                                         // shuffle mask
-            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
-                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
-            Value* vShufResult =
-                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
-            // after pshufb: group components together in each 128bit lane
-            // 256i - 0    1    2    3    4    5    6    7
-            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
-
-            Value* vi128XY =
-                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
-            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
-            // 256i - 0    1    2    3    4    5    6    7
-            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
-
-            // do the same for zw components
-            Value* vi128ZW = nullptr;
-            if (info.numComps > 2)
-            {
-                vi128ZW =
-                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
-            }
-
-            // sign extend all enabled components. If we have a fill vVertexElements, output to
-            // current simdvertex
-            for (uint32_t i = 0; i < 4; i++)
-            {
-                uint32_t swizzleIndex = info.swizzle[i];
-                // todo: fix for packed
-                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
-                if (i >= info.numComps)
-                {
-                    // set the default component val
-                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
-                    continue;
-                }
-
-                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
-                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
-                // if x or y, use vi128XY permute result, else use vi128ZW
-                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
-
-                // extract packed component 128 bit lanes
-                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
-            }
-        }
-        // else zero extend
-        else
-        {
-            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
-            // apply defaults
-            for (uint32_t i = 0; i < 4; ++i)
-            {
-                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
-            }
-
-            for (uint32_t i = 0; i < info.numComps; i++)
-            {
-                uint32_t swizzleIndex = info.swizzle[i];
-
-                // pshufb masks for each component
-                Value* vConstMask;
-                switch (i)
-                {
-                case 0:
-                    // x shuffle mask
-                    vConstMask =
-                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
-                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
-                    break;
-                case 1:
-                    // y shuffle mask
-                    vConstMask =
-                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
-                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
-                    break;
-                case 2:
-                    // z shuffle mask
-                    vConstMask =
-                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
-                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
-                    break;
-                case 3:
-                    // w shuffle mask
-                    vConstMask =
-                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
-                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
-                    break;
-                default:
-                    vConstMask = nullptr;
-                    break;
-                }
-
-                assert(vConstMask && "Invalid info.numComps value");
-                vGatherOutput[swizzleIndex] =
-                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
-                // after pshufb for x channel
-                // 256i - 0    1    2    3    4    5    6    7
-                //        x000 x000 x000 x000 x000 x000 x000 x000
-            }
-        }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief emulates a scatter operation.
-    /// @param pDst - pointer to destination
-    /// @param vSrc - vector of src data to scatter
-    /// @param vOffsets - vector of byte offsets from pDst
-    /// @param vMask - mask of valid lanes
-    void Builder::SCATTERPS(
-        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, MEM_CLIENT usage)
-    {
-        AssertMemoryUsageParams(pDst, usage);
-#if LLVM_VERSION_MAJOR >= 11
-        SWR_ASSERT(cast<VectorType>(vSrc->getType())->getElementType()->isFloatTy());
-#else
-        SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy());
-#endif
-        VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1));
-        return;
-
-        /* Scatter algorithm
-
-        while(Index = BitScanForward(mask))
-        srcElem = srcVector[Index]
-        offsetElem = offsetVector[Index]
-        *(pDst + offsetElem) = srcElem
-        Update mask (&= ~(1 << Index))
-
-        */
-
-        /*
-
-        // Loop-based reference implementation of the algorithm above, kept for documentation only
-
-        BasicBlock* pCurBB = IRB()->GetInsertBlock();
-        Function*   pFunc  = pCurBB->getParent();
-        Type*       pSrcTy = vSrc->getType()->getVectorElementType();
-
-        // Store vectors on stack
-        if (pScatterStackSrc == nullptr)
-        {
-            // Save off stack allocations and reuse per scatter. Significantly reduces stack
-            // requirements for shaders with a lot of scatters.
-            pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
-            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
-        }
-
-        Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
-        Value* pOffsetsArrayPtr = pScatterStackOffsets;
-        STORE(vSrc, pSrcArrayPtr);
-        STORE(vOffsets, pOffsetsArrayPtr);
-
-        // Cast to pointers for random access
-        pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
-        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
-
-        Value* pMask = VMOVMSK(vMask);
-
-        // Setup loop basic block
-        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
-
-        // compute first set bit
-        Value* pIndex = CTTZ(pMask, C(false));
-
-        Value* pIsUndef = ICMP_EQ(pIndex, C(32));
-
-        // Split current block or create new one if building inline
-        BasicBlock* pPostLoop;
-        if (pCurBB->getTerminator())
-        {
-            pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
-
-            // Remove unconditional jump created by splitBasicBlock
-            pCurBB->getTerminator()->eraseFromParent();
-
-            // Add terminator to end of original block
-            IRB()->SetInsertPoint(pCurBB);
-
-            // Add conditional branch
-            COND_BR(pIsUndef, pPostLoop, pLoop);
-        }
-        else
-        {
-            pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);
-
-            // Add conditional branch
-            COND_BR(pIsUndef, pPostLoop, pLoop);
-        }
-
-        // Add loop basic block contents
-        IRB()->SetInsertPoint(pLoop);
-        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
-        PHINode* pMaskPhi  = PHI(mInt32Ty, 2);
-
-        pIndexPhi->addIncoming(pIndex, pCurBB);
-        pMaskPhi->addIncoming(pMask, pCurBB);
-
-        // Extract elements for this index
-        Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
-        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});
-
-        // GEP to this offset in dst
-        Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
-        pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
-        STORE(pSrcElem, pCurDst);
-
-        // Update the mask
-        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
-
-        // Terminator
-        Value* pNewIndex = CTTZ(pNewMask, C(false));
-
-        pIsUndef = ICMP_EQ(pNewIndex, C(32));
-        COND_BR(pIsUndef, pPostLoop, pLoop);
-
-        // Update phi edges
-        pIndexPhi->addIncoming(pNewIndex, pLoop);
-        pMaskPhi->addIncoming(pNewMask, pLoop);
-
-        // Move builder to beginning of post loop
-        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
-
-        */
-    }
-} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
deleted file mode 100644
index 429d5779a4d..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file builder_mem.h
- *
- * @brief Builder functions for memory access (GEP, load/store, gather/scatter)
- *
- * Notes:
- *
- ******************************************************************************/
-#pragma once
-
-public:
-enum class MEM_CLIENT
-{
-    MEM_CLIENT_INTERNAL,
-    GFX_MEM_CLIENT_FETCH,
-    GFX_MEM_CLIENT_SAMPLER,
-    GFX_MEM_CLIENT_SHADER,
-    GFX_MEM_CLIENT_STREAMOUT,
-    GFX_MEM_CLIENT_URB
-};
-
-protected:
-virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset);
-void           AssertMemoryUsageParams(Value* ptr, MEM_CLIENT usage);
-
-public:
-virtual Value* GEP(Value* Ptr, Value* Idx, Type* Ty = nullptr, bool isReadOnly = true, const Twine& Name = "");
-virtual Value* GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name = "");
-virtual Value* GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty = nullptr);
-virtual Value*
-GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty = nullptr);
-
-Value* GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name = "");
-Value* GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name = "");
-
-Value* IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList);
-Value* IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList);
-
-virtual LoadInst*
-                  LOAD(Value* Ptr, const char* Name, Type* Ty = nullptr, MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-virtual LoadInst* LOAD(Value*         Ptr,
-                       const Twine&   Name  = "",
-                       Type*          Ty    = nullptr,
-                       MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-virtual LoadInst*
-                  LOAD(Type* Ty, Value* Ptr, const Twine& Name = "", MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-virtual LoadInst* LOAD(Value*         Ptr,
-                       bool           isVolatile,
-                       const Twine&   Name  = "",
-                       Type*          Ty    = nullptr,
-                       MEM_CLIENT     usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-virtual LoadInst* LOAD(Value*                                 BasePtr,
-                       const std::initializer_list<uint32_t>& offset,
-                       const llvm::Twine&                     Name  = "",
-                       Type*                                  Ty    = nullptr,
-                       MEM_CLIENT                             usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-virtual CallInst* MASKED_LOAD(Value*         Ptr,
-                              unsigned       Align,
-                              Value*         Mask,
-                              Value*         PassThru = nullptr,
-                              const Twine&   Name     = "",
-                              Type*          Ty       = nullptr,
-                              MEM_CLIENT usage    = MEM_CLIENT::MEM_CLIENT_INTERNAL)
-{
-    return IRB()->CreateMaskedLoad(Ptr, AlignType(Align), Mask, PassThru, Name);
-}
-
-virtual StoreInst* STORE(Value *Val, Value *Ptr, bool isVolatile = false, Type* Ty = nullptr, MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL)
-{
-    return IRB()->CreateStore(Val, Ptr, isVolatile);
-}
-
-virtual StoreInst* STORE(Value* Val, Value* BasePtr, const std::initializer_list<uint32_t>& offset, Type* Ty = nullptr, MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-virtual CallInst* MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask, Type* Ty = nullptr, MEM_CLIENT usage = MEM_CLIENT::MEM_CLIENT_INTERNAL)
-{
-    return IRB()->CreateMaskedStore(Val, Ptr, AlignType(Align), Mask);
-}
-
-LoadInst*  LOADV(Value* BasePtr, const std::initializer_list<Value*>& offset, const llvm::Twine& name = "");
-StoreInst* STOREV(Value* Val, Value* BasePtr, const std::initializer_list<Value*>& offset);
-
-Value* MEM_ADD(Value*                                 i32Incr,
-               Value*                                 basePtr,
-               const std::initializer_list<uint32_t>& indices,
-               const llvm::Twine&                     name = "");
-
-void Gather4(const SWR_FORMAT format,
-             Value*           pSrcBase,
-             Value*           byteOffsets,
-             Value*           mask,
-             Value*           vGatherComponents[],
-             bool             bPackedOutput,
-             MEM_CLIENT       usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-virtual Value* GATHERPS(Value*         src,
-                        Value*         pBase,
-                        Value*         indices,
-                        Value*         mask,
-                        uint8_t        scale = 1,
-                        MEM_CLIENT     usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-void GATHER4PS(const SWR_FORMAT_INFO& info,
-               Value*                 pSrcBase,
-               Value*                 byteOffsets,
-               Value*                 mask,
-               Value*                 vGatherComponents[],
-               bool                   bPackedOutput,
-               MEM_CLIENT             usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-virtual Value* GATHERDD(Value*         src,
-                        Value*         pBase,
-                        Value*         indices,
-                        Value*         mask,
-                        uint8_t        scale = 1,
-                        MEM_CLIENT     usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-void GATHER4DD(const SWR_FORMAT_INFO& info,
-               Value*                 pSrcBase,
-               Value*                 byteOffsets,
-               Value*                 mask,
-               Value*                 vGatherComponents[],
-               bool                   bPackedOutput,
-               MEM_CLIENT             usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-Value* GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
-
-Value* GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru);
-void SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask);
-
-virtual void SCATTERPS(Value*         pDst,
-                       Value*         vSrc,
-                       Value*         vOffsets,
-                       Value*         vMask,
-                       MEM_CLIENT     usage = MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
-void Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
-                        Value*                 vGatherInput,
-                        Value*                 vGatherOutput[],
-                        bool                   bPackedOutput);
-void Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
-                         Value*                 vGatherInput[],
-                         Value*                 vGatherOutput[],
-                         bool                   bPackedOutput);
-
-// Static stack allocations for scatter operations
-Value* pScatterStackSrc{nullptr};
-Value* pScatterStackOffsets{nullptr};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
deleted file mode 100644
index 8080a40a1f9..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ /dev/null
@@ -1,1125 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file builder_misc.cpp
- *
- * @brief Implementation for miscellaneous builder functions
- *
- * Notes:
- *
- ******************************************************************************/
-#include "jit_pch.hpp"
-#include "builder.h"
-#include "common/rdtsc_buckets.h"
-
-#include <cstdarg>
-#include <cstring>
-
-extern "C" void CallPrint(const char* fmt, ...);
-
-namespace SwrJit
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Convert an IEEE 754 32-bit single precision float to a
-    ///        16-bit float with 1 sign bit, 5 exponent bits and 10
-    ///        mantissa bits.
-    ///
-    /// Rounding is essentially round-toward-zero: the result is only bumped
-    /// up when all 13 discarded mantissa bits are set.  Finite values too
-    /// large for a half saturate to the largest finite half, values too
-    /// small for a half denormal become zero, NaN becomes exponent 0x1F with
-    /// mantissa 0x200 and the sign bit set, and infinities stay infinities.
-    /// @param val - 32-bit float
-    /// @return the 16-bit float bit pattern
-    /// @todo Maybe move this outside of this file into a header?
-    static uint16_t ConvertFloat32ToFloat16(float val)
-    {
-        // Read the bit pattern through memcpy; dereferencing a casted float
-        // pointer as uint32_t violates the strict aliasing rules.
-        uint32_t uf = 0;
-        static_assert(sizeof(uf) == sizeof(val), "float is expected to be 32 bits wide");
-        std::memcpy(&uf, &val, sizeof(uf));
-
-        // Extract the sign, exponent, and mantissa
-        uint32_t sign = (uf & 0x80000000) >> 31;
-        uint32_t exp  = (uf & 0x7F800000) >> 23;
-        uint32_t mant = uf & 0x007FFFFF;
-
-        // Check for out of range
-        if (std::isnan(val))
-        {
-            exp  = 0x1F;
-            mant = 0x200;
-            sign = 1; // set the sign bit for NANs
-        }
-        else if (std::isinf(val))
-        {
-            exp  = 0x1f;
-            mant = 0x0;
-        }
-        else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
-        {
-            exp  = 0x1E;
-            mant = 0x3FF;
-        }
-        else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
-        {
-            // Make the implicit leading one explicit, then shift the mantissa
-            // right once for every exponent step below the smallest normal.
-            mant |= 0x00800000;
-            for (; exp <= 0x70; mant >>= 1, exp++)
-                ;
-            exp  = 0;
-            mant = mant >> 13;
-        }
-        else if (exp < 0x66) // Too small to represent -> Zero
-        {
-            exp  = 0;
-            mant = 0;
-        }
-        else
-        {
-            // Saves bits that will be shifted off for rounding
-            const uint32_t roundBits = mant & 0x1FFFu;
-            // convert exponent and mantissa to 16 bit format
-            exp  = exp - 0x70;
-            mant = mant >> 13;
-
-            // Essentially RTZ, but round up if off by only 1 lsb
-            if (roundBits == 0x1FFFu)
-            {
-                mant++;
-                // check for overflow
-                if ((mant & 0xC00u) != 0)
-                    exp++;
-                // make sure only the needed bits are used
-                mant &= 0x3FF;
-
-                // A carry out of the largest finite exponent would encode
-                // infinity; saturate like the "too big" case above instead.
-                if (exp > 0x1E)
-                {
-                    exp  = 0x1E;
-                    mant = 0x3FF;
-                }
-            }
-        }
-
-        uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
-        return (uint16_t)tmpVal;
-    }
-
-    // Scalar constant factories: each overload wraps its argument in an LLVM
-    // constant of the matching integer or float type.
-
-    /// @brief 1-bit integer constant (1 for true, 0 for false).
-    Constant* Builder::C(bool i)
-    {
-        auto* pTy = IRB()->getInt1Ty();
-        return ConstantInt::get(pTy, (i ? 1 : 0));
-    }
-
-    /// @brief 8-bit integer constant from a char.
-    Constant* Builder::C(char i)
-    {
-        auto* pTy = IRB()->getInt8Ty();
-        return ConstantInt::get(pTy, i);
-    }
-
-    /// @brief 8-bit integer constant from a uint8_t.
-    Constant* Builder::C(uint8_t i)
-    {
-        auto* pTy = IRB()->getInt8Ty();
-        return ConstantInt::get(pTy, i);
-    }
-
-    /// @brief 32-bit integer constant from an int.
-    Constant* Builder::C(int i)
-    {
-        auto* pTy = IRB()->getInt32Ty();
-        return ConstantInt::get(pTy, i);
-    }
-
-    /// @brief 64-bit integer constant from an int64_t.
-    Constant* Builder::C(int64_t i)
-    {
-        auto* pTy = IRB()->getInt64Ty();
-        return ConstantInt::get(pTy, i);
-    }
-
-    /// @brief 16-bit integer constant from a uint16_t.
-    Constant* Builder::C(uint16_t i)
-    {
-        return ConstantInt::get(mInt16Ty, i);
-    }
-
-    /// @brief 32-bit integer constant from a uint32_t.
-    Constant* Builder::C(uint32_t i)
-    {
-        auto* pTy = IRB()->getInt32Ty();
-        return ConstantInt::get(pTy, i);
-    }
-
-    /// @brief 64-bit integer constant from a uint64_t.
-    Constant* Builder::C(uint64_t i)
-    {
-        auto* pTy = IRB()->getInt64Ty();
-        return ConstantInt::get(pTy, i);
-    }
-
-    /// @brief 32-bit float constant.
-    Constant* Builder::C(float i)
-    {
-        auto* pTy = IRB()->getFloatTy();
-        return ConstantFP::get(pTy, i);
-    }
-
-    /// @brief Build a 1-bit predicate constant: 1 when pred is true, else 0.
-    Constant* Builder::PRED(bool pred)
-    {
-        auto* pBoolTy = IRB()->getInt1Ty();
-        return ConstantInt::get(pBoolTy, (pred ? 1 : 0));
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Splat a scalar constant across numLanes vector lanes.
-    ///        Hides the ConstantVector::getSplat signature differences
-    ///        between LLVM <= 10, LLVM 11 and LLVM >= 12.
-    /// @param numLanes - number of lanes in the resulting vector
-    /// @param pScalar - constant replicated into every lane
-    static Constant* SplatConstant(uint32_t numLanes, Constant* pScalar)
-    {
-#if LLVM_VERSION_MAJOR <= 10
-        return ConstantVector::getSplat(numLanes, pScalar);
-#elif LLVM_VERSION_MAJOR == 11
-        return ConstantVector::getSplat(ElementCount(numLanes, false), pScalar);
-#else
-        return ConstantVector::getSplat(ElementCount::get(numLanes, false), pScalar);
-#endif
-    }
-
-    // VIMMED1 broadcasts an immediate across mVWidth lanes, VIMMED1_16 across
-    // mVWidth16 lanes.  The element type follows the matching C() overload.
-
-    Value* Builder::VIMMED1(uint64_t i)
-    {
-        return SplatConstant(mVWidth, cast<ConstantInt>(C(i)));
-    }
-
-    Value* Builder::VIMMED1_16(uint64_t i)
-    {
-        return SplatConstant(mVWidth16, cast<ConstantInt>(C(i)));
-    }
-
-    Value* Builder::VIMMED1(int i)
-    {
-        return SplatConstant(mVWidth, cast<ConstantInt>(C(i)));
-    }
-
-    Value* Builder::VIMMED1_16(int i)
-    {
-        return SplatConstant(mVWidth16, cast<ConstantInt>(C(i)));
-    }
-
-    Value* Builder::VIMMED1(uint32_t i)
-    {
-        return SplatConstant(mVWidth, cast<ConstantInt>(C(i)));
-    }
-
-    Value* Builder::VIMMED1_16(uint32_t i)
-    {
-        return SplatConstant(mVWidth16, cast<ConstantInt>(C(i)));
-    }
-
-    Value* Builder::VIMMED1(float i)
-    {
-        return SplatConstant(mVWidth, cast<ConstantFP>(C(i)));
-    }
-
-    Value* Builder::VIMMED1_16(float i)
-    {
-        return SplatConstant(mVWidth16, cast<ConstantFP>(C(i)));
-    }
-
-    Value* Builder::VIMMED1(bool i)
-    {
-        return SplatConstant(mVWidth, cast<ConstantInt>(C(i)));
-    }
-
-    Value* Builder::VIMMED1_16(bool i)
-    {
-        return SplatConstant(mVWidth16, cast<ConstantInt>(C(i)));
-    }
-
-    Value* Builder::VUNDEF_IPTR() { return UndefValue::get(getVectorType(mInt32PtrTy, mVWidth)); }
-
-    Value* Builder::VUNDEF(Type* t) { return UndefValue::get(getVectorType(t, mVWidth)); }
-
-    Value* Builder::VUNDEF_I() { return UndefValue::get(getVectorType(mInt32Ty, mVWidth)); }
-
-    Value* Builder::VUNDEF_I_16() { return UndefValue::get(getVectorType(mInt32Ty, mVWidth16)); }
-
-    Value* Builder::VUNDEF_F() { return UndefValue::get(getVectorType(mFP32Ty, mVWidth)); }
-
-    Value* Builder::VUNDEF_F_16() { return UndefValue::get(getVectorType(mFP32Ty, mVWidth16)); }
-
-    Value* Builder::VUNDEF(Type* ty, uint32_t size)
-    {
-        return UndefValue::get(getVectorType(ty, size));
-    }
-
-    /// @brief Broadcast a scalar to an mVWidth-lane vector.
-    ///        A value that already has vector type is returned unchanged.
-    /// @param src - scalar (or already-vector) value
-    /// @param name - name forwarded to VECTOR_SPLAT
-    Value* Builder::VBROADCAST(Value* src, const llvm::Twine& name)
-    {
-        const bool needsSplat = !src->getType()->isVectorTy();
-        if (needsSplat)
-        {
-            return VECTOR_SPLAT(mVWidth, src, name);
-        }
-
-        // src is already a vector
-        return src;
-    }
-
-    /// @brief Broadcast a scalar to an mVWidth16-lane vector.
-    ///        A value that already has vector type is returned unchanged.
-    /// @param src - scalar (or already-vector) value
-    Value* Builder::VBROADCAST_16(Value* src)
-    {
-        const bool needsSplat = !src->getType()->isVectorTy();
-        if (needsSplat)
-        {
-            return VECTOR_SPLAT(mVWidth16, src);
-        }
-
-        // src is already a vector
-        return src;
-    }
-
-    /// @brief Read back the zero-extended value of a ConstantInt.
-    ///        The 64-bit value is narrowed to the uint32_t return type.
-    /// @param v - value that must be a ConstantInt
-    uint32_t Builder::IMMED(Value* v)
-    {
-        SWR_ASSERT(isa<ConstantInt>(v));
-        return cast<ConstantInt>(v)->getZExtValue();
-    }
-
-    /// @brief Read back the sign-extended value of a ConstantInt.
-    ///        The 64-bit value is narrowed to the int32_t return type.
-    /// @param v - value that must be a ConstantInt
-    int32_t Builder::S_IMMED(Value* v)
-    {
-        SWR_ASSERT(isa<ConstantInt>(v));
-        return cast<ConstantInt>(v)->getSExtValue();
-    }
-
-    /// @brief Emit a call with an initializer list of arguments.
-    /// @param Callee - function to call (cast to Function on LLVM >= 11)
-    /// @param argsList - call arguments
-    /// @param name - name forwarded to CALLA
-    CallInst* Builder::CALL(Value*                               Callee,
-                            const std::initializer_list<Value*>& argsList,
-                            const llvm::Twine&                   name)
-    {
-        // Copy the arguments into a std::vector to hand to CALLA.
-        std::vector<Value*> callArgs(argsList.begin(), argsList.end());
-#if LLVM_VERSION_MAJOR >= 11
-        // see comment to CALLA(Callee) function in the header
-        return CALLA(FunctionCallee(cast<Function>(Callee)), callArgs, name);
-#else
-        return CALLA(Callee, callArgs, name);
-#endif
-    }
-
-    /// @brief Emit a call with a single argument.
-    CallInst* Builder::CALL(Value* Callee, Value* arg)
-    {
-        std::vector<Value*> callArgs{arg};
-#if LLVM_VERSION_MAJOR >= 11
-        // see comment to CALLA(Callee) function in the header
-        return CALLA(FunctionCallee(cast<Function>(Callee)), callArgs);
-#else
-        return CALLA(Callee, callArgs);
-#endif
-    }
-
-    /// @brief Emit a call with two arguments.
-    CallInst* Builder::CALL2(Value* Callee, Value* arg1, Value* arg2)
-    {
-        std::vector<Value*> callArgs{arg1, arg2};
-#if LLVM_VERSION_MAJOR >= 11
-        // see comment to CALLA(Callee) function in the header
-        return CALLA(FunctionCallee(cast<Function>(Callee)), callArgs);
-#else
-        return CALLA(Callee, callArgs);
-#endif
-    }
-
-    /// @brief Emit a call with three arguments.
-    CallInst* Builder::CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3)
-    {
-        std::vector<Value*> callArgs{arg1, arg2, arg3};
-#if LLVM_VERSION_MAJOR >= 11
-        // see comment to CALLA(Callee) function in the header
-        return CALLA(FunctionCallee(cast<Function>(Callee)), callArgs);
-#else
-        return CALLA(Callee, callArgs);
-#endif
-    }
-
-    /// @brief Reciprocal computed as a full divide: 1.0f / va.
-    /// @param va - divisor
-    /// @param name - name forwarded to FDIV
-    Value* Builder::VRCP(Value* va, const llvm::Twine& name)
-    {
-        Value* vOne = VIMMED1(1.0f);
-        return FDIV(vOne, va, name);
-    }
-
-    /// @brief Evaluate vA * vX + vB * vY + vC with two chained FMADDPS calls.
-    Value* Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY)
-    {
-        // inner term first: vA * vX + vC
-        Value* vPartial = FMADDPS(vA, vX, vC);
-        return FMADDPS(vB, vY, vPartial);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief insert a JIT call to CallPrint
-    /// - outputs formatted string to both stdout and VS output window
-    /// - DEBUG builds only
-    /// Usage example:
-    ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
-    ///   where C(lane) creates a constant value to print, and pIndex is the Value*
-    ///   result from a GEP, printing out the pointer to memory
-    ///
-    /// Vector arguments are expanded lane by lane: the single format specifier
-    /// is replicated once per lane and every lane is extracted into its own
-    /// call argument.  Scalar floats printed with %f are extended to double.
-    /// A vector argument whose specifier / element type combination is not
-    /// handled below contributes no call arguments.
-    /// @param printStr - constant string to print, which includes format specifiers
-    /// @param printArgs - initializer list of Value*'s to print to std out
-    /// @return the emitted call to CallPrint
-    CallInst* Builder::PRINT(const std::string&                   printStr,
-                             const std::initializer_list<Value*>& printArgs)
-    {
-        // push the arguments to CallPrint into a vector
-        std::vector<Value*> printCallArgs;
-        // save room for the format string.  we still need to modify it for vectors
-        printCallArgs.resize(1);
-
-        // search through the format string for special processing
-        size_t      pos = 0;
-        std::string tempStr(printStr);
-        pos    = tempStr.find('%', pos);
-        auto v = printArgs.begin();
-
-        while ((pos != std::string::npos) && (v != printArgs.end()))
-        {
-            Value* pArg  = *v;
-            Type*  pType = pArg->getType();
-
-            if (pType->isVectorTy())
-            {
-                Type* pContainedType = pType->getContainedType(0);
-#if LLVM_VERSION_MAJOR >= 12
-                FixedVectorType* pVectorType = cast<FixedVectorType>(pType);
-#elif LLVM_VERSION_MAJOR >= 11
-                VectorType* pVectorType = cast<VectorType>(pType);
-#endif
-                if (toupper(tempStr[pos + 1]) == 'X')
-                {
-                    // rewrite "%x" into "0x%08X " and emit lane 0
-                    tempStr[pos]     = '0';
-                    tempStr[pos + 1] = 'x';
-                    tempStr.insert(pos + 2, "%08X ");
-                    pos += 7;
-
-                    printCallArgs.push_back(VEXTRACT(pArg, C(0)));
-
-                    // remaining lanes get their own "0x%08X " specifier
-                    std::string vectorFormatStr;
-#if LLVM_VERSION_MAJOR >= 11
-                    for (uint32_t i = 1; i < pVectorType->getNumElements(); ++i)
-#else
-                    for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
-#endif
-                    {
-                        vectorFormatStr += "0x%08X ";
-                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
-                    }
-
-                    tempStr.insert(pos, vectorFormatStr);
-                    pos += vectorFormatStr.size();
-                }
-                else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
-                {
-                    // one "%f " is inserted per leading lane; the caller's own
-                    // "%f" serves the last lane
-                    uint32_t i = 0;
-#if LLVM_VERSION_MAJOR >= 11
-                    for (; i < pVectorType->getNumElements() - 1; i++)
-#else
-                    for (; i < pType->getVectorNumElements() - 1; i++)
-#endif
-                    {
-                        tempStr.insert(pos, std::string("%f "));
-                        pos += 3;
-                        printCallArgs.push_back(
-                            FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
-                    }
-                    printCallArgs.push_back(
-                        FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
-                }
-                else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
-                {
-                    // signed lanes: sign-extend each lane to 32 bits
-                    uint32_t i = 0;
-#if LLVM_VERSION_MAJOR >= 11
-                    for (; i < pVectorType->getNumElements() - 1; i++)
-#else
-                    for (; i < pType->getVectorNumElements() - 1; i++)
-#endif
-                    {
-                        tempStr.insert(pos, std::string("%d "));
-                        pos += 3;
-                        printCallArgs.push_back(
-                            S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
-                    }
-                    printCallArgs.push_back(
-                        S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
-                }
-                else if ((tempStr[pos + 1] == 'u') && (pContainedType->isIntegerTy()))
-                {
-                    // unsigned lanes: zero-extend each lane to 32 bits.  The
-                    // inserted specifiers must be "%u " so the leading lanes
-                    // print unsigned like the caller's "%u" for the last lane.
-                    uint32_t i = 0;
-#if LLVM_VERSION_MAJOR >= 11
-                    for (; i < pVectorType->getNumElements() - 1; i++)
-#else
-                    for (; i < pType->getVectorNumElements() - 1; i++)
-#endif
-                    {
-                        tempStr.insert(pos, std::string("%u "));
-                        pos += 3;
-                        printCallArgs.push_back(
-                            Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
-                    }
-                    printCallArgs.push_back(
-                        Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
-                }
-            }
-            else
-            {
-                if (toupper(tempStr[pos + 1]) == 'X')
-                {
-                    // rewrite "%x" into "0x%08x" (the x/X character is kept)
-                    tempStr[pos] = '0';
-                    tempStr.insert(pos + 1, "x%08");
-                    printCallArgs.push_back(pArg);
-                    pos += 3;
-                }
-                // for %f we need to cast float Values to doubles so that they print out correctly
-                else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
-                {
-                    printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
-                    pos++;
-                }
-                else
-                {
-                    printCallArgs.push_back(pArg);
-                }
-            }
-
-            // advance to the next argument
-            v++;
-            pos = tempStr.find('%', ++pos);
-        }
-
-        // create global variable constant string
-        Constant*       constString = ConstantDataArray::getString(JM()->mContext, tempStr, true);
-        GlobalVariable* gvPtr       = new GlobalVariable(
-            constString->getType(), true, GlobalValue::InternalLinkage, constString, "printStr");
-        JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
-
-        // get a pointer to the first character in the constant string array
-        std::vector<Constant*> geplist{C(0), C(0)};
-        Constant* strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr, geplist, false);
-
-        // insert the pointer to the format string in the argument vector
-        printCallArgs[0] = strGEP;
-
-        // get pointer to CallPrint function and insert decl into the module if needed
-        std::vector<Type*> args;
-        args.push_back(PointerType::get(mInt8Ty, 0));
-        FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, true);
-        Function*     callPrintFn =
-#if LLVM_VERSION_MAJOR >= 9
-            cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy).getCallee());
-#else
-            cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
-#endif
-
-        // if we haven't yet added the symbol to the symbol table
-        if ((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
-        {
-            sys::DynamicLibrary::AddSymbol("CallPrint", (void*)&CallPrint);
-        }
-
-        // insert a call to CallPrint
-        return CALLA(callPrintFn, printCallArgs);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Convenience overload of PRINT for a string with no arguments;
-    ///        forwards to the initializer-list overload with an empty list.
-    CallInst* Builder::PRINT(const std::string& printStr)
-    {
-        return PRINT(printStr, {});
-    }
-
-    /// @brief Extract one 8-lane half of x with a shuffle.
-    /// @param x - source vector
-    /// @param imm - 0 selects lanes 0..7, any other value selects lanes 8..15
-    Value* Builder::EXTRACT_16(Value* x, uint32_t imm)
-    {
-        if (imm != 0)
-        {
-            return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15});
-        }
-
-        return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7});
-    }
-
-    /// @brief Concatenate a and b with a shuffle that selects indices 0..15
-    ///        of the (a, b) pair.
-    Value* Builder::JOIN_16(Value* a, Value* b)
-    {
-        Value* vJoined =
-            VSHUFFLE(a, b, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
-        return vJoined;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
-    ///        A lane is set when its 32-bit pattern, read as a signed
-    ///        integer, is negative (i.e. the sign bit is set).
-    Value* Builder::MASK(Value* vmask)
-    {
-        Value* vAsInt = BITCAST(vmask, mSimdInt32Ty);
-        Value* vZero  = VIMMED1(0);
-        return ICMP_SLT(vAsInt, vZero);
-    }
-
-    /// @brief Same conversion as MASK, using the 16-wide SIMD types.
-    Value* Builder::MASK_16(Value* vmask)
-    {
-        Value* vAsInt = BITCAST(vmask, mSimd16Int32Ty);
-        Value* vZero  = VIMMED1_16(0);
-        return ICMP_SLT(vAsInt, vZero);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
-    ///        by sign-extending every lane to mSimdInt32Ty.
-    Value* Builder::VMASK(Value* mask)
-    {
-        return S_EXT(mask, mSimdInt32Ty);
-    }
-
-    /// @brief Same conversion as VMASK, sign-extending to mSimd16Int32Ty.
-    Value* Builder::VMASK_16(Value* mask)
-    {
-        return S_EXT(mask, mSimd16Int32Ty);
-    }
-
-    /// @brief Convert <Nxi1> llvm mask to integer
-    ///
-    /// The mask vector is bitcast to an integer of the same bit width (i8 for
-    /// 8 lanes, i16 for 16 lanes) and then zero-extended to 32 bits.
-    /// @param mask - vector whose element type must be mInt1Ty
-    /// @return 32-bit integer holding the packed mask bits
-    Value* Builder::VMOVMSK(Value* mask)
-    {
-#if LLVM_VERSION_MAJOR >= 11
-#if LLVM_VERSION_MAJOR >= 12
-        FixedVectorType* pVectorType = cast<FixedVectorType>(mask->getType());
-#else
-        VectorType* pVectorType = cast<VectorType>(mask->getType());
-#endif
-        SWR_ASSERT(pVectorType->getElementType() == mInt1Ty);
-        uint32_t numLanes = pVectorType->getNumElements();
-#else
-        SWR_ASSERT(mask->getType()->getVectorElementType() == mInt1Ty);
-        uint32_t numLanes = mask->getType()->getVectorNumElements();
-#endif
-        // Packed mask bits before widening; i8 or i16 depending on lane count.
-        Value* pPackedMask;
-        if (numLanes == 8)
-        {
-            pPackedMask = BITCAST(mask, mInt8Ty);
-        }
-        else if (numLanes == 16)
-        {
-            pPackedMask = BITCAST(mask, mInt16Ty);
-        }
-        else
-        {
-            // Asserting on the bare message string could never fire because a
-            // string literal is always non-null; assert on false instead.
-            SWR_ASSERT(false, "Unsupported vector width");
-            pPackedMask = BITCAST(mask, mInt8Ty);
-        }
-        return Z_EXT(pPackedMask, mInt32Ty);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Emit a byte shuffle (VPSHUFB).  With AVX2 the instruction is
-    /// used directly; otherwise it is emulated with per-byte extracts and
-    /// inserts, which requires b to be a compile-time constant.
-    /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
-    /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
-    /// Byte masks in lower 128 lane of b selects 8 bit values from lower
-    /// 128bits of a, and vice versa for the upper lanes.  If the mask
-    /// value is negative, '0' is inserted.
-    Value* Builder::PSHUFB(Value* a, Value* b)
-    {
-        // use avx2 pshufb instruction if available
-        if (JM()->mArch.AVX2())
-        {
-            return VPSHUFB(a, b);
-        }
-
-        // Emulation path: the mask must be a constant so it can be read here.
-        Constant* cB = dyn_cast<Constant>(b);
-        assert(cB != nullptr);
-        // number of 8 bit elements in b
-#if LLVM_VERSION_MAJOR >= 12
-        uint32_t laneElems = cast<FixedVectorType>(cB->getType())->getNumElements();
-#else
-        uint32_t laneElems = cast<VectorType>(cB->getType())->getNumElements();
-#endif
-        // output vector
-        Value* vResult = UndefValue::get(getVectorType(mInt8Ty, laneElems));
-
-        // each iteration fills one byte of the low half and one of the high half
-        laneElems /= 2;
-        for (uint32_t idx = 0; idx < laneElems; idx++)
-        {
-            ConstantInt* cLowSel  = cast<ConstantInt>(cB->getAggregateElement(idx));
-            ConstantInt* cHighSel = cast<ConstantInt>(cB->getAggregateElement(idx + laneElems));
-
-            // extract values from constant mask
-            char lowSel  = (char)(cLowSel->getSExtValue());
-            char highSel = (char)(cHighSel->getSExtValue());
-
-            // A negative mask byte yields '0'; otherwise bits 3..0 of the mask
-            // byte index into the matching half of a.
-            Value* vLowByte;
-            if (lowSel < 0)
-            {
-                vLowByte = C((char)0);
-            }
-            else
-            {
-                vLowByte = VEXTRACT(a, C((lowSel & 0xF)));
-            }
-
-            Value* vHighByte;
-            if (highSel < 0)
-            {
-                vHighByte = C((char)0);
-            }
-            else
-            {
-                vHighByte = VEXTRACT(a, C((highSel & 0xF) + laneElems));
-            }
-
-            vResult = VINSERT(vResult, vLowByte, idx);
-            vResult = VINSERT(vResult, vHighByte, (idx + laneElems));
-        }
-        return vResult;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Emit the equivalent of VPMOVSXBD (sign extend 8 8bit values to
-    /// 32 bits) in LLVM IR as a shuffle followed by a sign extension.
-    /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
-    /// lower 8 values are used.
-    Value* Builder::PMOVSXBD(Value* a)
-    {
-        // Pick the 8 low elements, then widen each one to 32 bits.
-        Value* vLow8 = VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7}));
-        Type*  pDstTy = getVectorType(mInt32Ty, 8);
-        return S_EXT(vLow8, pDstTy);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Emit the equivalent of VPMOVSXWD (sign extend 8 16bit values to
-    /// 32 bits) in LLVM IR as a shuffle followed by a sign extension.
-    /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
-    Value* Builder::PMOVSXWD(Value* a)
-    {
-        // Pick elements 0..7, then widen each one to 32 bits.
-        Value* vElems = VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7}));
-        Type*  pDstTy = getVectorType(mInt32Ty, 8);
-        return S_EXT(vElems, pDstTy);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief float16 -> float32 conversion (VCVTPH2PS equivalent) expressed
-    /// in LLVM IR as a bitcast to half followed by a float extension.
-    /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
-    /// @param name - name forwarded to FP_EXT
-    Value* Builder::CVTPH2PS(Value* a, const llvm::Twine& name)
-    {
-#if LLVM_VERSION_MAJOR >= 12
-        uint32_t laneCount = cast<FixedVectorType>(a->getType())->getNumElements();
-#elif LLVM_VERSION_MAJOR >= 11
-        uint32_t laneCount = cast<VectorType>(a->getType())->getNumElements();
-#else
-        uint32_t laneCount = a->getType()->getVectorNumElements();
-#endif
-        // Bitcast Nxint16 to Nxhalf
-        Value* vHalf = BITCAST(a, getVectorType(mFP16Ty, laneCount));
-
-        // Widen Nxhalf to Nxfloat
-        return FP_EXT(vHalf, getVectorType(mFP32Ty, laneCount), name);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief float32 -> float16 conversion (VCVTPS2PH).  With F16C the
-    /// instruction is used directly; otherwise every lane is converted by a
-    /// JIT call to the scalar ConvertFloat32ToFloat16 helper.
-    /// @param a - SIMD vector of float32 values (mVWidth lanes are read on
-    /// the fallback path)
-    /// @param rounding - forwarded to VCVTPS2PH; not used by the fallback
-    Value* Builder::CVTPS2PH(Value* a, Value* rounding)
-    {
-        if (JM()->mArch.F16C())
-        {
-            return VCVTPS2PH(a, rounding);
-        }
-
-        // call scalar C function for now
-        FunctionType* pFuncTy   = FunctionType::get(mInt16Ty, mFP32Ty);
-        Function*     pCvtPs2Ph = cast<Function>(
-#if LLVM_VERSION_MAJOR >= 9
-            JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy).getCallee());
-#else
-            JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
-#endif
-
-        // register the helper's address once so the JIT can resolve it
-        if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
-        {
-            sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16",
-                                           (void*)&ConvertFloat32ToFloat16);
-        }
-
-        // convert lane by lane into a fresh int16 vector
-        Value* vHalfs = UndefValue::get(mSimdInt16Ty);
-        for (uint32_t lane = 0; lane < mVWidth; ++lane)
-        {
-            Value* pLaneSrc = VEXTRACT(a, C(lane));
-            Value* pLaneCvt = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pLaneSrc});
-            vHalfs          = VINSERT(vHalfs, pLaneCvt, C(lane));
-        }
-
-        return vHalfs;
-    }
-
-    // Integer min / max built from a compare followed by a select.
-
-    /// @brief Signed maximum: a where a > b, otherwise b.
-    Value* Builder::PMAXSD(Value* a, Value* b)
-    {
-        return SELECT(ICMP_SGT(a, b), a, b);
-    }
-
-    /// @brief Signed minimum: a where a < b, otherwise b.
-    Value* Builder::PMINSD(Value* a, Value* b)
-    {
-        return SELECT(ICMP_SLT(a, b), a, b);
-    }
-
-    /// @brief Unsigned maximum: a where a > b, otherwise b.
-    Value* Builder::PMAXUD(Value* a, Value* b)
-    {
-        return SELECT(ICMP_UGT(a, b), a, b);
-    }
-
-    /// @brief Unsigned minimum: a where a < b, otherwise b.
-    Value* Builder::PMINUD(Value* a, Value* b)
-    {
-        return SELECT(ICMP_ULT(a, b), a, b);
-    }
-
-    /// @brief Create an alloca at the start of pFunc's entry block.
-    ///        The builder's previous insert point is restored afterwards
-    ///        when one was set.
-    /// @param pFunc - function whose entry block receives the alloca
-    /// @param pType - type to allocate
-    Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
-    {
-        auto  prevInsertPt = IRB()->saveIP();
-        auto& entryBlock   = pFunc->getEntryBlock();
-        IRB()->SetInsertPoint(&entryBlock, entryBlock.begin());
-
-        Value* pEntryAlloca = ALLOCA(pType);
-
-        if (prevInsertPt.isSet())
-        {
-            IRB()->restoreIP(prevInsertPt);
-        }
-        return pEntryAlloca;
-    }
-
-    /// @brief Array form of CreateEntryAlloca.
-    /// @param pFunc - function whose entry block receives the alloca
-    /// @param pType - element type to allocate
-    /// @param pArraySize - array size passed through to ALLOCA
-    Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
-    {
-        auto  prevInsertPt = IRB()->saveIP();
-        auto& entryBlock   = pFunc->getEntryBlock();
-        IRB()->SetInsertPoint(&entryBlock, entryBlock.begin());
-
-        Value* pEntryAlloca = ALLOCA(pType, pArraySize);
-
-        if (prevInsertPt.isSet())
-        {
-            IRB()->restoreIP(prevInsertPt);
-        }
-        return pEntryAlloca;
-    }
-
-    /// @brief Float absolute value: clears bit 31 of every lane by masking
-    ///        the integer view with 0x7fffffff.
-    Value* Builder::VABSPS(Value* a)
-    {
-        Value* vBits    = BITCAST(a, mSimdInt32Ty);
-        Value* vCleared = AND(vBits, VIMMED1(0x7fffffff));
-        return BITCAST(vCleared, mSimdFP32Ty);
-    }
-
-    /// @brief Clamp signed integers to [low, high]: first raise values below
-    ///        low, then lower values above high.
-    /// @param name - name given to the final select
-    Value* Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
-    {
-        // lower bound
-        Value* vBelow   = ICMP_SLT(src, low);
-        Value* vRaised  = SELECT(vBelow, low, src);
-
-        // upper bound
-        Value* vAbove = ICMP_SGT(vRaised, high);
-        return SELECT(vAbove, high, vRaised, name);
-    }
-
-    /// @brief Clamp floats to [low, high] using ordered compares and selects.
-    Value* Builder::FCLAMP(Value* src, Value* low, Value* high)
-    {
-        // lower bound
-        Value* vBelow  = FCMP_OLT(src, low);
-        Value* vRaised = SELECT(vBelow, low, src);
-
-        // upper bound
-        Value* vAbove = FCMP_OGT(vRaised, high);
-        return SELECT(vAbove, high, vRaised);
-    }
-
-    /// @brief Clamp floats to immediate bounds using VMAXPS then VMINPS.
-    Value* Builder::FCLAMP(Value* src, float low, float high)
-    {
-        Value* vRaised = VMAXPS(src, VIMMED1(low));
-        return VMINPS(vRaised, VIMMED1(high));
-    }
-
-    /// @brief Fused multiply-add a * b + c; forwards to VFMADDPS.
-    Value* Builder::FMADDPS(Value* a, Value* b, Value* c)
-    {
-        // This maps to LLVM fmuladd intrinsic
-        return VFMADDPS(a, b, c);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief pop count on vector mask (e.g. <8 x i1>)
-    ///        The mask is packed to an integer with VMOVMSK first.
-    Value* Builder::VPOPCNT(Value* a)
-    {
-        Value* vPacked = VMOVMSK(a);
-        return POPCNT(vPacked);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Float / Fixed-point conversions
-    //////////////////////////////////////////////////////////////////////////
-    Value* Builder::VCVT_F32_FIXED_SI(Value*             vFloat,
-                                      uint32_t           numIntBits,
-                                      uint32_t           numFracBits,
-                                      const llvm::Twine& name)
-    { // Builds IR converting float lanes to signed fixed point with numFracBits fractional bits.
-        SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
-        Value* fixed = nullptr;
-        // Only the manual mantissa/exponent path below is compiled; the direct cast is disabled.
-#if 0   // This doesn't work for negative numbers!!
-        {
-            fixed = FP_TO_SI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
-                                    C(_MM_FROUND_TO_NEAREST_INT)),
-                             mSimdInt32Ty);
-        }
-        else
-#endif
-        {
-            // Do round to nearest int on fractional bits first
-            // Not entirely perfect for negative numbers, but close enough
-            vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
-                            C(_MM_FROUND_TO_NEAREST_INT));
-            vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits))); // scale back down
-
-            // TODO: Handle INF, NAN, overflow / underflow, etc.
-            // NOTE(review): for 0.0 the shift count computed below exceeds 31 - confirm result.
-            Value* vSgn      = FCMP_OLT(vFloat, VIMMED1(0.0f)); // lanes negative after rounding
-            Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty);
-            Value* vFixed    = AND(vFloatInt, VIMMED1((1 << 23) - 1)); // low 23 bits: mantissa field
-            vFixed           = OR(vFixed, VIMMED1(1 << 23)); // add the implicit leading one
-            vFixed           = SELECT(vSgn, NEG(vFixed), vFixed); // negate for negative lanes
-            // Shift the sign bit out the top, then shift down by 24 to isolate the exponent field.
-            Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24));
-            vExp        = SUB(vExp, VIMMED1(127)); // remove the exponent bias
-            // Right-shift amount that moves the 2^23-scaled mantissa onto the fixed-point scale.
-            Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp);
-
-            fixed = ASHR(vFixed, vExtraBits, name); // arithmetic shift keeps the sign
-        }
-
-        return fixed;
-    }
-
-    /// @brief Builds IR converting signed fixed point lanes to float.
-    ///
-    /// The integer part (arithmetic shift right by numFracBits) and the fractional part
-    /// (low numFracBits bits divided by 2^numFracBits) are converted separately and added.
-    ///
-    /// @param vFixed      - fixed-point input
-    /// @param numIntBits  - integer bits; when 0 the integer part contributes 0.0
-    /// @param numFracBits - fractional bits; when 0 the fractional part contributes 0.0
-    /// @param name        - name applied to the emitted conversion, divide and add
-    /// @return sum of the integer and fractional parts as float
-    Value* Builder::VCVT_FIXED_SI_F32(Value*             vFixed,
-                                      uint32_t           numIntBits,
-                                      uint32_t           numFracBits,
-                                      const llvm::Twine& name)
-    {
-        SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
-
-        const uint32_t unusedHighBits = 32 - numIntBits - numFracBits;
-        if (numIntBits != 0 && unusedHighBits != 0)
-        {
-            // Sign extend: push the value to the top of the word, then shift back arithmetically.
-            Value* vShift = VIMMED1(unusedHighBits);
-            vFixed        = ASHR(SHL(vFixed, vShift), vShift);
-        }
-
-        Value* vWhole = VIMMED1(0.0f);
-        if (numIntBits != 0)
-        {
-            Value* vIntPart = ASHR(vFixed, VIMMED1(numFracBits));
-            vWhole          = SI_TO_FP(vIntPart, mSimdFP32Ty, name);
-        }
-
-        Value* vPartial = VIMMED1(0.0f);
-        if (numFracBits != 0)
-        {
-            const int fracScale = 1 << numFracBits;
-            Value*    vFracBits = AND(vFixed, VIMMED1(fracScale - 1));
-            Value*    vFracFlt  = UI_TO_FP(vFracBits, mSimdFP32Ty);
-            vPartial            = FDIV(vFracFlt, VIMMED1(float(fracScale)), name);
-        }
-
-        return FADD(vWhole, vPartial, name);
-    }
-
-    /// @brief Builds IR converting float lanes to unsigned fixed point.
-    ///
-    /// The active path scales each lane by 2^numFracBits, rounds to nearest and converts
-    /// with an unsigned float-to-int cast. The disabled #else path is the manual
-    /// mantissa/exponent variant.
-    ///
-    /// @param vFloat      - float input
-    /// @param numIntBits  - integer bits of the target format (only used by the assert)
-    /// @param numFracBits - fractional bits of the target format
-    /// @param name        - name applied to the resulting IR value
-    /// @return the fixed-point value as mSimdInt32Ty
-    Value* Builder::VCVT_F32_FIXED_UI(Value*             vFloat,
-                                      uint32_t           numIntBits,
-                                      uint32_t           numFracBits,
-                                      const llvm::Twine& name)
-    {
-        SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
-        Value* fixed = nullptr;
-#if 1   // KNOB_SIM_FAST_MATH?  Below works correctly from a precision
-        // standpoint...
-        {
-            // Forward name so the result is labelled here too, as it is in the #else path
-            // and in the signed variant.
-            fixed = FP_TO_UI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
-                                    C(_MM_FROUND_TO_NEAREST_INT)),
-                             mSimdInt32Ty,
-                             name);
-        }
-#else
-        {
-            // Do round to nearest int on fractional bits first
-            vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
-                            C(_MM_FROUND_TO_NEAREST_INT));
-            vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits)));
-
-            // TODO: Handle INF, NAN, overflow / underflow, etc.
-
-            Value* vSgn      = FCMP_OLT(vFloat, VIMMED1(0.0f));
-            Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty);
-            Value* vFixed    = AND(vFloatInt, VIMMED1((1 << 23) - 1));
-            vFixed           = OR(vFixed, VIMMED1(1 << 23));
-
-            Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24));
-            vExp        = SUB(vExp, VIMMED1(127));
-
-            Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp);
-
-            fixed = LSHR(vFixed, vExtraBits, name);
-        }
-#endif
-        return fixed;
-    }
-
-    /// @brief Builds IR converting unsigned fixed point lanes to float.
-    ///
-    /// The integer part (logical shift right by numFracBits) and the fractional part
-    /// (low numFracBits bits divided by 2^numFracBits) are converted separately and added.
-    ///
-    /// @param vFixed      - fixed-point input
-    /// @param numIntBits  - integer bits; when 0 the integer part contributes 0.0
-    /// @param numFracBits - fractional bits; when 0 the fractional part contributes 0.0
-    /// @param name        - name applied to the emitted conversion, divide and add
-    /// @return sum of the integer and fractional parts as float
-    Value* Builder::VCVT_FIXED_UI_F32(Value*             vFixed,
-                                      uint32_t           numIntBits,
-                                      uint32_t           numFracBits,
-                                      const llvm::Twine& name)
-    {
-        SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
-        uint32_t extraBits = 32 - numIntBits - numFracBits;
-        if (numIntBits && extraBits)
-        {
-            // Zero extend: the value is unsigned, so the unused high bits must be cleared
-            // with a logical shift. An arithmetic shift would replicate the top value bit
-            // into them and corrupt the integer part extracted with LSHR below.
-            Value* shftAmt = VIMMED1(extraBits);
-            vFixed         = LSHR(SHL(vFixed, shftAmt), shftAmt);
-        }
-
-        Value* fVal  = VIMMED1(0.0f);
-        Value* fFrac = VIMMED1(0.0f);
-        if (numIntBits)
-        {
-            fVal = UI_TO_FP(LSHR(vFixed, VIMMED1(numFracBits)), mSimdFP32Ty, name);
-        }
-
-        if (numFracBits)
-        {
-            fFrac = UI_TO_FP(AND(vFixed, VIMMED1((1 << numFracBits) - 1)), mSimdFP32Ty);
-            fFrac = FDIV(fFrac, VIMMED1(float(1 << numFracBits)), name);
-        }
-
-        return FADD(fVal, fFrac, name);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief C functions called by LLVM IR
-    //////////////////////////////////////////////////////////////////////////
-
-    /// @brief Extract one half of a SIMD vector with a shuffle.
-    /// @param a    - source vector
-    /// @param imm8 - zero selects lanes [0, mVWidth/2); non-zero selects the upper half
-    /// @return a vector of mVWidth/2 lanes taken from a
-    Value* Builder::VEXTRACTI128(Value* a, Constant* imm8)
-    {
-        const unsigned halfWidth = mVWidth / 2;
-        const unsigned firstLane = imm8->isZeroValue() ? 0 : halfWidth;
-
-        SmallVector<Constant*, 8> laneIdx;
-        for (unsigned lane = 0; lane < halfWidth; ++lane)
-        {
-            laneIdx.push_back(C(firstLane + lane));
-        }
-        return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(laneIdx));
-    }
-
-    /// @brief Replace one half of a with b using two shuffles.
-    /// @param a    - full-width vector supplying the half that is kept
-    /// @param b    - vector supplying the inserted half
-    /// @param imm8 - zero inserts b into the lower half; non-zero into the upper half
-    /// @return the combined mVWidth-lane vector
-    Value* Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
-    {
-        const bool     insertHigh = !imm8->isZeroValue();
-        const unsigned halfWidth  = mVWidth / 2;
-
-        // First shuffle: spread b across mVWidth lanes (indices 0..mVWidth-1).
-        SmallVector<Constant*, 8> widenIdx;
-        for (unsigned lane = 0; lane < mVWidth; ++lane)
-        {
-            widenIdx.push_back(C(lane));
-        }
-        Value* vWideB = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(widenIdx));
-
-        // Second shuffle: indices below mVWidth pick from a, indices from mVWidth up pick
-        // from the widened b.
-        SmallVector<Constant*, 8> blendIdx;
-        for (unsigned lane = 0; lane < mVWidth; ++lane)
-        {
-            unsigned srcIdx;
-            if (lane < halfWidth)
-            {
-                srcIdx = insertHigh ? lane : lane + mVWidth;
-            }
-            else
-            {
-                srcIdx = insertHigh ? lane + halfWidth : lane;
-            }
-            blendIdx.push_back(C(srcIdx));
-        }
-        return VSHUFFLE(a, vWideB, ConstantVector::get(blendIdx));
-    }
-
-    // rdtsc buckets macros
-    /// @brief Emit a call to BucketManager_StartBucket(pBucketMgr, pId).
-    ///
-    /// Nothing is emitted unless KNOB_SINGLE_THREADED is set. The callee is declared in the
-    /// current module and its address is registered with the dynamic library symbol table
-    /// if it is not already known there.
-    ///
-    /// @param pBucketMgr - first call argument (declared as a pointer to mInt32Ty)
-    /// @param pId        - second call argument (declared as mInt32Ty)
-    void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
-    {
-        // @todo due to an issue with thread local storage propagation in llvm, we can only safely
-        // call into buckets framework when single threaded
-        if (!(KNOB_SINGLE_THREADED))
-        {
-            return;
-        }
-
-        std::vector<Type*> paramTys{
-            PointerType::get(mInt32Ty, 0), // pBucketMgr
-            mInt32Ty                       // id
-        };
-        FunctionType* pStartTy =
-            FunctionType::get(Type::getVoidTy(JM()->mContext), paramTys, false);
-
-#if LLVM_VERSION_MAJOR >= 9
-        Function* pStartFn = cast<Function>(
-            JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pStartTy).getCallee());
-#else
-        Function* pStartFn = cast<Function>(
-            JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pStartTy));
-#endif
-
-        // Make the host function resolvable by name if nobody registered it yet.
-        const bool alreadyKnown =
-            sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") != nullptr;
-        if (!alreadyKnown)
-        {
-            sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket",
-                                           (void*)&BucketManager_StartBucket);
-        }
-
-        CALL(pStartFn, {pBucketMgr, pId});
-    }
-
-    /// @brief Emit a call to BucketManager_StopBucket(pBucketMgr, pId).
-    ///
-    /// Nothing is emitted unless KNOB_SINGLE_THREADED is set. The callee is declared in the
-    /// current module and its address is registered with the dynamic library symbol table
-    /// if it is not already known there.
-    ///
-    /// @param pBucketMgr - first call argument (declared as a pointer to mInt32Ty)
-    /// @param pId        - second call argument (declared as mInt32Ty)
-    void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
-    {
-        // @todo due to an issue with thread local storage propagation in llvm, we can only safely
-        // call into buckets framework when single threaded
-        if (!(KNOB_SINGLE_THREADED))
-        {
-            return;
-        }
-
-        std::vector<Type*> paramTys{
-            PointerType::get(mInt32Ty, 0), // pBucketMgr
-            mInt32Ty                       // id
-        };
-        FunctionType* pStopTy =
-            FunctionType::get(Type::getVoidTy(JM()->mContext), paramTys, false);
-
-#if LLVM_VERSION_MAJOR >= 9
-        Function* pStopFn = cast<Function>(
-            JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pStopTy).getCallee());
-#else
-        Function* pStopFn = cast<Function>(
-            JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pStopTy));
-#endif
-
-        // Make the host function resolvable by name if nobody registered it yet.
-        const bool alreadyKnown =
-            sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") != nullptr;
-        if (!alreadyKnown)
-        {
-            sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket",
-                                           (void*)&BucketManager_StopBucket);
-        }
-
-        CALL(pStopFn, {pBucketMgr, pId});
-    }
-
-    /// @brief Size in bytes of an LLVM type, computed recursively.
-    ///
-    /// Structs are the sum of their element sizes and arrays are element count times
-    /// element size; no padding or alignment is accounted for. Integers are bit width / 8
-    /// (rounded down); float, half and double are 4, 2 and 8.
-    ///
-    /// @param pType - type to measure
-    /// @return size in bytes; any other type asserts and returns 0
-    uint32_t Builder::GetTypeSize(Type* pType)
-    {
-        if (pType->isStructTy())
-        {
-            // Add up every member instead of assuming all members match element 0; this also
-            // avoids indexing element 0 of an empty struct.
-            const uint32_t numElems  = pType->getStructNumElements();
-            uint32_t       totalSize = 0;
-            for (uint32_t elem = 0; elem < numElems; ++elem)
-            {
-                totalSize += GetTypeSize(pType->getStructElementType(elem));
-            }
-            return totalSize;
-        }
-
-        if (pType->isArrayTy())
-        {
-            uint32_t numElems = pType->getArrayNumElements();
-            Type*    pElemTy  = pType->getArrayElementType();
-            return numElems * GetTypeSize(pElemTy);
-        }
-
-        if (pType->isIntegerTy())
-        {
-            uint32_t bitSize = pType->getIntegerBitWidth();
-            return bitSize / 8;
-        }
-
-        if (pType->isFloatTy())
-        {
-            return 4;
-        }
-
-        if (pType->isHalfTy())
-        {
-            return 2;
-        }
-
-        if (pType->isDoubleTy())
-        {
-            return 8;
-        }
-
-        SWR_ASSERT(false, "Unimplemented type.");
-        return 0;
-    }
-} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
deleted file mode 100644
index a7d69eaf9d0..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file builder_misc.h
- *
- * @brief miscellaneous builder functions
- *
- * Notes:
- *
- ******************************************************************************/
-#pragma once
-
-Constant* C(bool i);
-Constant* C(char i);
-Constant* C(uint8_t i);
-Constant* C(int i);
-Constant* C(int64_t i);
-Constant* C(uint64_t i);
-Constant* C(uint16_t i);
-Constant* C(uint32_t i);
-Constant* C(float i);
-
-/// @brief Build a constant vector from a braced list, one C() constant per entry.
-/// @param constList - values, in lane order
-/// @return ConstantVector holding C(value) for every entry
-template <typename Ty>
-Constant* C(const std::initializer_list<Ty>& constList)
-{
-    std::vector<Constant*> laneConsts;
-    for (auto it = constList.begin(); it != constList.end(); ++it)
-    {
-        const Ty laneValue = *it;
-        laneConsts.push_back(C(laneValue));
-    }
-    return ConstantVector::get(laneConsts);
-}
-
-/// @brief Build a constant vector from a std::vector, one C() constant per entry.
-/// @param constList - values, in lane order
-/// @return ConstantVector holding C(value) for every entry
-template <typename Ty>
-Constant* C(const std::vector<Ty>& constList)
-{
-    std::vector<Constant*> laneConsts;
-    for (auto it = constList.begin(); it != constList.end(); ++it)
-    {
-        const Ty laneValue = *it;
-        laneConsts.push_back(C(laneValue));
-    }
-    return ConstantVector::get(laneConsts);
-}
-
-/// @brief Build a constant data array from constList in the given context.
-/// @param ctx       - LLVM context passed to ConstantDataArray::get
-/// @param constList - element values
-/// @return the constant returned by ConstantDataArray::get
-template <typename Ty>
-Constant* CA(LLVMContext& ctx, ArrayRef<Ty> constList)
-{
-    Constant* pDataArray = ConstantDataArray::get(ctx, constList);
-    return pDataArray;
-}
-
-/// @brief Build a constant vector of consecutive values base, base + 1, ...
-/// @tparam Ty   - type each value is cast to before being turned into a constant
-/// @param base  - first value
-/// @param count - number of lanes
-/// @return ConstantVector with count lanes
-template <typename Ty>
-Constant* CInc(uint32_t base, uint32_t count)
-{
-    std::vector<Constant*> laneConsts;
-
-    for (uint32_t step = 0; step != count; ++step)
-    {
-        const uint32_t laneValue = base + step;
-        laneConsts.push_back(C((Ty)laneValue));
-    }
-    return ConstantVector::get(laneConsts);
-}
-
-Constant* PRED(bool pred);
-
-Value* VIMMED1(uint64_t i);
-Value* VIMMED1_16(uint64_t i);
-
-Value* VIMMED1(int i);
-Value* VIMMED1_16(int i);
-
-Value* VIMMED1(uint32_t i);
-Value* VIMMED1_16(uint32_t i);
-
-Value* VIMMED1(float i);
-Value* VIMMED1_16(float i);
-
-Value* VIMMED1(bool i);
-Value* VIMMED1_16(bool i);
-
-Value* VUNDEF(Type* t);
-
-Value* VUNDEF_F();
-Value* VUNDEF_F_16();
-
-Value* VUNDEF_I();
-Value* VUNDEF_I_16();
-
-Value* VUNDEF(Type* ty, uint32_t size);
-
-Value* VUNDEF_IPTR();
-
-Value* VBROADCAST(Value* src, const llvm::Twine& name = "");
-Value* VBROADCAST_16(Value* src);
-
-Value* VRCP(Value* va, const llvm::Twine& name = "");
-Value* VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY);
-
-uint32_t IMMED(Value* i);
-int32_t  S_IMMED(Value* i);
-
-CallInst* CALL(Value* Callee, const std::initializer_list<Value*>& args, const llvm::Twine& name = "");
-CallInst* CALL(Value* Callee)
-{ // Call with no arguments; forwards to CALLA.
-#if LLVM_VERSION_MAJOR >= 11
-    // Not a great idea - we lose the type info (Function) when calling CALL
-    // and then recast it here. Good enough for now, but this should be
-    // cleaner - ideally CALL would always take a Function
-    return CALLA(FunctionCallee(cast<Function>(Callee)));
-#else
-    return CALLA(Callee);
-#endif
-}
-CallInst* CALL(Value* Callee, Value* arg);
-CallInst* CALL2(Value* Callee, Value* arg1, Value* arg2);
-CallInst* CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3);
-
-Value* MASK(Value* vmask);
-Value* MASK_16(Value* vmask);
-
-Value* VMASK(Value* mask);
-Value* VMASK_16(Value* mask);
-
-Value* VMOVMSK(Value* mask);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Float / Fixed-point conversions
-//////////////////////////////////////////////////////////////////////////
-// Signed
-Value* VCVT_F32_FIXED_SI(Value*             vFloat,
-                         uint32_t           numIntBits,
-                         uint32_t           numFracBits,
-                         const llvm::Twine& name = "");
-Value* VCVT_FIXED_SI_F32(Value*             vFixed,
-                         uint32_t           numIntBits,
-                         uint32_t           numFracBits,
-                         const llvm::Twine& name = "");
-// Unsigned
-Value* VCVT_F32_FIXED_UI(Value*             vFloat,
-                         uint32_t           numIntBits,
-                         uint32_t           numFracBits,
-                         const llvm::Twine& name = "");
-Value* VCVT_FIXED_UI_F32(Value*             vFixed,
-                         uint32_t           numIntBits,
-                         uint32_t           numFracBits,
-                         const llvm::Twine& name = "");
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief functions that build IR to call x86 intrinsics directly, or
-/// emulate them with other instructions if not available on the host
-//////////////////////////////////////////////////////////////////////////
-
-Value* EXTRACT_16(Value* x, uint32_t imm);
-Value* JOIN_16(Value* a, Value* b);
-
-Value* PSHUFB(Value* a, Value* b);
-Value* PMOVSXBD(Value* a);
-Value* PMOVSXWD(Value* a);
-Value* CVTPH2PS(Value* a, const llvm::Twine& name = "");
-Value* CVTPS2PH(Value* a, Value* rounding);
-Value* PMAXSD(Value* a, Value* b);
-Value* PMINSD(Value* a, Value* b);
-Value* PMAXUD(Value* a, Value* b);
-Value* PMINUD(Value* a, Value* b);
-Value* VABSPS(Value* a);
-Value* FMADDPS(Value* a, Value* b, Value* c);
-
-Value* ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name = "");
-Value* FCLAMP(Value* src, Value* low, Value* high);
-Value* FCLAMP(Value* src, float low, float high);
-
-CallInst* PRINT(const std::string& printStr);
-CallInst* PRINT(const std::string& printStr, const std::initializer_list<Value*>& printArgs);
-
-Value* VPOPCNT(Value* a);
-
-/// @brief Thin alias that forwards to DEBUGTRAP().
-/// @return the value returned by DEBUGTRAP()
-Value* INT3() { return DEBUGTRAP(); }
-
-
-Value* VEXTRACTI128(Value* a, Constant* imm8);
-Value* VINSERTI128(Value* a, Value* b, Constant* imm8);
-
-// rdtsc buckets macros
-void RDTSC_START(Value* pBucketMgr, Value* pId);
-void RDTSC_STOP(Value* pBucketMgr, Value* pId);
-
-Value* CreateEntryAlloca(Function* pFunc, Type* pType);
-Value* CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize);
-
-uint32_t GetTypeSize(Type* pType);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
deleted file mode 100644
index bd5f7588c91..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ /dev/null
@@ -1,2332 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file fetch_jit.cpp
- *
- * @brief Implementation of the fetch jitter
- *
- * Notes:
- *
- ******************************************************************************/
-#include "jit_pch.hpp"
-#include "builder_gfx_mem.h"
-#include "jit_api.h"
-#include "fetch_jit.h"
-#include "gen_state_llvm.h"
-#include "functionpasses/passes.h"
-
-//#define FETCH_DUMP_VERTEX 1
-using namespace llvm;
-using namespace SwrJit;
-
-bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
-
-enum ConversionType // conversion selector carried in the Shuffle*bpc argument tuples
-{ // NOTE(review): per-value meanings are not visible in this chunk - confirm at the use sites
-    CONVERT_NONE,
-    CONVERT_NORMALIZED,
-    CONVERT_USCALED,
-    CONVERT_SSCALED,
-    CONVERT_SFIXED,
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// Interface to Jitting a fetch shader
-//////////////////////////////////////////////////////////////////////////
-struct FetchJit : public BuilderGfxMem
-{ // IR builder specialised for generating fetch shaders; inherits BuilderGfxMem.
-    FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr), mpFetchInfo(NULL) {}
-    // Entry point: builds and optimizes the fetch function for the given compile state.
-    Function* Create(const FETCH_COMPILE_STATE& fetchState);
-    // Index loaders; each takes the index pointer and pLastIndex (presumably the read bound).
-    Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
-    Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
-    Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
-    template <typename T>
-    Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);
-
-    // package up Shuffle*bpcGatherd args into a tuple for convenience
-    typedef std::tuple<Value*&,
-                       Value*,
-                       const Instruction::CastOps,
-                       const ConversionType,
-                       uint32_t&,
-                       uint32_t&,
-                       const ComponentEnable,
-                       const ComponentControl (&)[4],
-                       Value* (&)[4],
-                       const uint32_t (&)[4]>
-        Shuffle8bpcArgs;
-
-    void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
-    void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);
-    // Same idea for the 16bpc shuffles: first element is a 2-entry array, no trailing uint32_t[4].
-    typedef std::tuple<Value* (&)[2],
-                       Value*,
-                       const Instruction::CastOps,
-                       const ConversionType,
-                       uint32_t&,
-                       uint32_t&,
-                       const ComponentEnable,
-                       const ComponentControl (&)[4],
-                       Value* (&)[4]>
-        Shuffle16bpcArgs;
-
-    void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
-    void Shuffle16bpcGather(Shuffle16bpcArgs& args);
-
-    void StoreVertexElements(Value*         pVtxOut,
-                             const uint32_t outputElt,
-                             const uint32_t numEltsToStore,
-                             Value* (&vVertexElements)[4]);
-
-    Value* GenerateCompCtrlVector(const ComponentControl ctrl);
-    // Called from Create() to fetch the attributes and write them to pVtxOut.
-    void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
-                           Value*                     streams,
-                           Value*                     vIndices,
-                           Value*                     pVtxOut);
-    // Format helpers used for formats whose components are not 8/16/32/64 bits wide.
-    bool IsOddFormat(SWR_FORMAT format);
-    bool IsUniformFormat(SWR_FORMAT format);
-    void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
-    void CreateGatherOddFormats(
-        SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
-    void ConvertFormat(SWR_FORMAT format, Value* texels[4]);
-    // Set by Create() to the function argument it names "fetchInfo"; NULL until then.
-    Value* mpFetchInfo;
-};
-
-Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
-{ // Builds the fetch shader IR for fetchState, runs the pass pipelines on it and returns it.
-    std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
-    fnName << ComputeCRC(0, &fetchState, sizeof(fetchState)); // appended after "FCH_" (ate mode)
-    // The function gets external linkage and lives in the JIT manager's current module.
-    Function* fetch = Function::Create(
-        JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
-    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
-    // The module is renamed after the function it holds.
-    fetch->getParent()->setModuleIdentifier(fetch->getName());
-
-    IRB()->SetInsertPoint(entry);
-
-    auto argitr = fetch->arg_begin();
-
-    // Fetch shader arguments, in order: privateContext, pWorkerData, fetchInfo, vtxOutput
-    Value* privateContext = &*argitr;
-    ++argitr;
-    privateContext->setName("privateContext");
-    SetPrivateContext(privateContext);
-
-    mpWorkerData = &*argitr;
-    ++argitr;
-    mpWorkerData->setName("pWorkerData");
-
-    mpFetchInfo = &*argitr;
-    ++argitr;
-    mpFetchInfo->setName("fetchInfo");
-    Value* pVtxOut = &*argitr;
-    pVtxOut->setName("vtxOutput");
-
-    uint32_t baseWidth = mVWidth; // remembered so the SIMD16 override can be reverted at the end
-
-    SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
-
-    // Override builder target width to force 16-wide SIMD
-#if USE_SIMD16_SHADERS
-    SetTargetWidth(16);
-#endif
-    // Treat the output as a pointer to SIMD float vectors.
-    pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
-
-    // SWR_FETCH_CONTEXT::pStreams
-    Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
-    streams->setName("pStreams");
-
-    // SWR_FETCH_CONTEXT::xpIndices
-    Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
-    indices->setName("pIndices");
-
-    // SWR_FETCH_CONTEXT::xpLastIndex
-    Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
-    pLastIndex->setName("pLastIndex");
-    // Load one SIMD of indices; 8/16-bit indices are zero-extended to mSimdInt32Ty when loaded directly.
-    Value* vIndices;
-    switch (fetchState.indexType)
-    {
-    case R8_UINT:
-        indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
-        if (fetchState.bDisableIndexOOBCheck)
-        {
-            vIndices = LOAD(
-                BITCAST(indices, PointerType::get(getVectorType(mInt8Ty, mpJitMgr->mVWidth), 0)),
-                {(uint32_t)0});
-            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
-        }
-        else
-        {
-            vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
-        }
-        break;
-    case R16_UINT:
-        if (fetchState.bDisableIndexOOBCheck)
-        {
-            vIndices = LOAD(
-                BITCAST(indices, PointerType::get(getVectorType(mInt16Ty, mpJitMgr->mVWidth), 0)),
-                {(uint32_t)0});
-            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
-        }
-        else
-        {
-            vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
-        }
-        break;
-    case R32_UINT:
-        (fetchState.bDisableIndexOOBCheck)
-            ? vIndices = LOAD(indices,
-                              "",
-                              PointerType::get(mSimdInt32Ty, 0),
-                              MEM_CLIENT::GFX_MEM_CLIENT_FETCH)
-            : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
-        break; // incoming type is already 32bit int
-    default:
-        vIndices = nullptr; // NOTE(review): with asserts compiled out, code below uses this null value
-        assert(false && "Unsupported index type");
-        break;
-    }
-    // Sequential access replaces the loaded indices with StartVertex + lane number.
-    if (fetchState.bForceSequentialAccessEnable)
-    {
-        Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
-                                       : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
-
-        // VertexData buffers are accessed sequentially, the index is equal to the vertex number
-        vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
-        vIndices = ADD(vIndices, pOffsets);
-    }
-
-    Value* vVertexId = vIndices;
-    if (fetchState.bVertexIDOffsetEnable)
-    {
-        // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
-        // correct
-        Value* vBaseVertex  = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
-        Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
-        vVertexId           = ADD(vIndices, vBaseVertex);
-        vVertexId           = ADD(vVertexId, vStartVertex);
-    }
-
-    // store out vertex IDs
-    if (mVWidth == 16)
-    {
-        // store out in simd8 halves until core supports 16-wide natively
-        auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
-        auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
-        STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
-        STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
-    }
-    else if (mVWidth == 8)
-    {
-        STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
-    }
-
-    // store out cut mask if enabled
-    if (fetchState.bEnableCutIndex)
-    {
-        Value* vCutIndex = VIMMED1(fetchState.cutIndex);
-        Value* cutMask   = VMASK(ICMP_EQ(vIndices, vCutIndex)); // lanes whose index equals cutIndex
-
-        if (mVWidth == 16)
-        {
-            auto cutMaskLo = EXTRACT_16(cutMask, 0);
-            auto cutMaskHi = EXTRACT_16(cutMask, 1);
-            STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
-            STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
-        }
-        else if (mVWidth == 8)
-        {
-            STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
-        }
-    }
-
-    // Fetch attributes from memory and output to a simdvertex struct
-    JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
-
-    RET_VOID();
-    // IR is complete; dump it before any pass has run.
-    JitManager::DumpToFile(fetch, "src");
-
-#if defined(_DEBUG)
-    verifyFunction(*fetch);
-#endif
-
-    ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
-
-    ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
-    setupPasses.add(createBreakCriticalEdgesPass());
-    setupPasses.add(createCFGSimplificationPass());
-    setupPasses.add(createEarlyCSEPass());
-    setupPasses.add(createPromoteMemoryToRegisterPass());
-
-    setupPasses.run(*fetch);
-
-    JitManager::DumpToFile(fetch, "se");
-
-    ::FunctionPassManager optPasses(JM()->mpCurrentModule);
-
-    ///@todo Haven't touched these either. Need to remove some of these and add others.
-    optPasses.add(createCFGSimplificationPass());
-    optPasses.add(createEarlyCSEPass());
-    optPasses.add(createInstructionCombiningPass());
-#if LLVM_VERSION_MAJOR <= 11
-    optPasses.add(createConstantPropagationPass());
-#endif
-    optPasses.add(createSCCPPass());
-    optPasses.add(createAggressiveDCEPass());
-
-    optPasses.run(*fetch);
-    // NOTE(review): the lowering pass is appended to the same manager, so the run below presumably repeats the passes above - confirm intended.
-    optPasses.add(createLowerX86Pass(this));
-    optPasses.run(*fetch);
-
-    JitManager::DumpToFile(fetch, "opt");
-
-
-    // Revert 16-wide override
-#if USE_SIMD16_SHADERS
-    SetTargetWidth(baseWidth);
-#endif
-
-    return fetch;
-}
-
-// An "odd" format is one whose first component is not 8, 16, 32 or 64 bits wide;
-// such formats require special state.gather handling. Only bpc[0] is inspected.
-bool FetchJit::IsOddFormat(SWR_FORMAT format)
-{
-    switch (GetFormatInfo(format).bpc[0])
-    {
-    case 8:
-    case 16:
-    case 32:
-    case 64:
-        return false;
-    default:
-        return true;
-    }
-}
-
-// A format is uniform when every component has the same bit width and the same type
-// as component 0. Formats with zero or one component are trivially uniform.
-bool FetchJit::IsUniformFormat(SWR_FORMAT format)
-{
-    const SWR_FORMAT_INFO& fmt       = GetFormatInfo(format);
-    const uint32_t         refBits   = fmt.bpc[0];
-    const uint32_t         refType   = fmt.type[0];
-    bool                   isUniform = true;
-
-    // Stop at the first component that differs from component 0.
-    for (uint32_t comp = 1; isUniform && comp < fmt.numComps; ++comp)
-    {
-        isUniform = (refBits == fmt.bpc[comp]) && (refType == fmt.type[comp]);
-    }
-    return isUniform;
-}
-
-// Splits a packed pixel into its components according to the format description.
-// Components are laid out from the least significant bit upward; for each one the
-// bits are masked out of vInput, shifted down to bit 0 and written to
-// result[swizzle[component]].
-void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
-{
-    const SWR_FORMAT_INFO& fmt = GetFormatInfo(format);
-
-    uint32_t lsb = 0; // bit position where the current component starts
-    for (uint32_t comp = 0; comp < fmt.numComps; ++comp)
-    {
-        const uint32_t width     = fmt.bpc[comp];
-        const uint32_t fieldMask = ((1 << width) - 1) << lsb;
-
-        Value* isolated = AND(vInput, fieldMask);
-        result[fmt.swizzle[comp]] = LSHR(isolated, lsb);
-
-        lsb += width;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Gather for odd component size formats.
-///        Gathers SIMD full pixels per lane, then shifts/masks to move each
-///        component into its own vector (see UnpackComponents).
-///        Only works if the pixel size is <= 32 bits.
-/// @param format - format of the pixels being fetched
-/// @param pMask - per-lane mask; only lanes with a set mask are loaded
-/// @param xpBase - base address the per-lane offsets are added to
-/// @param pOffsets - per-lane byte offsets from xpBase
-/// @param pResult - receives the four component vectors, bitcast to fp32;
-///                  components the format does not provide hold the
-///                  format's default values
-void FetchJit::CreateGatherOddFormats(
-    SWR_FORMAT format, Value* pMask, Value* xpBase, Value* pOffsets, Value* pResult[4])
-{
-    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
-
-    // only works if pixel size is <= 32bits
-    SWR_ASSERT(info.bpp <= 32);
-
-    Value* pGather;
-    if (info.bpp == 32)
-    {
-        pGather =
-            GATHERDD(VIMMED1(0), xpBase, pOffsets, pMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
-    }
-    else
-    {
-        // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
-        // Instead, stage the pixels in a zero-initialized SIMD-wide stack slot that is
-        // filled one lane at a time with loads of exactly the pixel size.
-        Value* pMem = ALLOCA(mSimdInt32Ty);
-        STORE(VIMMED1(0u), pMem);
-
-        Value* pDstMem = POINTER_CAST(pMem, mInt32PtrTy);
-
-        for (uint32_t lane = 0; lane < mVWidth; ++lane)
-        {
-            // Get index
-            Value* index = VEXTRACT(pOffsets, C(lane));
-            Value* mask  = VEXTRACT(pMask, C(lane));
-
-            // use branch around load based on mask
-            // Needed to avoid page-faults on unmasked lanes
-            BasicBlock* pCurrentBB = IRB()->GetInsertBlock();
-            BasicBlock* pMaskedLoadBlock =
-                BasicBlock::Create(JM()->mContext, "MaskedLaneLoad", pCurrentBB->getParent());
-            BasicBlock* pEndLoadBB =
-                BasicBlock::Create(JM()->mContext, "AfterMaskedLoad", pCurrentBB->getParent());
-
-            COND_BR(mask, pMaskedLoadBlock, pEndLoadBB);
-
-            JM()->mBuilder.SetInsertPoint(pMaskedLoadBlock);
-
-            switch (info.bpp)
-            {
-            case 8:
-            {
-                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
-                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
-                STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
-                break;
-            }
-
-            case 16:
-            {
-                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
-                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
-                STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
-                break;
-            }
-
-            case 24:
-            {
-                // First 16-bits of data
-                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
-                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
-                STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
-
-                // Last 8-bits of data
-                // pDst is a 16-bit pointer here, so stepping it by one element lands
-                // on the third byte of this lane's slot.
-                pDst  = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
-                xpSrc = ADD(xpSrc, C((int64_t)2));
-                STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
-                break;
-            }
-
-            default:
-                SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
-                break;
-            }
-
-            // rejoin the unconditional path; the next lane's code is emitted there
-            BR(pEndLoadBB);
-            JM()->mBuilder.SetInsertPoint(pEndLoadBB);
-        }
-
-        pGather = LOAD(pMem);
-    }
-
-    // start from the format defaults so components the format lacks are still defined
-    for (uint32_t comp = 0; comp < 4; ++comp)
-    {
-        pResult[comp] = VIMMED1((int)info.defaults[comp]);
-    }
-
-    UnpackComponents(format, pGather, pResult);
-
-    // cast to fp32
-    pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
-    pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
-    pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
-    pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Converts the normalized components of a format to float in place.
-///        Components that are UNUSED or not normalized are left untouched.
-/// @param format - format describing component types, widths and swizzle
-/// @param texels - per-component values, indexed by the format's swizzle
-void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
-{
-    const SWR_FORMAT_INFO& fmtInfo = GetFormatInfo(format);
-
-    for (uint32_t comp = 0; comp < fmtInfo.numComps; ++comp)
-    {
-        // nothing to convert for UNUSED or non-normalized components
-        if (fmtInfo.type[comp] == SWR_TYPE_UNUSED || !fmtInfo.isNormalized[comp])
-        {
-            continue;
-        }
-
-        const uint32_t dst  = fmtInfo.swizzle[comp];
-        const uint32_t bits = fmtInfo.bpc[comp];
-
-        if (fmtInfo.type[comp] == SWR_TYPE_SNORM)
-        {
-            /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to
-            /// -1.0f.
-            // NOTE(review): the incoming value is converted as a signed 32-bit integer;
-            // confirm whether callers sign-extend the n-bit component beforehand.
-
-            /// result = c * (1.0f / (2^(n-1) - 1);
-            const uint32_t halfRange = 1 << (bits - 1);
-            Value*         vScale    = VIMMED1(1.0f / (float)(halfRange - 1));
-            Value*         vInt      = BITCAST(texels[dst], mSimdInt32Ty);
-            texels[dst]              = FMUL(SI_TO_FP(vInt, mSimdFP32Ty), vScale);
-            continue;
-        }
-
-        SWR_ASSERT(fmtInfo.type[comp] == SWR_TYPE_UNORM);
-
-        /// result = c * (1.0f / (2^n - 1))
-        const uint32_t fullRange = 1 << bits;
-        const float    maxValue  = (float)(fullRange - 1);
-
-        if (bits == 24)
-        {
-            // special case 24bit unorm format, which requires a full divide to meet ULP
-            // requirement
-            Value* vScale = VIMMED1(maxValue);
-            Value* vInt   = BITCAST(texels[dst], mSimdInt32Ty);
-            texels[dst]   = FDIV(SI_TO_FP(vInt, mSimdFP32Ty), vScale);
-        }
-        else
-        {
-            // every other width multiplies by the precomputed reciprocal
-            Value* vScale = VIMMED1(1.0f / maxValue);
-            Value* vInt   = BITCAST(texels[dst], mSimdInt32Ty);
-            texels[dst]   = FMUL(UI_TO_FP(vInt, mSimdFP32Ty), vScale);
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Loads attributes from memory using AVX2 GATHER(s)
-///        For every enabled input element this emits: per-stream state loads,
-///        an out-of-bounds gather mask computed in vertex units, the byte
-///        offsets into the vertex buffer, and finally a gather/convert
-///        sequence chosen by the element's format (odd-sized, float, or
-///        integer/normalized/scaled). Results are stored to the output in
-///        groups of up to four components.
-/// @param fetchState - info about attributes to be fetched from memory
-/// @param streams - value pointer to the current vertex stream
-/// @param vIndices - vector value of indices to gather
-/// @param pVtxOut - value pointer to output simdvertex struct
-void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
-                                 Value*                     streams,
-                                 Value*                     vIndices,
-                                 Value*                     pVtxOut)
-{
-    uint32_t currentVertexElement = 0;
-    uint32_t outputElt            = 0;
-    Value*   vVertexElements[4];
-
-    Value* startVertex   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
-    Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
-    Value* curInstance   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
-    Value* vBaseVertex   = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
-    curInstance->setName("curInstance");
-
-    if (fetchState.bInstanceIDOffsetEnable)
-    {
-        // the InstanceID (curInstance) value is offset by StartInstanceLocation.
-        // This is applied exactly once, before the element loop: curInstance lives
-        // across iterations, so adding inside the loop would stack one extra
-        // startInstance onto it for every enabled input element.
-        curInstance = ADD(curInstance, startInstance);
-    }
-
-    for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
-    {
-        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
-
-        // skip element if all components are disabled
-        if (ied.ComponentPacking == ComponentEnable::NONE)
-        {
-            continue;
-        }
-
-        const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
-        SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
-        uint32_t bpc =
-            info.bpp /
-            info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
-
-        Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
-
-        Value* stride  = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
-        Value* vStride = VBROADCAST(stride);
-
-        // max vertex index that is fully in bounds
-        Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
-        maxVertex        = LOAD(maxVertex);
-
-        Value* minVertex = NULL;
-        if (fetchState.bPartialVertexBuffer)
-        {
-            // min vertex index for low bounds OOB checking
-            minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
-            minVertex = LOAD(minVertex);
-        }
-
-        Value* vCurIndices;
-        Value* startOffset;
-        Value* vInstanceStride = VIMMED1(0);
-
-        if (ied.InstanceEnable)
-        {
-            Value* stepRate = C(ied.InstanceAdvancementState);
-
-            // prevent a div by 0 for 0 step rate
-            Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
-            stepRate             = SELECT(isNonZeroStep, stepRate, C(1));
-
-            // calc the current offset into instanced data buffer
-            Value* calcInstance = UDIV(curInstance, stepRate);
-
-            // if step rate is 0, every instance gets instance 0
-            calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
-
-            vCurIndices = VBROADCAST(calcInstance);
-            startOffset = startInstance;
-        }
-        else if (ied.InstanceStrideEnable)
-        {
-            // grab the instance advancement state, determines stride in bytes from one instance to
-            // the next
-            Value* stepRate = C(ied.InstanceAdvancementState);
-            vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
-
-            // offset indices by baseVertex
-            vCurIndices = ADD(vIndices, vBaseVertex);
-
-            startOffset = startVertex;
-            SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
-        }
-        else
-        {
-            // offset indices by baseVertex
-            vCurIndices = ADD(vIndices, vBaseVertex);
-            startOffset = startVertex;
-        }
-
-        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
-        // do 64bit address offset calculations.
-
-        // calculate byte offset to the start of the VB
-        Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
-
-        // VGATHER* takes an *i8 src pointer so that's what stream is
-        Value* pStreamBaseGFX = ADD(stream, baseOffset);
-
-        // if we have a start offset, subtract from max vertex. Used for OOB check
-        maxVertex     = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
-        Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
-        // if we have a negative value, we're already OOB. clamp at 0.
-        maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
-
-        if (fetchState.bPartialVertexBuffer)
-        {
-            // similarly for min vertex
-            minVertex     = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
-            Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
-            minVertex     = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
-        }
-
-        // Load the in bounds size of a partially valid vertex
-        Value* partialInboundsSize =
-            GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
-        partialInboundsSize       = LOAD(partialInboundsSize);
-        Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
-        Value* vBpp               = VBROADCAST(C(info.Bpp));
-        Value* vAlignmentOffsets  = VBROADCAST(C(ied.AlignedByteOffset));
-
-        // does the element fit within the partially valid size?
-        Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
-
-        // override cur indices with 0 if pitch is 0
-        Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
-        vCurIndices           = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
-
-        // are vertices partially OOB?
-        Value* vMaxVertex      = VBROADCAST(maxVertex);
-        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
-
-        // are vertices fully in bounds?
-        Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
-
-        Value* vGatherMask;
-        if (fetchState.bPartialVertexBuffer)
-        {
-            // are vertices below minVertex limit?
-            Value* vMinVertex     = VBROADCAST(minVertex);
-            Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
-
-            // only fetch lanes that pass both tests
-            vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
-        }
-        else
-        {
-            vGatherMask = vMaxGatherMask;
-        }
-
-        // blend in any partially OOB indices that have valid elements
-        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
-
-        // calculate the actual offsets into the VB
-        Value* vOffsets = MUL(vCurIndices, vStride);
-        vOffsets        = ADD(vOffsets, vAlignmentOffsets);
-
-        // if instance stride enable is:
-        //  true  - add product of the instanceID and advancement state to the offset into the VB
-        //  false - value of vInstanceStride has been initialized to zero
-        vOffsets = ADD(vOffsets, vInstanceStride);
-
-        // Packing and component control
-        ComponentEnable        compMask = (ComponentEnable)ied.ComponentPacking;
-        const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,
-                                           (ComponentControl)ied.ComponentControl1,
-                                           (ComponentControl)ied.ComponentControl2,
-                                           (ComponentControl)ied.ComponentControl3};
-
-        // Special gather/conversion for formats without equal component sizes
-        if (IsOddFormat((SWR_FORMAT)ied.Format))
-        {
-            Value* pResults[4];
-            CreateGatherOddFormats(
-                (SWR_FORMAT)ied.Format, vGatherMask, pStreamBaseGFX, vOffsets, pResults);
-            ConvertFormat((SWR_FORMAT)ied.Format, pResults);
-
-            for (uint32_t c = 0; c < 4; c += 1)
-            {
-                if (isComponentEnabled(compMask, c))
-                {
-                    vVertexElements[currentVertexElement++] = pResults[c];
-                    if (currentVertexElement > 3)
-                    {
-                        StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
-                        // reset to the next vVertexElement to output
-                        currentVertexElement = 0;
-                    }
-                }
-            }
-        }
-        else if (info.type[0] == SWR_TYPE_FLOAT)
-        {
-            ///@todo: support 64 bit vb accesses
-            Value* gatherSrc = VIMMED1(0.0f);
-
-            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
-                       "Unsupported format for standard gather fetch.");
-
-            // Gather components from memory to store in a simdvertex structure
-            switch (bpc)
-            {
-            case 16:
-            {
-                Value* vGatherResult[2];
-
-                // if we have at least one component out of x or y to fetch
-                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
-                {
-                    vGatherResult[0] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
-                    // e.g. result of first 8x32bit integer gather for 16bit components
-                    // 256i - 0    1    2    3    4    5    6    7
-                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
-                    //
-                }
-
-                // if we have at least one component out of z or w to fetch
-                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
-                {
-                    // offset base to the next components(zw) in the vertex to gather
-                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
-
-                    vGatherResult[1] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
-                    // e.g. result of second 8x32bit integer gather for 16bit components
-                    // 256i - 0    1    2    3    4    5    6    7
-                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
-                    //
-                }
-
-                // if we have at least one component to shuffle into place
-                if (compMask)
-                {
-                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
-                                                                  pVtxOut,
-                                                                  Instruction::CastOps::FPExt,
-                                                                  CONVERT_NONE,
-                                                                  currentVertexElement,
-                                                                  outputElt,
-                                                                  compMask,
-                                                                  compCtrl,
-                                                                  vVertexElements);
-
-                    // Shuffle gathered components into place in simdvertex struct
-                    mVWidth == 16 ? Shuffle16bpcGather16(args)
-                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
-                }
-            }
-            break;
-            case 32:
-            {
-                for (uint32_t i = 0; i < 4; i += 1)
-                {
-                    if (isComponentEnabled(compMask, i))
-                    {
-                        // if we need to gather the component
-                        if (compCtrl[i] == StoreSrc)
-                        {
-                            // Gather a SIMD of vertices
-                            // APIs allow a 4GB range for offsets
-                            // However, GATHERPS uses signed 32-bit offsets, so +/- 2GB range :(
-                            // Add 2GB to the base pointer and 2GB to the offsets.  This makes
-                            // "negative" (large) offsets into positive offsets and small offsets
-                            // into negative offsets.
-                            Value* vNewOffsets = ADD(vOffsets, VIMMED1(0x80000000));
-                            vVertexElements[currentVertexElement++] =
-                                GATHERPS(gatherSrc,
-                                         ADD(pStreamBaseGFX, C((uintptr_t)0x80000000U)),
-                                         vNewOffsets,
-                                         vGatherMask,
-                                         1,
-                                         MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
-                        }
-                        else
-                        {
-                            vVertexElements[currentVertexElement++] =
-                                GenerateCompCtrlVector(compCtrl[i]);
-                        }
-
-                        if (currentVertexElement > 3)
-                        {
-                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
-                            // reset to the next vVertexElement to output
-                            currentVertexElement = 0;
-                        }
-                    }
-
-                    // offset base to the next component in the vertex to gather
-                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
-                }
-            }
-            break;
-            case 64:
-            {
-                for (uint32_t i = 0; i < 4; i += 1)
-                {
-                    if (isComponentEnabled(compMask, i))
-                    {
-                        // if we need to gather the component
-                        if (compCtrl[i] == StoreSrc)
-                        {
-                            // doubles are gathered as two half-width gathers (low and high
-                            // lanes), rejoined, and then truncated to fp32
-                            Value* vShufLo;
-                            Value* vShufHi;
-                            Value* vShufAll;
-
-                            if (mVWidth == 8)
-                            {
-                                vShufLo  = C({0, 1, 2, 3});
-                                vShufHi  = C({4, 5, 6, 7});
-                                vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});
-                            }
-                            else
-                            {
-                                SWR_ASSERT(mVWidth == 16);
-                                vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7});
-                                vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15});
-                                vShufAll =
-                                    C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
-                            }
-
-                            Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
-                            Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
-
-                            Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
-                            Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
-
-                            Value* vZeroDouble = VECTOR_SPLAT(
-                                mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
-
-                            Value* pGatherLo =
-                                GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsLo, vMaskLo);
-                            Value* pGatherHi =
-                                GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsHi, vMaskHi);
-
-                            Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
-                            pGather        = FP_TRUNC(pGather, mSimdFP32Ty);
-
-                            vVertexElements[currentVertexElement++] = pGather;
-                        }
-                        else
-                        {
-                            vVertexElements[currentVertexElement++] =
-                                GenerateCompCtrlVector(compCtrl[i]);
-                        }
-
-                        if (currentVertexElement > 3)
-                        {
-                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
-                            // reset to the next vVertexElement to output
-                            currentVertexElement = 0;
-                        }
-                    }
-
-                    // offset base to the next component  in the vertex to gather
-                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)8));
-                }
-            }
-            break;
-            default:
-                SWR_INVALID("Tried to fetch invalid FP format");
-                break;
-            }
-        }
-        else
-        {
-            Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
-            ConversionType       conversionType = CONVERT_NONE;
-
-            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
-                       "Unsupported format for standard gather fetch.");
-
-            switch (info.type[0])
-            {
-            case SWR_TYPE_UNORM:
-                conversionType = CONVERT_NORMALIZED;
-                // intentional fallthrough: UNORM shares the zero-extend cast with UINT
-            case SWR_TYPE_UINT:
-                extendCastType = Instruction::CastOps::ZExt;
-                break;
-            case SWR_TYPE_SNORM:
-                conversionType = CONVERT_NORMALIZED;
-                // intentional fallthrough: SNORM shares the sign-extend cast with SINT
-            case SWR_TYPE_SINT:
-                extendCastType = Instruction::CastOps::SExt;
-                break;
-            case SWR_TYPE_USCALED:
-                conversionType = CONVERT_USCALED;
-                extendCastType = Instruction::CastOps::UIToFP;
-                break;
-            case SWR_TYPE_SSCALED:
-                conversionType = CONVERT_SSCALED;
-                extendCastType = Instruction::CastOps::SIToFP;
-                break;
-            case SWR_TYPE_SFIXED:
-                conversionType = CONVERT_SFIXED;
-                extendCastType = Instruction::CastOps::SExt;
-                break;
-            default:
-                break;
-            }
-
-            // value substituted when component of gather is masked
-            Value* gatherSrc = VIMMED1(0);
-
-            // Gather components from memory to store in a simdvertex structure
-            // (component sizes other than 8, 16 and 32 bits emit nothing here)
-            switch (bpc)
-            {
-            case 8:
-            {
-                // if we have at least one component to fetch
-                if (compMask)
-                {
-                    Value* vGatherResult = GATHERDD(gatherSrc,
-                                                    pStreamBaseGFX,
-                                                    vOffsets,
-                                                    vGatherMask,
-                                                    1,
-                                                    MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
-                    // e.g. result of an 8x32bit integer gather for 8bit components
-                    // 256i - 0    1    2    3    4    5    6    7
-                    //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
-
-                    Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,
-                                                                 pVtxOut,
-                                                                 extendCastType,
-                                                                 conversionType,
-                                                                 currentVertexElement,
-                                                                 outputElt,
-                                                                 compMask,
-                                                                 compCtrl,
-                                                                 vVertexElements,
-                                                                 info.swizzle);
-
-                    // Shuffle gathered components into place in simdvertex struct
-                    mVWidth == 16 ? Shuffle8bpcGatherd16(args)
-                                  : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
-                }
-            }
-            break;
-            case 16:
-            {
-                Value* vGatherResult[2];
-
-                // if we have at least one component out of x or y to fetch
-                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
-                {
-                    vGatherResult[0] = GATHERDD(gatherSrc,
-                                                pStreamBaseGFX,
-                                                vOffsets,
-                                                vGatherMask,
-                                                1,
-                                                MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
-                    // e.g. result of first 8x32bit integer gather for 16bit components
-                    // 256i - 0    1    2    3    4    5    6    7
-                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
-                    //
-                }
-
-                // if we have at least one component out of z or w to fetch
-                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
-                {
-                    // offset base to the next components(zw) in the vertex to gather
-                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
-
-                    vGatherResult[1] = GATHERDD(gatherSrc,
-                                                pStreamBaseGFX,
-                                                vOffsets,
-                                                vGatherMask,
-                                                1,
-                                                MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
-                    // e.g. result of second 8x32bit integer gather for 16bit components
-                    // 256i - 0    1    2    3    4    5    6    7
-                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
-                    //
-                }
-
-                // if we have at least one component to shuffle into place
-                if (compMask)
-                {
-                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
-                                                                  pVtxOut,
-                                                                  extendCastType,
-                                                                  conversionType,
-                                                                  currentVertexElement,
-                                                                  outputElt,
-                                                                  compMask,
-                                                                  compCtrl,
-                                                                  vVertexElements);
-
-                    // Shuffle gathered components into place in simdvertex struct
-                    mVWidth == 16 ? Shuffle16bpcGather16(args)
-                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
-                }
-            }
-            break;
-            case 32:
-            {
-                // Gathered components into place in simdvertex struct
-                for (uint32_t i = 0; i < 4; i++)
-                {
-                    if (isComponentEnabled(compMask, i))
-                    {
-                        // if we need to gather the component
-                        if (compCtrl[i] == StoreSrc)
-                        {
-                            Value* pGather = GATHERDD(gatherSrc,
-                                                      pStreamBaseGFX,
-                                                      vOffsets,
-                                                      vGatherMask,
-                                                      1,
-                                                      MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
-
-                            if (conversionType == CONVERT_USCALED)
-                            {
-                                pGather = UI_TO_FP(pGather, mSimdFP32Ty);
-                            }
-                            else if (conversionType == CONVERT_SSCALED)
-                            {
-                                pGather = SI_TO_FP(pGather, mSimdFP32Ty);
-                            }
-                            else if (conversionType == CONVERT_SFIXED)
-                            {
-                                // scale by 1/65536, i.e. 16 fractional bits
-                                pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),
-                                               VBROADCAST(C(1 / 65536.0f)));
-                            }
-
-                            vVertexElements[currentVertexElement++] = pGather;
-
-                            // e.g. result of a single 8x32bit integer gather for 32bit components
-                            // 256i - 0    1    2    3    4    5    6    7
-                            //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
-                        }
-                        else
-                        {
-                            vVertexElements[currentVertexElement++] =
-                                GenerateCompCtrlVector(compCtrl[i]);
-                        }
-
-                        if (currentVertexElement > 3)
-                        {
-                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
-
-                            // reset to the next vVertexElement to output
-                            currentVertexElement = 0;
-                        }
-                    }
-
-                    // offset base to the next component  in the vertex to gather
-                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
-                }
-            }
-            break;
-            }
-        }
-    }
-
-    // if we have a partially filled vVertexElement struct, output it
-    if (currentVertexElement > 0)
-    {
-        StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
-    }
-}
-
-
-typedef void* (*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va, bool* out_pbNullTileAccessed, void* pWorkerData);
-
-template <typename T>
-void GetSimdValidIndicesGfx(gfxptr_t                     indices,
-                            gfxptr_t                     lastIndex,
-                            uint32_t                     vWidth,
-                            PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
-                            void*                        pdc,
-                            uint32_t*                    outIndices,
-                            void*                        pWorkerData)
-{
-    SWR_ASSERT(outIndices != nullptr);
-
-    gfxptr_t indexPtr = indices;
-    for (int64_t lane = 0; lane < vWidth; lane++)
-    {
-        uint32_t index = 0;
-
-        if (indexPtr < lastIndex)
-        {
-            // translate indexPtr and load from it
-            T* addr = (T*)pfnTranslate(pdc, indexPtr, nullptr, pWorkerData);
-            SWR_ASSERT(addr != nullptr);
-            index = *addr;
-        }
-
-        // index to 32 bits and insert into the correct simd lane
-        outIndices[lane] = index;
-
-        indexPtr += sizeof(T);
-    }
-}
-
-void GetSimdValid8bitIndicesGfx(gfxptr_t                     indices,
-                                gfxptr_t                     lastIndex,
-                                uint32_t                     vWidth,
-                                PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
-                                void*                        pdc,
-                                uint32_t*                    outIndices,
-                                void*                        pWorkerData)
-{
-    GetSimdValidIndicesGfx<uint8_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
-}
-
-void GetSimdValid16bitIndicesGfx(gfxptr_t                     indices,
-                                 gfxptr_t                     lastIndex,
-                                 uint32_t                     vWidth,
-                                 PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
-                                 void*                        pdc,
-                                 uint32_t*                    outIndices,
-                                 void*                        pWorkerData)
-{
-    GetSimdValidIndicesGfx<uint16_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
-}
-
-
-template <typename T>
-Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
-{
-    SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty,
-               "Function expects gfxptr_t for both input parameters.");
-
-    Type* Ty = nullptr;
-
-    static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t),
-                  "Unsupported type for use with GetSimdValidIndicesHelper<T>");
-    constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
-    if (bSize)
-    {
-        Ty = mInt16PtrTy;
-    }
-    else if (sizeof(T) == sizeof(uint8_t))
-    {
-        Ty = mInt8PtrTy;
-    }
-    else
-    {
-        SWR_ASSERT(false, "This should never happen as per static_assert above.");
-    }
-
-    Value* vIndices = VUNDEF_I();
-
-    {
-        // store 0 index on stack to be used to conditionally load from if index address is OOB
-        Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());
-        STORE(C((T)0), pZeroIndex);
-
-        // Load a SIMD of index pointers
-        for (int64_t lane = 0; lane < mVWidth; lane++)
-        {
-            // Calculate the address of the requested index
-            Value* pIndex = GEP(pIndices, C(lane), Ty);
-
-            pLastIndex = INT_TO_PTR(pLastIndex, Ty);
-
-            // check if the address is less than the max index,
-            Value* mask = ICMP_ULT(pIndex, pLastIndex);
-
-            // if valid, load the index. if not, load 0 from the stack
-            Value* pValid = SELECT(mask, pIndex, pZeroIndex);
-            Value* index  = LOAD(pValid, "valid index", Ty, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
-
-            // zero extended index to 32 bits and insert into the correct simd lane
-            index    = Z_EXT(index, mInt32Ty);
-            vIndices = VINSERT(vIndices, index, lane);
-        }
-    }
-
-    return vIndices;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Loads a simd of valid indices. OOB indices are set to 0
-/// *Note* have to do 8bit index checking in scalar until we have AVX-512
-/// support
-/// @param pIndices - pointer to 8 bit indices
-/// @param pLastIndex - pointer to last valid index
-Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
-{
-    return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Loads a simd of valid indices. OOB indices are set to 0
-/// *Note* have to do 16bit index checking in scalar until we have AVX-512
-/// support
-/// @param pIndices - pointer to 16 bit indices
-/// @param pLastIndex - pointer to last valid index
-Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
-{
-    return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Loads a simd of valid indices. OOB indices are set to 0
-/// @param pIndices - pointer to 32 bit indices
-/// @param pLastIndex - pointer to last valid index
-Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
-{
-    DataLayout dL(JM()->mpCurrentModule);
-    Value*     iLastIndex = pLastIndex;
-    Value*     iIndices   = pIndices;
-
-    // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
-    Value* numIndicesLeft = SUB(iLastIndex, iIndices);
-    numIndicesLeft        = TRUNC(numIndicesLeft, mInt32Ty);
-    numIndicesLeft        = SDIV(numIndicesLeft, C(4));
-
-    // create a vector of index counts from the base index ptr passed into the fetch
-    Constant* vIndexOffsets;
-    if (mVWidth == 8)
-    {
-        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});
-    }
-    else
-    {
-        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
-    }
-
-    // compare index count to the max valid index
-    // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
-    //     vIndexOffsets  0 1 2 3 4 5 6 7
-    //     ------------------------------
-    //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
-    //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
-    Value* vMaxIndex  = VBROADCAST(numIndicesLeft);
-    Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
-
-    // Load the indices; OOB loads 0
-    return MASKED_LOAD(pIndices,
-                       4,
-                       vIndexMask,
-                       VIMMED1(0),
-                       "vIndices",
-                       PointerType::get(mSimdInt32Ty, 0),
-                       MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
-/// denormalizes if needed, converts to F32 if needed, and positions in
-//  the proper SIMD rows to be output to the simdvertex structure
-/// @param args: (tuple of args, listed below)
-///   @param vGatherResult - 8 gathered 8bpc vertices
-///   @param pVtxOut - base pointer to output simdvertex struct
-///   @param extendType - sign extend or zero extend
-///   @param bNormalized - do we need to denormalize?
-///   @param currentVertexElement - reference to the current vVertexElement
-///   @param outputElt - reference to the current offset from simdvertex we're o
-///   @param compMask - component packing mask
-///   @param compCtrl - component control val
-///   @param vVertexElements[4] - vertex components to output
-///   @param swizzle[4] - component swizzle location
-void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
-{
-    // Unpack tuple args
-    Value*&                    vGatherResult        = std::get<0>(args);
-    Value*                     pVtxOut              = std::get<1>(args);
-    const Instruction::CastOps extendType           = std::get<2>(args);
-    const ConversionType       conversionType       = std::get<3>(args);
-    uint32_t&                  currentVertexElement = std::get<4>(args);
-    uint32_t&                  outputElt            = std::get<5>(args);
-    const ComponentEnable      compMask             = std::get<6>(args);
-    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
-    Value*(&vVertexElements)[4]                     = std::get<8>(args);
-    const uint32_t(&swizzle)[4]                     = std::get<9>(args);
-
-    // cast types
-    Type* vGatherTy = getVectorType(mInt32Ty, 8);
-    Type* v32x8Ty   = getVectorType(mInt8Ty, 32);
-
-    // have to do extra work for sign extending
-    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
-    {
-        Type* v16x8Ty = getVectorType(mInt8Ty, 16); // 8x16bit ints in a 128bit lane
-        Type* v128Ty  = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2);
-
-        // shuffle mask, including any swizzling
-        const char x          = (char)swizzle[0];
-        const char y          = (char)swizzle[1];
-        const char z          = (char)swizzle[2];
-        const char w          = (char)swizzle[3];
-        Value*     vConstMask = C<char>(
-            {char(x),     char(x + 4),  char(x + 8), char(x + 12), char(y),     char(y + 4),
-             char(y + 8), char(y + 12), char(z),     char(z + 4),  char(z + 8), char(z + 12),
-             char(w),     char(w + 4),  char(w + 8), char(w + 12), char(x),     char(x + 4),
-             char(x + 8), char(x + 12), char(y),     char(y + 4),  char(y + 8), char(y + 12),
-             char(z),     char(z + 4),  char(z + 8), char(z + 12), char(w),     char(w + 4),
-             char(w + 8), char(w + 12)});
-
-        // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
-
-        Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
-        Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
-
-        Value* vShufResult_lo =
-            BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
-        Value* vShufResult_hi =
-            BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
-
-        // after pshufb: group components together in each 128bit lane
-        // 256i - 0    1    2    3    4    5    6    7
-        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
-
-        Value* vi128XY_lo = nullptr;
-        Value* vi128XY_hi = nullptr;
-        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
-        {
-            vi128XY_lo = BITCAST(
-                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
-                v128Ty);
-            vi128XY_hi = BITCAST(
-                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
-                v128Ty);
-
-            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
-            // 256i - 0    1    2    3    4    5    6    7
-            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
-        }
-
-        // do the same for zw components
-        Value* vi128ZW_lo = nullptr;
-        Value* vi128ZW_hi = nullptr;
-        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
-        {
-            vi128ZW_lo = BITCAST(
-                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
-                v128Ty);
-            vi128ZW_hi = BITCAST(
-                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
-                v128Ty);
-        }
-
-        // init denormalize variables if needed
-        Instruction::CastOps fpCast;
-        Value*               conversionFactor;
-
-        switch (conversionType)
-        {
-        case CONVERT_NORMALIZED:
-            fpCast           = Instruction::CastOps::SIToFP;
-            conversionFactor = VIMMED1((float)(1.0 / 127.0));
-            break;
-        case CONVERT_SSCALED:
-            fpCast           = Instruction::CastOps::SIToFP;
-            conversionFactor = VIMMED1((float)(1.0));
-            break;
-        case CONVERT_USCALED:
-            assert(false && "Type should not be sign extended!");
-            conversionFactor = nullptr;
-            break;
-        default:
-            assert(conversionType == CONVERT_NONE);
-            conversionFactor = nullptr;
-            break;
-        }
-
-        // sign extend all enabled components. If we have a fill vVertexElements, output to current
-        // simdvertex
-        for (uint32_t i = 0; i < 4; i++)
-        {
-            if (isComponentEnabled(compMask, i))
-            {
-                if (compCtrl[i] == ComponentControl::StoreSrc)
-                {
-                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
-                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
-                    // if x or y, use vi128XY permute result, else use vi128ZW
-                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
-                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
-
-                    // sign extend
-                    Value* temp_lo =
-                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
-                    Value* temp_hi =
-                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
-
-                    Value* temp = JOIN_16(temp_lo, temp_hi);
-
-                    // denormalize if needed
-                    if (conversionType != CONVERT_NONE)
-                    {
-                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
-                    }
-
-                    vVertexElements[currentVertexElement] = temp;
-
-                    currentVertexElement += 1;
-                }
-                else
-                {
-                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
-                }
-
-                if (currentVertexElement > 3)
-                {
-                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
-                    // reset to the next vVertexElement to output
-                    currentVertexElement = 0;
-                }
-            }
-        }
-    }
-    // else zero extend
-    else if ((extendType == Instruction::CastOps::ZExt) ||
-             (extendType == Instruction::CastOps::UIToFP))
-    {
-        // init denormalize variables if needed
-        Instruction::CastOps fpCast;
-        Value*               conversionFactor;
-
-        switch (conversionType)
-        {
-        case CONVERT_NORMALIZED:
-            fpCast           = Instruction::CastOps::UIToFP;
-            conversionFactor = VIMMED1((float)(1.0 / 255.0));
-            break;
-        case CONVERT_USCALED:
-            fpCast           = Instruction::CastOps::UIToFP;
-            conversionFactor = VIMMED1((float)(1.0));
-            break;
-        case CONVERT_SSCALED:
-            assert(false && "Type should not be zero extended!");
-            conversionFactor = nullptr;
-            break;
-        default:
-            assert(conversionType == CONVERT_NONE);
-            conversionFactor = nullptr;
-            break;
-        }
-
-        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
-        for (uint32_t i = 0; i < 4; i++)
-        {
-            if (isComponentEnabled(compMask, i))
-            {
-                if (compCtrl[i] == ComponentControl::StoreSrc)
-                {
-                    // pshufb masks for each component
-                    Value* vConstMask;
-                    switch (swizzle[i])
-                    {
-                    case 0:
-                        // x shuffle mask
-                        vConstMask =
-                            C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
-                                     0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
-                        break;
-                    case 1:
-                        // y shuffle mask
-                        vConstMask =
-                            C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
-                                     1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
-                        break;
-                    case 2:
-                        // z shuffle mask
-                        vConstMask =
-                            C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
-                                     2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
-                        break;
-                    case 3:
-                        // w shuffle mask
-                        vConstMask =
-                            C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
-                                     3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
-                        break;
-                    default:
-                        assert(false && "Invalid component");
-                        vConstMask = nullptr;
-                        break;
-                    }
-
-                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
-                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
-
-                    Value* temp_lo =
-                        BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
-                    Value* temp_hi =
-                        BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
-
-                    // after pshufb for x channel
-                    // 256i - 0    1    2    3    4    5    6    7
-                    //        x000 x000 x000 x000 x000 x000 x000 x000
-
-                    Value* temp = JOIN_16(temp_lo, temp_hi);
-
-                    // denormalize if needed
-                    if (conversionType != CONVERT_NONE)
-                    {
-                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
-                    }
-
-                    vVertexElements[currentVertexElement] = temp;
-
-                    currentVertexElement += 1;
-                }
-                else
-                {
-                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
-                }
-
-                if (currentVertexElement > 3)
-                {
-                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
-                    // reset to the next vVertexElement to output
-                    currentVertexElement = 0;
-                }
-            }
-        }
-    }
-    else
-    {
-        SWR_INVALID("Unsupported conversion type");
-    }
-}
-
-void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)
-{
-    // Unpack tuple args
-    Value*&                    vGatherResult        = std::get<0>(args);
-    Value*                     pVtxOut              = std::get<1>(args);
-    const Instruction::CastOps extendType           = std::get<2>(args);
-    const ConversionType       conversionType       = std::get<3>(args);
-    uint32_t&                  currentVertexElement = std::get<4>(args);
-    uint32_t&                  outputElt            = std::get<5>(args);
-    const ComponentEnable      compMask             = std::get<6>(args);
-    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
-    Value*(&vVertexElements)[4]                     = std::get<8>(args);
-    const uint32_t(&swizzle)[4]                     = std::get<9>(args);
-
-    // cast types
-    Type* v32x8Ty = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
-
-    for (uint32_t i = 0; i < 4; i++)
-    {
-        if (!isComponentEnabled(compMask, i))
-            continue;
-
-        if (compCtrl[i] == ComponentControl::StoreSrc)
-        {
-#if LLVM_VERSION_MAJOR >= 11
-            using MaskType = int32_t;
-#else
-            using MaskType = uint32_t;
-#endif
-            std::vector<MaskType> vShuffleMasks[4] = {
-                {0, 4, 8, 12, 16, 20, 24, 28},  // x
-                {1, 5, 9, 13, 17, 21, 25, 29},  // y
-                {2, 6, 10, 14, 18, 22, 26, 30}, // z
-                {3, 7, 11, 15, 19, 23, 27, 31}, // w
-            };
-
-            Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
-                                  UndefValue::get(v32x8Ty),
-                                  vShuffleMasks[swizzle[i]]);
-
-            if ((extendType == Instruction::CastOps::SExt) ||
-                (extendType == Instruction::CastOps::SIToFP))
-            {
-                switch (conversionType)
-                {
-                case CONVERT_NORMALIZED:
-                    val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
-                    break;
-                case CONVERT_SSCALED:
-                    val = SI_TO_FP(val, mSimdFP32Ty);
-                    break;
-                case CONVERT_USCALED:
-                    SWR_INVALID("Type should not be sign extended!");
-                    break;
-                default:
-                    SWR_ASSERT(conversionType == CONVERT_NONE);
-                    val = S_EXT(val, mSimdInt32Ty);
-                    break;
-                }
-            }
-            else if ((extendType == Instruction::CastOps::ZExt) ||
-                     (extendType == Instruction::CastOps::UIToFP))
-            {
-                switch (conversionType)
-                {
-                case CONVERT_NORMALIZED:
-                    val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
-                    break;
-                case CONVERT_SSCALED:
-                    SWR_INVALID("Type should not be zero extended!");
-                    break;
-                case CONVERT_USCALED:
-                    val = UI_TO_FP(val, mSimdFP32Ty);
-                    break;
-                default:
-                    SWR_ASSERT(conversionType == CONVERT_NONE);
-                    val = Z_EXT(val, mSimdInt32Ty);
-                    break;
-                }
-            }
-            else
-            {
-                SWR_INVALID("Unsupported conversion type");
-            }
-
-            vVertexElements[currentVertexElement++] = val;
-        }
-        else
-        {
-            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
-        }
-
-        if (currentVertexElement > 3)
-        {
-            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
-            // reset to the next vVertexElement to output
-            currentVertexElement = 0;
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
-/// denormalizes if needed, converts to F32 if needed, and positions in
-//  the proper SIMD rows to be output to the simdvertex structure
-/// @param args: (tuple of args, listed below)
-///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
-///   @param pVtxOut - base pointer to output simdvertex struct
-///   @param extendType - sign extend or zero extend
-///   @param bNormalized - do we need to denormalize?
-///   @param currentVertexElement - reference to the current vVertexElement
-///   @param outputElt - reference to the current offset from simdvertex we're o
-///   @param compMask - component packing mask
-///   @param compCtrl - component control val
-///   @param vVertexElements[4] - vertex components to output
-void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)
-{
-    // Unpack tuple args
-    Value*(&vGatherResult)[2]                       = std::get<0>(args);
-    Value*                     pVtxOut              = std::get<1>(args);
-    const Instruction::CastOps extendType           = std::get<2>(args);
-    const ConversionType       conversionType       = std::get<3>(args);
-    uint32_t&                  currentVertexElement = std::get<4>(args);
-    uint32_t&                  outputElt            = std::get<5>(args);
-    const ComponentEnable      compMask             = std::get<6>(args);
-    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
-    Value*(&vVertexElements)[4]                     = std::get<8>(args);
-
-    // cast types
-    Type* vGatherTy = getVectorType(mInt32Ty, 8);
-    Type* v32x8Ty   = getVectorType(mInt8Ty, 32);
-
-    // have to do extra work for sign extending
-    if ((extendType == Instruction::CastOps::SExt) ||
-        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
-    {
-        // is this PP float?
-        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
-
-        Type* v8x16Ty   = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane
-        Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2);
-
-        // shuffle mask
-        Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
-                                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
-        Value* vi128XY_lo = nullptr;
-        Value* vi128XY_hi = nullptr;
-        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
-        {
-            // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for
-            // now..
-
-            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
-            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
-
-            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
-            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
-
-            // after pshufb: group components together in each 128bit lane
-            // 256i - 0    1    2    3    4    5    6    7
-            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
-
-            vi128XY_lo = BITCAST(
-                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
-                v128bitTy);
-            vi128XY_hi = BITCAST(
-                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
-                v128bitTy);
-
-            // after PERMD: move and pack xy components into each 128bit lane
-            // 256i - 0    1    2    3    4    5    6    7
-            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
-        }
-
-        // do the same for zw components
-        Value* vi128ZW_lo = nullptr;
-        Value* vi128ZW_hi = nullptr;
-        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
-        {
-            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
-            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
-
-            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
-            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
-
-            vi128ZW_lo = BITCAST(
-                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
-                v128bitTy);
-            vi128ZW_hi = BITCAST(
-                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
-                v128bitTy);
-        }
-
-        // init denormalize variables if needed
-        Instruction::CastOps IntToFpCast;
-        Value*               conversionFactor;
-
-        switch (conversionType)
-        {
-        case CONVERT_NORMALIZED:
-            IntToFpCast      = Instruction::CastOps::SIToFP;
-            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
-            break;
-        case CONVERT_SSCALED:
-            IntToFpCast      = Instruction::CastOps::SIToFP;
-            conversionFactor = VIMMED1((float)(1.0));
-            break;
-        case CONVERT_USCALED:
-            assert(false && "Type should not be sign extended!");
-            conversionFactor = nullptr;
-            break;
-        default:
-            assert(conversionType == CONVERT_NONE);
-            conversionFactor = nullptr;
-            break;
-        }
-
-        // sign extend all enabled components. If we have a fill vVertexElements, output to current
-        // simdvertex
-        for (uint32_t i = 0; i < 4; i++)
-        {
-            if (isComponentEnabled(compMask, i))
-            {
-                if (compCtrl[i] == ComponentControl::StoreSrc)
-                {
-                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
-                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
-                    // if x or y, use vi128XY permute result, else use vi128ZW
-                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
-                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
-
-                    if (bFP)
-                    {
-                        // extract 128 bit lanes to sign extend each component
-                        Value* temp_lo =
-                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
-                        Value* temp_hi =
-                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
-
-                        vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
-                    }
-                    else
-                    {
-                        // extract 128 bit lanes to sign extend each component
-                        Value* temp_lo =
-                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
-                        Value* temp_hi =
-                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
-
-                        Value* temp = JOIN_16(temp_lo, temp_hi);
-
-                        // denormalize if needed
-                        if (conversionType != CONVERT_NONE)
-                        {
-                            temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
-                        }
-
-                        vVertexElements[currentVertexElement] = temp;
-                    }
-
-                    currentVertexElement += 1;
-                }
-                else
-                {
-                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
-                }
-
-                if (currentVertexElement > 3)
-                {
-                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
-                    // reset to the next vVertexElement to output
-                    currentVertexElement = 0;
-                }
-            }
-        }
-    }
-    // else zero extend
-    else if ((extendType == Instruction::CastOps::ZExt) ||
-             (extendType == Instruction::CastOps::UIToFP))
-    {
-        // pshufb masks for each component
-        Value* vConstMask[2];
-
-        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
-        {
-            // x/z shuffle mask
-            vConstMask[0] = C<char>({
-                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
-                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
-            });
-        }
-
-        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
-        {
-            // y/w shuffle mask
-            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
-                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
-        }
-
-        // init denormalize variables if needed
-        Instruction::CastOps fpCast;
-        Value*               conversionFactor;
-
-        switch (conversionType)
-        {
-        case CONVERT_NORMALIZED:
-            fpCast           = Instruction::CastOps::UIToFP;
-            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
-            break;
-        case CONVERT_USCALED:
-            fpCast           = Instruction::CastOps::UIToFP;
-            conversionFactor = VIMMED1((float)(1.0f));
-            break;
-        case CONVERT_SSCALED:
-            SWR_INVALID("Type should not be zero extended!");
-            conversionFactor = nullptr;
-            break;
-        default:
-            SWR_ASSERT(conversionType == CONVERT_NONE);
-            conversionFactor = nullptr;
-            break;
-        }
-
-        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
-        for (uint32_t i = 0; i < 4; i++)
-        {
-            if (isComponentEnabled(compMask, i))
-            {
-                if (compCtrl[i] == ComponentControl::StoreSrc)
-                {
-                    // select correct constMask for x/z or y/w pshufb
-                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
-                    // if x or y, use vi128XY permute result, else use vi128ZW
-                    uint32_t selectedGather = (i < 2) ? 0 : 1;
-
-                    // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL,
-                    // for now..
-
-                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
-                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
-
-                    Value* temp_lo = BITCAST(
-                        PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
-                        vGatherTy);
-                    Value* temp_hi = BITCAST(
-                        PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
-                        vGatherTy);
-
-                    // after pshufb mask for x channel; z uses the same shuffle from the second
-                    // gather 256i - 0    1    2    3    4    5    6    7
-                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
-
-                    Value* temp = JOIN_16(temp_lo, temp_hi);
-
-                    // denormalize if needed
-                    if (conversionType != CONVERT_NONE)
-                    {
-                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
-                    }
-
-                    vVertexElements[currentVertexElement] = temp;
-
-                    currentVertexElement += 1;
-                }
-                else
-                {
-                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
-                }
-
-                if (currentVertexElement > 3)
-                {
-                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
-                    // reset to the next vVertexElement to output
-                    currentVertexElement = 0;
-                }
-            }
-        }
-    }
-    else
-    {
-        SWR_INVALID("Unsupported conversion type");
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Shuffles gathered 16bpc data into one SIMD register per enabled
-///        component, extends each 16-bit value to 32 bits (sign extend,
-///        half-float convert or zero extend, depending on the extend type),
-///        optionally converts/denormalizes to float, and stores every
-///        completed group of 4 components with StoreVertexElements.
-/// @param args - tuple of, in order: gather results ([0] = xy, [1] = zw),
-///        output pointer, extend type, conversion type, current vertex
-///        element index (in/out), current output element index (in/out),
-///        component enable mask, per-component controls and the
-///        vVertexElements staging array
-void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
-{
-    // Unpack tuple args
-    Value*(&vGatherResult)[2]                       = std::get<0>(args);
-    Value*                     pVtxOut              = std::get<1>(args);
-    const Instruction::CastOps extendType           = std::get<2>(args);
-    const ConversionType       conversionType       = std::get<3>(args);
-    uint32_t&                  currentVertexElement = std::get<4>(args);
-    uint32_t&                  outputElt            = std::get<5>(args);
-    const ComponentEnable      compMask             = std::get<6>(args);
-    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
-    Value*(&vVertexElements)[4]                     = std::get<8>(args);
-
-    // cast types
-    Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
-    Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
-
-    // have to do extra work for sign extending
-    if ((extendType == Instruction::CastOps::SExt) ||
-        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
-    {
-        // is this PP float?
-        const bool bFP = (extendType == Instruction::CastOps::FPExt);
-
-        Type* v8x16Ty   = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane
-        Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
-                                          mVWidth / 4); // vwidth is units of 32 bits
-
-        // shuffle mask
-        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
-                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
-        Value* vi128XY    = nullptr;
-        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
-        {
-            Value* vShufResult =
-                BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
-            // after pshufb: group components together in each 128bit lane
-            // 256i - 0    1    2    3    4    5    6    7
-            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
-
-            vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
-            // after PERMD: move and pack xy components into each 128bit lane
-            // 256i - 0    1    2    3    4    5    6    7
-            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
-        }
-
-        // do the same for zw components
-        Value* vi128ZW = nullptr;
-        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
-        {
-            Value* vShufResult =
-                BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
-            vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
-        }
-
-        // init denormalize variables if needed
-        // Both are given defined values up front: the invalid/default cases below leave
-        // conversionFactor null, and the denormalize step keys off that instead of reading
-        // an indeterminate cast op.
-        Instruction::CastOps IntToFpCast      = Instruction::CastOps::SIToFP;
-        Value*               conversionFactor = nullptr;
-
-        switch (conversionType)
-        {
-        case CONVERT_NORMALIZED:
-            IntToFpCast      = Instruction::CastOps::SIToFP;
-            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
-            break;
-        case CONVERT_SSCALED:
-            IntToFpCast      = Instruction::CastOps::SIToFP;
-            conversionFactor = VIMMED1((float)(1.0));
-            break;
-        case CONVERT_USCALED:
-            SWR_INVALID("Type should not be sign extended!");
-            conversionFactor = nullptr;
-            break;
-        default:
-            SWR_ASSERT(conversionType == CONVERT_NONE);
-            conversionFactor = nullptr;
-            break;
-        }
-
-        // sign extend all enabled components. If we have a full vVertexElements, output to current
-        // simdvertex
-        for (uint32_t i = 0; i < 4; i++)
-        {
-            if (isComponentEnabled(compMask, i))
-            {
-                if (compCtrl[i] == ComponentControl::StoreSrc)
-                {
-                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
-                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
-                    // if x or y, use vi128XY permute result, else use vi128ZW
-                    Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
-
-                    if (bFP)
-                    {
-                        // extract 128 bit lanes and convert each half-float component to float
-                        vVertexElements[currentVertexElement] =
-                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
-                    }
-                    else
-                    {
-                        // extract 128 bit lanes to sign extend each component
-                        vVertexElements[currentVertexElement] =
-                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
-
-                        // denormalize if needed (a null factor means no valid conversion was set up)
-                        if (conversionFactor != nullptr)
-                        {
-                            vVertexElements[currentVertexElement] =
-                                FMUL(CAST(IntToFpCast,
-                                          vVertexElements[currentVertexElement],
-                                          mSimdFP32Ty),
-                                     conversionFactor);
-                        }
-                    }
-                    currentVertexElement++;
-                }
-                else
-                {
-                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
-                }
-
-                if (currentVertexElement > 3)
-                {
-                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
-                    // reset to the next vVertexElement to output
-                    currentVertexElement = 0;
-                }
-            }
-        }
-    }
-    // else zero extend
-    else if ((extendType == Instruction::CastOps::ZExt) ||
-             (extendType == Instruction::CastOps::UIToFP))
-    {
-        // pshufb masks for each component; an entry stays null when neither of its
-        // components is enabled (it is never read in that case)
-        Value* vConstMask[2] = {nullptr, nullptr};
-        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
-        {
-            // x/z shuffle mask
-            vConstMask[0] = C<char>({
-                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
-                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
-            });
-        }
-
-        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
-        {
-            // y/w shuffle mask
-            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
-                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
-        }
-
-        // init denormalize variables if needed
-        // Defined defaults for the same reason as in the sign-extend path: the denormalize
-        // step below is skipped whenever conversionFactor is null.
-        Instruction::CastOps fpCast           = Instruction::CastOps::UIToFP;
-        Value*               conversionFactor = nullptr;
-
-        switch (conversionType)
-        {
-        case CONVERT_NORMALIZED:
-            fpCast           = Instruction::CastOps::UIToFP;
-            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
-            break;
-        case CONVERT_USCALED:
-            fpCast           = Instruction::CastOps::UIToFP;
-            conversionFactor = VIMMED1((float)(1.0f));
-            break;
-        case CONVERT_SSCALED:
-            SWR_INVALID("Type should not be zero extended!");
-            conversionFactor = nullptr;
-            break;
-        default:
-            SWR_ASSERT(conversionType == CONVERT_NONE);
-            conversionFactor = nullptr;
-            break;
-        }
-
-        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
-        for (uint32_t i = 0; i < 4; i++)
-        {
-            if (isComponentEnabled(compMask, i))
-            {
-                if (compCtrl[i] == ComponentControl::StoreSrc)
-                {
-                    // select correct constMask for x/z or y/w pshufb
-                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
-                    // if x or y, use the first gather result, else use the second
-                    uint32_t selectedGather = (i < 2) ? 0 : 1;
-
-                    vVertexElements[currentVertexElement] =
-                        BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
-                                       vConstMask[selectedMask]),
-                                vGatherTy);
-                    // after pshufb mask for x channel; z uses the same shuffle from the second
-                    // gather 256i - 0    1    2    3    4    5    6    7
-                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
-
-                    // denormalize if needed (a null factor means no valid conversion was set up)
-                    if (conversionFactor != nullptr)
-                    {
-                        vVertexElements[currentVertexElement] =
-                            FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
-                                 conversionFactor);
-                    }
-                    currentVertexElement++;
-                }
-                else
-                {
-                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
-                }
-
-                if (currentVertexElement > 3)
-                {
-                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
-                    // reset to the next vVertexElement to output
-                    currentVertexElement = 0;
-                }
-            }
-        }
-    }
-    else
-    {
-        SWR_INVALID("Unsupported conversion type");
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Writes up to four rows of the current simdvertex to the output.
-///        Rows that are not already float vectors are bitcast to
-///        mSimdFP32Ty in place before being stored.
-/// @param pVtxOut - base address of VIN output struct
-/// @param outputElt - simdvertex offset in VIN to write to
-/// @param numEltsToStore - number of simdvertex rows to write out (at most 4)
-/// @param vVertexElements - LLVM Value*[] simdvertex to write out
-void FetchJit::StoreVertexElements(Value*         pVtxOut,
-                                   const uint32_t outputElt,
-                                   const uint32_t numEltsToStore,
-                                   Value* (&vVertexElements)[4])
-{
-    SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
-
-    // a simdvertex is 4 rows, so outputElt * 4 is the first row of the target vertex
-    const uint32_t firstRow = outputElt * 4;
-
-    for (uint32_t row = 0; row != numEltsToStore; ++row)
-    {
-        Value*&    pElement = vVertexElements[row];
-        const bool bIsFloat = pElement->getType()->getScalarType()->isFloatTy();
-
-#if FETCH_DUMP_VERTEX
-        if (bIsFloat)
-        {
-            PRINT("vVertexElements[%d]: %f\n", {C(row), pElement});
-        }
-        else
-        {
-            PRINT("vVertexElements[%d]: 0x%x\n", {C(row), pElement});
-        }
-#endif
-        // STORE expects FP32 x vWidth type, just bitcast if needed
-        if (!bIsFloat)
-        {
-            pElement = BITCAST(pElement, mSimdFP32Ty);
-        }
-
-        // firstRow + row addresses one 32bit x vWidth row within the current vertex
-        Value* pDest = GEP(pVtxOut, C(firstRow + row), nullptr, "destGEP");
-        STORE(pElement, pDest);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Builds the vector that a non-source ComponentControl stands for:
-///        undef, constant 0 / 1.0f / 1, the vertex ID or the instance ID.
-///        StoreSrc (and any unknown value) is rejected as invalid.
-/// @param ctrl - ComponentControl value
-Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
-{
-    switch (ctrl)
-    {
-    case NoStore:
-        return VUNDEF_I();
-
-    case Store0:
-        return VIMMED1(0);
-
-    case Store1Fp:
-        return VIMMED1(1.0f);
-
-    case Store1Int:
-        return VIMMED1(1);
-
-    case StoreVertexId:
-    {
-        Value* pVertexId = LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
-        if (mVWidth != 16)
-        {
-            return BITCAST(pVertexId, mSimdFP32Ty);
-        }
-
-        // SIMD16: the IDs come as two 8-wide halves (VertexID and VertexID2) joined together
-        Type*  pHalfFPTy = getVectorType(mFP32Ty, 8);
-        Value* pLoHalf   = BITCAST(pVertexId, pHalfFPTy);
-        Value* pHiHalf =
-            BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pHalfFPTy);
-        return JOIN_16(pLoHalf, pHiHalf);
-    }
-
-    case StoreInstanceId:
-    {
-        // scalar instance ID, reinterpreted as float and splatted across the SIMD
-        Value* pCurInstance = LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}));
-        return VBROADCAST(BITCAST(pCurInstance, mFP32Ty));
-    }
-
-    case StoreSrc:
-    default:
-        SWR_INVALID("Invalid component control");
-        return VUNDEF_I();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Returns whether the specified component is set in the enable mask.
-/// @param enableMask - enable bits
-/// @param component - component to check (0 = X, 1 = Y, 2 = Z, 3 = W);
-///        any other index is reported as not enabled.
-bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
-{
-    // single-component mask for each component index
-    static const ComponentEnable kComponentBit[4] = {
-        ComponentEnable::X, ComponentEnable::Y, ComponentEnable::Z, ComponentEnable::W};
-
-    if (component >= 4)
-    {
-        return false;
-    }
-
-    return (enableMask & kComponentBit[component]) != 0;
-}
-
-// Serializes fetch shader codegen: two threads compiling the same fetch shader
-// simultaneously causes problems in the JIT cache implementation.
-// This is only a problem for fetch right now.
-static std::mutex gFetchCodegenMutex;
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief JITs from fetch shader IR
-/// @param hJitMgr - JitManager handle
-/// @param hFunc   - handle to the LLVM function IR (an llvm::Function*)
-/// @return PFN_FETCH_FUNC - pointer to fetch code
-PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
-{
-    const llvm::Function* func    = (const llvm::Function*)hFunc;
-    JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
-
-    // Held until return; the guard releases the mutex on every exit path.
-    std::lock_guard<std::mutex> lock(gFetchCodegenMutex);
-
-    PFN_FETCH_FUNC pfnFetch =
-        (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
-    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
-    // add new IR to the module
-    pJitMgr->mIsModuleFinalized = true;
-
-#if defined(KNOB_SWRC_TRACING)
-    // Build the file name from a std::string copy: StringRef::data() is not guaranteed to be
-    // NUL-terminated, and a std::string has no fixed-size buffer to overflow.
-    const std::string fName = func->getName().str() + ".bin";
-    FILE*             fd    = fopen(fName.c_str(), "wb");
-    if (fd != nullptr)
-    {
-        // dumps a fixed 2048 bytes starting at the function address
-        fwrite((void*)pfnFetch, 1, 2048, fd);
-        fclose(fd);
-    }
-#endif
-
-    pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
-
-    return pfnFetch;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Builds the fetch shader IR for the given state in a fresh module
-///        and JITs it.
-/// @param hJitMgr - JitManager handle
-/// @param state   - fetch state to build function from
-/// @return pointer to the compiled fetch function (result of JitFetchFunc)
-extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
-{
-    JitManager* const pManager = reinterpret_cast<JitManager*>(hJitMgr);
-    pManager->SetupNewModule();
-
-    // generate the IR, then hand the resulting function over for compilation
-    FetchJit     fetchJit(pManager);
-    const HANDLE hFetchFunc = fetchJit.Create(state);
-    return JitFetchFunc(hJitMgr, hFetchFunc);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
deleted file mode 100644
index 9c4c6672184..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file fetch_jit.h
- *
- * @brief Definition of the fetch jitter
- *
- * Notes:
- *
- ******************************************************************************/
-#pragma once
-
-#include "common/formats.h"
-#include "core/state.h"
-
-//////////////////////////////////////////////////////////////////////////
-/// INPUT_ELEMENT_DESC - element type of FETCH_COMPILE_STATE::layout; packed bitfields aliased by 'bits'
-//////////////////////////////////////////////////////////////////////////
-struct INPUT_ELEMENT_DESC
-{
-    union
-    {
-        struct
-        {
-            uint32_t AlignedByteOffset : 12;
-            uint32_t Format : 10;
-            uint32_t StreamIndex : 6;
-            uint32_t InstanceEnable : 1;
-            uint32_t InstanceStrideEnable : 1;
-            uint32_t ComponentControl0 : 4; // presumably holds a ComponentControl value for x - confirm
-            uint32_t ComponentControl1 : 4;
-            uint32_t ComponentControl2 : 4;
-            uint32_t ComponentControl3 : 4;
-            uint32_t ComponentPacking : 4; // set from ComponentEnable
-            uint32_t _reserved : 14; // brings the declared widths to 64 bits (12+10+6+1+1+4*5+14)
-        };
-        uint64_t bits; // all bitfields as one value; compared by FETCH_COMPILE_STATE::operator==
-    };
-    uint32_t InstanceAdvancementState; // operator== compares this only if an Instance*Enable bit is set
-};
-
-// Bitmask of enabled components, used to set ComponentPacking.
-// X, Y, Z and W each own one bit; every other enumerator is the OR of those bits.
-enum ComponentEnable
-{
-    NONE = 0x0,
-    X    = 0x1,
-    Y    = 0x2,
-    XY   = X | Y,
-    Z    = 0x4,
-    XZ   = X | Z,
-    YZ   = Y | Z,
-    XYZ  = X | Y | Z,
-    W    = 0x8,
-    XW   = X | W,
-    YW   = Y | W,
-    XYW  = X | Y | W,
-    ZW   = Z | W,
-    XZW  = X | Z | W,
-    YZW  = Y | Z | W,
-    XYZW = X | Y | Z | W,
-};
-
-// Per-component control; enumerators are numbered sequentially from 0.
-enum ComponentControl
-{
-    NoStore = 0,     // 0
-    StoreSrc,        // 1
-    Store0,          // 2
-    Store1Fp,        // 3
-    Store1Int,       // 4
-    StoreVertexId,   // 5
-    StoreInstanceId, // 6
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// State required for fetch shader jit compile.
-///
-/// Every member has a defined value after construction, so operator==
-/// never reads indeterminate data (including the reserved bits of each
-/// layout entry, which are part of INPUT_ELEMENT_DESC::bits).
-//////////////////////////////////////////////////////////////////////////
-struct FETCH_COMPILE_STATE
-{
-    uint32_t           numAttribs{0};
-    INPUT_ELEMENT_DESC layout[SWR_VTX_NUM_SLOTS]{}; // zeroed; only the first numAttribs are compared
-    SWR_FORMAT         indexType{};                 // value-initialized; compared by operator==
-    uint32_t           cutIndex{0xffffffff};
-
-    // Options that affect the JIT'd code
-    bool bDisableIndexOOBCheck;        // If enabled, FetchJit will exclude index OOB check
-    bool bEnableCutIndex{false};       // Compares indices with the cut index and returns a cut mask
-    bool bVertexIDOffsetEnable{false}; // Offset vertexID by StartVertex for non-indexed draws or
-                                       // BaseVertex for indexed draws
-    bool bPartialVertexBuffer{
-        false}; // for indexed draws, map illegal indices to a known resident vertex
-
-    bool bForceSequentialAccessEnable{false};
-    bool bInstanceIDOffsetEnable{false};
-
-    /// @param disableIndexOOBCheck - initial value of bDisableIndexOOBCheck
-    FETCH_COMPILE_STATE(bool disableIndexOOBCheck = false) :
-        bDisableIndexOOBCheck(disableIndexOOBCheck)
-    {
-    }
-
-    /// @brief Two states are equal when all scalar options match and the first
-    ///        numAttribs layout entries match. InstanceAdvancementState is only
-    ///        taken into account for entries with InstanceEnable or
-    ///        InstanceStrideEnable set.
-    bool operator==(const FETCH_COMPILE_STATE& other) const
-    {
-        if (numAttribs != other.numAttribs)
-            return false;
-        if (indexType != other.indexType)
-            return false;
-        if (bDisableIndexOOBCheck != other.bDisableIndexOOBCheck)
-            return false;
-        if (bEnableCutIndex != other.bEnableCutIndex)
-            return false;
-        if (cutIndex != other.cutIndex)
-            return false;
-        if (bVertexIDOffsetEnable != other.bVertexIDOffsetEnable)
-            return false;
-        if (bPartialVertexBuffer != other.bPartialVertexBuffer)
-            return false;
-        if (bForceSequentialAccessEnable != other.bForceSequentialAccessEnable)
-            return false;
-        if (bInstanceIDOffsetEnable != other.bInstanceIDOffsetEnable)
-            return false;
-
-        for (uint32_t i = 0; i < numAttribs; ++i)
-        {
-            if ((layout[i].bits != other.layout[i].bits) ||
-                (((layout[i].InstanceEnable == 1) || (layout[i].InstanceStrideEnable == 1)) &&
-                 (layout[i].InstanceAdvancementState != other.layout[i].InstanceAdvancementState)))
-            {
-                return false;
-            }
-        }
-
-        return true;
-    }
-};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
deleted file mode 100644
index 61c6b57b38b..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ /dev/null
@@ -1,962 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file lower_x86.cpp
- *
- * @brief llvm pass to lower meta code to x86
- *
- * Notes:
- *
- ******************************************************************************/
-
-#include "jit_pch.hpp"
-#include "passes.h"
-#include "JitManager.h"
-
-#include "common/simdlib.hpp"
-
-#include <unordered_map>
-
-extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);
-
-namespace llvm
-{
-    // forward declare the initializer
-    void initializeLowerX86Pass(PassRegistry&);
-} // namespace llvm
-
-namespace SwrJit
-{
-    using namespace llvm;
-
-    enum TargetArch // x86 SIMD level chosen in the LowerX86 constructor; also used as the outer index into getIntrinsicMapAdvanced()
-    {
-        AVX    = 0,
-        AVX2   = 1,
-        AVX512 = 2
-    };
-
-    enum TargetWidth // total vector width of a lowered call; used as the index into X86Intrinsic::intrin[]
-    {
-        W256       = 0,
-        W512       = 1,
-        NUM_WIDTHS = 2 // number of real widths above; sizes X86Intrinsic::intrin[]
-    };
-
-    struct LowerX86;
-
-    typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;
-
-    struct X86Intrinsic // lowering recipe for one meta intrinsic on one TargetArch
-    {
-        IntrinsicID intrin[NUM_WIDTHS]; // per-TargetWidth intrinsic; ProcessIntrinsicAdvanced treats DOUBLE and Intrinsic::not_intrinsic specially
-        EmuFunc       emuFunc; // invoked by ProcessIntrinsicAdvanced when the selected slot is Intrinsic::not_intrinsic
-    };
-
-    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
-    // previous behavior of mapping directly to avx/avx2 intrinsics.
-    using intrinsicMap_t = std::map<std::string, IntrinsicID>;
-    /// @brief Accessor for the legacy name -> x86 intrinsic table.
-    /// @return Reference to a function-local static map, built on first call.
-    static intrinsicMap_t& getIntrinsicMap()
-    {
-        static intrinsicMap_t sLegacyIntrinsics{
-            {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
-            {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
-            {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
-            {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
-            {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
-            {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
-            {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
-            {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
-        };
-
-        return sLegacyIntrinsics;
-    }
-
-    // Forward decls
-    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction*
-    VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction*
-    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction*
-    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction*
-    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction*
-    VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction*
-    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-
-    Instruction* DOUBLE_EMU(LowerX86*     pThis,
-                            TargetArch    arch,
-                            TargetWidth   width,
-                            CallInst*     pCallInst,
-                            Intrinsic::ID intrin);
-
-    static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
-
-    using intrinsicMapAdvanced_t = std::vector<std::map<std::string, X86Intrinsic>>;
-
-    static intrinsicMapAdvanced_t&  getIntrinsicMapAdvanced() // per-arch lowering table: outer index is TargetArch, inner key is the meta intrinsic name
-    {
-        // clang-format off
-        static intrinsicMapAdvanced_t intrinsicMapAdvanced = {
-            //                               256 wide                               512 wide                    emulation function (called when the selected slot is not_intrinsic)
-            {
-                // AVX: slots marked DOUBLE are handed to DOUBLE_EMU together with the 256-wide intrinsic (see ProcessIntrinsicAdvanced)
-                {"meta.intrinsic.VRCPPS",    {{Intrinsic::x86_avx_rcp_ps_256,       DOUBLE},                    NO_EMU}},
-                {"meta.intrinsic.VPERMPS",   {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VPERM_EMU}},
-                {"meta.intrinsic.VPERMD",    {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VPERM_EMU}},
-                {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VGATHER_EMU}},
-                {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VGATHER_EMU}},
-                {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VGATHER_EMU}},
-                {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic}, VSCATTER_EMU}},
-                {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256,   Intrinsic::not_intrinsic},  NO_EMU}}, // NOTE(review): 512-wide slot is not_intrinsic + NO_EMU here but DOUBLE in the AVX2 row - confirm this is intentional
-                {"meta.intrinsic.VROUND",    {{Intrinsic::x86_avx_round_ps_256,     DOUBLE},                    NO_EMU}},
-                {"meta.intrinsic.VHSUBPS",   {{Intrinsic::x86_avx_hsub_ps_256,      DOUBLE},                    NO_EMU}},
-            },
-            {
-                // AVX2: same as the AVX row except for native 256-wide VPERMPS/VPERMD and a DOUBLE 512-wide VCVTPD2PS
-                {"meta.intrinsic.VRCPPS",       {{Intrinsic::x86_avx_rcp_ps_256,    DOUBLE},                    NO_EMU}},
-                {"meta.intrinsic.VPERMPS",      {{Intrinsic::x86_avx2_permps,       Intrinsic::not_intrinsic},  VPERM_EMU}},
-                {"meta.intrinsic.VPERMD",       {{Intrinsic::x86_avx2_permd,        Intrinsic::not_intrinsic},  VPERM_EMU}},
-                {"meta.intrinsic.VGATHERPD",    {{Intrinsic::not_intrinsic,         Intrinsic::not_intrinsic},  VGATHER_EMU}},
-                {"meta.intrinsic.VGATHERPS",    {{Intrinsic::not_intrinsic,         Intrinsic::not_intrinsic},  VGATHER_EMU}},
-                {"meta.intrinsic.VGATHERDD",    {{Intrinsic::not_intrinsic,         Intrinsic::not_intrinsic},  VGATHER_EMU}},
-                {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic}, VSCATTER_EMU}},
-                {"meta.intrinsic.VCVTPD2PS",    {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE},                   NO_EMU}},
-                {"meta.intrinsic.VROUND",       {{Intrinsic::x86_avx_round_ps_256,  DOUBLE},                    NO_EMU}},
-                {"meta.intrinsic.VHSUBPS",      {{Intrinsic::x86_avx_hsub_ps_256,   DOUBLE},                    NO_EMU}},
-            },
-            {
-                // AVX512: entries depend on the LLVM version; for LLVM >= 7 the permutes and VCVTPD2PS go to emulation functions instead of masked intrinsics
-                {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256,     Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
-    #if LLVM_VERSION_MAJOR < 7
-                {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
-                {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
-    #else
-                {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic,              Intrinsic::not_intrinsic}, VPERM_EMU}},
-                {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic,               Intrinsic::not_intrinsic}, VPERM_EMU}},
-    #endif
-                {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic}, VGATHER_EMU}},
-                {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic}, VGATHER_EMU}},
-                {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic}, VGATHER_EMU}},
-                {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic}, VSCATTER_EMU}},
-    #if LLVM_VERSION_MAJOR < 7
-                {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
-    #else
-                {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic}, VCONVERT_EMU}},
-    #endif
-                {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic,               Intrinsic::not_intrinsic}, VROUND_EMU}},
-                {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic,              Intrinsic::not_intrinsic}, VHSUB_EMU}}
-            }};
-        // clang-format on
-        return intrinsicMapAdvanced;
-    }
-
-    /// @brief Total size in bits of a vector type (element count times element size).
-    /// @param pVTy - vector type to measure; the API used depends on the LLVM version.
-    static uint32_t getBitWidth(VectorType *pVTy)
-    {
-#if LLVM_VERSION_MAJOR >= 11
-        // From LLVM 11 on the width is computed from element count and element size.
-#if LLVM_VERSION_MAJOR >= 12
-        const auto elemCount = cast<FixedVectorType>(pVTy)->getNumElements();
-#else
-        const auto elemCount = pVTy->getNumElements();
-#endif
-        return elemCount * pVTy->getElementType()->getPrimitiveSizeInBits();
-#else
-        return pVTy->getBitWidth();
-#endif
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief LLVM function pass that replaces calls to functions whose name starts with
-    ///        "meta.intrinsic" by x86 intrinsics or emulation code, chosen according to the
-    ///        AVX level reported by the JitManager.
-    struct LowerX86 : public FunctionPass
-    {
-        /// @param b - Builder used to emit the replacement IR. It is dereferenced
-        ///            unconditionally below, so despite the default it must not be null.
-        LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
-        {
-            initializeLowerX86Pass(*PassRegistry::getPassRegistry());
-
-            // Determine target arch
-            if (JM()->mArch.AVX512F())
-            {
-                mTarget = AVX512;
-            }
-            else if (JM()->mArch.AVX2())
-            {
-                mTarget = AVX2;
-            }
-            else if (JM()->mArch.AVX())
-            {
-                mTarget = AVX;
-            }
-            else
-            {
-                SWR_ASSERT(false, "Unsupported AVX architecture.");
-                mTarget = AVX;
-            }
-
-            // Setup scatter function for 256 wide. The builder is switched to 8 lanes while
-            // the argument types are read and restored to its previous width afterwards.
-            uint32_t curWidth = B->mVWidth;
-            B->SetTargetWidth(8);
-            std::vector<Type*> args = {
-                B->mInt8PtrTy,   // pBase
-                B->mSimdInt32Ty, // vIndices
-                B->mSimdFP32Ty,  // vSrc
-                B->mInt8Ty,      // mask
-                B->mInt32Ty      // scale
-            };
-
-            FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
-            mPfnScatter256             = cast<Function>(
-#if LLVM_VERSION_MAJOR >= 9
-                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy).getCallee());
-#else
-                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
-#endif
-            // Register the C implementation only if the symbol is not already known.
-            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
-            {
-                sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
-            }
-
-            B->SetTargetWidth(curWidth);
-        }
-
-        // Try to decipher the vector type of the instruction. This does not work properly
-        // across all intrinsics, and will have to be rethought. Probably need something
-        // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
-        // intrinsic.
-        /// @param pCallInst  - call to the meta intrinsic.
-        /// @param intrinName - name of the called function.
-        /// @param pWidth     - out: W256 or W512 (W256 is also the fallback for other sizes).
-        /// @param pTy        - out: scalar element type of the vector that was inspected.
-        void GetRequestedWidthAndType(CallInst*       pCallInst,
-                                      const StringRef intrinName,
-                                      TargetWidth*    pWidth,
-                                      Type**          pTy)
-        {
-            assert(pCallInst);
-            Type* pVecTy = pCallInst->getType();
-
-            // Check for intrinsic specific types
-            // VCVTPD2PS type comes from src, not dst
-            if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
-            {
-                Value* pOp = pCallInst->getOperand(0);
-                assert(pOp);
-                pVecTy = pOp->getType();
-            }
-
-            // Result is not a vector: fall back to the first vector-typed argument.
-            if (!pVecTy->isVectorTy())
-            {
-                for (auto& op : pCallInst->arg_operands())
-                {
-                    if (op.get()->getType()->isVectorTy())
-                    {
-                        pVecTy = op.get()->getType();
-                        break;
-                    }
-                }
-            }
-            SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
-
-            uint32_t width = getBitWidth(cast<VectorType>(pVecTy));
-            switch (width)
-            {
-            case 256:
-                *pWidth = W256;
-                break;
-            case 512:
-                *pWidth = W512;
-                break;
-            default:
-                SWR_ASSERT(false, "Unhandled vector width %d", width);
-                *pWidth = W256;
-            }
-
-            *pTy = pVecTy->getScalarType();
-        }
-
-        /// @brief Build an all-zero vector constant with 8 (W256) or 16 (W512) elements of pTy.
-        Value* GetZeroVec(TargetWidth width, Type* pTy)
-        {
-            uint32_t numElem = 0;
-            switch (width)
-            {
-            case W256:
-                numElem = 8;
-                break;
-            case W512:
-                numElem = 16;
-                break;
-            default:
-                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
-            }
-
-            return ConstantVector::getNullValue(getVectorType(pTy, numElem));
-        }
-
-        /// @brief Build an all-ones lane mask constant: 8 bits for W256, 16 bits for W512.
-        /// @return The mask constant, or nullptr for an unhandled width.
-        Value* GetMask(TargetWidth width)
-        {
-            // Initialized so the unhandled-width path returns a defined value instead of
-            // reading an indeterminate pointer.
-            Value* mask = nullptr;
-            switch (width)
-            {
-            case W256:
-                mask = B->C((uint8_t)-1);
-                break;
-            case W512:
-                mask = B->C((uint16_t)-1);
-                break;
-            default:
-                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
-            }
-            return mask;
-        }
-
-        // Convert <N x i1> mask to <N x i32> x86 mask
-        Value* VectorMask(Value* vi1Mask)
-        {
-#if LLVM_VERSION_MAJOR >= 12
-            uint32_t numElem = cast<FixedVectorType>(vi1Mask->getType())->getNumElements();
-#elif LLVM_VERSION_MAJOR >= 11
-            uint32_t numElem = cast<VectorType>(vi1Mask->getType())->getNumElements();
-#else
-            uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
-#endif
-            return B->S_EXT(vi1Mask, getVectorType(B->mInt32Ty, numElem));
-        }
-
-        /// @brief Lower a call whose name is present in getIntrinsicMapAdvanced()[mTarget].
-        /// @param pCallInst - call to the meta intrinsic; the caller has already checked that
-        ///                    its name is in the table for the current target.
-        /// @return The replacement instruction (may be null if the emulation returns null).
-        Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
-        {
-            Function*   pFunc = pCallInst->getCalledFunction();
-            assert(pFunc);
-
-            auto&       intrinsic = getIntrinsicMapAdvanced()[mTarget][pFunc->getName().str()];
-            TargetWidth vecWidth;
-            Type*       pElemTy;
-            GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);
-
-            // Check if there is a native intrinsic for this instruction
-            IntrinsicID id = intrinsic.intrin[vecWidth];
-            if (id == DOUBLE)
-            {
-                // Double pump the next smaller SIMD intrinsic
-                SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
-                Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
-                SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
-                           "Cannot find intrinsic to double pump.");
-                return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
-            }
-            else if (id != Intrinsic::not_intrinsic)
-            {
-                Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
-                SmallVector<Value*, 8> args;
-                for (auto& arg : pCallInst->arg_operands())
-                {
-                    args.push_back(arg.get());
-                }
-
-                // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and
-                // full mask for now Assuming the intrinsics are consistent and place the src
-                // operand and mask last in the argument list.
-                if (mTarget == AVX512)
-                {
-                    if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
-                    {
-                        args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
-                        args.push_back(GetMask(W256));
-                        // for AVX512 VCVTPD2PS, we also have to add rounding mode
-                        args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
-                    }
-                    else
-                    {
-                        args.push_back(GetZeroVec(vecWidth, pElemTy));
-                        args.push_back(GetMask(vecWidth));
-                    }
-                }
-
-                return B->CALLA(pIntrin, args);
-            }
-            else
-            {
-                // No native intrinsic, call emulation function
-                return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
-            }
-        }
-
-        /// @brief Lower one meta intrinsic call, preferring the per-arch table and falling
-        ///        back to the legacy name -> intrinsic map.
-        /// @param pCallInst - call to a function whose name starts with "meta.intrinsic".
-        /// @return The replacement instruction.
-        Instruction* ProcessIntrinsic(CallInst* pCallInst)
-        {
-            Function* pFunc = pCallInst->getCalledFunction();
-            assert(pFunc);
-
-            // Materialize the name once; it is the key for both tables.
-            const std::string name = pFunc->getName().str();
-
-            // Forward to the advanced support if found
-            const auto& advancedMap = getIntrinsicMapAdvanced()[mTarget];
-            if (advancedMap.find(name) != advancedMap.end())
-            {
-                return ProcessIntrinsicAdvanced(pCallInst);
-            }
-
-            // Use find() rather than operator[] so that an unknown name is not inserted into
-            // the shared static map.
-            const auto& legacyMap = getIntrinsicMap();
-            const auto  legacyIt  = legacyMap.find(name);
-            SWR_ASSERT(legacyIt != legacyMap.end(),
-                       "Unimplemented intrinsic %s.",
-                       name.c_str());
-
-            // Same value operator[] would have produced for a missing key.
-            Intrinsic::ID x86Intrinsic = Intrinsic::not_intrinsic;
-            if (legacyIt != legacyMap.end())
-            {
-                x86Intrinsic = legacyIt->second;
-            }
-            Function* pX86IntrinFunc =
-                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
-
-            SmallVector<Value*, 8> args;
-            for (auto& arg : pCallInst->arg_operands())
-            {
-                args.push_back(arg.get());
-            }
-            return B->CALLA(pX86IntrinFunc, args);
-        }
-
-        //////////////////////////////////////////////////////////////////////////
-        /// @brief LLVM function pass run method.
-        /// @param F - The function we're working on with this pass.
-        /// @return Always true.
-        bool runOnFunction(Function& F) override
-        {
-            std::vector<Instruction*> toRemove;
-            std::vector<BasicBlock*>  bbs;
-
-            // Make temp copy of the basic blocks and instructions, as the intrinsic
-            // replacement code might invalidate the iterators
-            for (auto& b : F.getBasicBlockList())
-            {
-                bbs.push_back(&b);
-            }
-
-            for (auto* BB : bbs)
-            {
-                std::vector<Instruction*> insts;
-                for (auto& i : BB->getInstList())
-                {
-                    insts.push_back(&i);
-                }
-
-                for (auto* I : insts)
-                {
-                    if (CallInst* pCallInst = dyn_cast<CallInst>(I))
-                    {
-                        Function* pFunc = pCallInst->getCalledFunction();
-                        if (pFunc)
-                        {
-                            if (pFunc->getName().startswith("meta.intrinsic"))
-                            {
-                                // Emit the replacement right before the call being lowered.
-                                B->IRB()->SetInsertPoint(I);
-                                Instruction* pReplace = ProcessIntrinsic(pCallInst);
-                                toRemove.push_back(pCallInst);
-                                if (pReplace)
-                                {
-                                    pCallInst->replaceAllUsesWith(pReplace);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-
-            // Erase the original calls only after every block has been processed.
-            for (auto* pInst : toRemove)
-            {
-                pInst->eraseFromParent();
-            }
-
-            JitManager::DumpToFile(&F, "lowerx86");
-
-            return true;
-        }
-
-        /// @brief Declares no analysis requirements.
-        void getAnalysisUsage(AnalysisUsage& AU) const override {}
-
-        JitManager* JM() { return B->JM(); }
-        Builder*    B;              ///< Builder used to emit replacement IR.
-        TargetArch  mTarget;        ///< Arch level selected in the constructor.
-        Function*   mPfnScatter256; ///< Declaration of ScatterPS_256 in the current module.
-
-        static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
-    };
-
-    char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
-
-    FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }
-
-    /// @brief Emulation slot for table entries that have no emulation routine.
-    /// @return Always nullptr, after raising an SWR assertion.
-    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
-    {
-        // None of the arguments are used: being called at all is the error.
-        SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
-
-        Instruction* const pNoReplacement = nullptr;
-        return pNoReplacement;
-    }
-
-    /// @brief Emulate a variable permute (VPERMPS / VPERMD).
-    /// @param pThis     - pass instance; supplies the Builder.
-    /// @param arch      - target arch (unused; the emulation is the same for every arch).
-    /// @param width     - requested vector width (unused; the element count is read from
-    ///                    the operand type).
-    /// @param pCallInst - call whose operand 0 is the source vector and operand 1 the vector
-    ///                    of lane indices.
-    /// @return The instruction producing the permuted vector.
-    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
-    {
-        // The intrinsic table routes permutes here not only for AVX but also for 512-wide
-        // vectors on AVX2 and, with LLVM >= 7, for AVX512. The code below uses only the
-        // Builder's shuffle/extract/insert helpers, so it is not asserted to be AVX-only.
-        Builder* B         = pThis->B;
-        auto     v32A      = pCallInst->getArgOperand(0);
-        auto     vi32Index = pCallInst->getArgOperand(1);
-
-        Value* v32Result;
-        if (isa<Constant>(vi32Index))
-        {
-            // Can use llvm shuffle vector directly with constant shuffle indices
-            v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
-        }
-        else
-        {
-            // Runtime indices: move the elements one lane at a time.
-            v32Result = UndefValue::get(v32A->getType());
-#if LLVM_VERSION_MAJOR >= 12
-            uint32_t numElem = cast<FixedVectorType>(v32A->getType())->getNumElements();
-#elif LLVM_VERSION_MAJOR >= 11
-            uint32_t numElem = cast<VectorType>(v32A->getType())->getNumElements();
-#else
-            uint32_t numElem = v32A->getType()->getVectorNumElements();
-#endif
-            for (uint32_t l = 0; l < numElem; ++l)
-            {
-                auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
-                auto val      = B->VEXTRACT(v32A, i32Index);
-                v32Result     = B->VINSERT(v32Result, val, B->C(l));
-            }
-        }
-        return cast<Instruction>(v32Result);
-    }
-
-    /// @brief Lower a masked gather (VGATHERPS / VGATHERDD / VGATHERPD meta intrinsics).
-    ///
-    /// AVX gets a per-lane scalar load loop, AVX2 (and 256-wide AVX512) uses the AVX2 gather
-    /// intrinsics, splitting 512-wide requests into two calls, and 512-wide AVX512 uses the
-    /// AVX512 gather intrinsics.
-    ///
-    /// @param pThis     - pass instance; supplies the Builder and VectorMask().
-    /// @param arch      - target arch selecting one of the three strategies above.
-    /// @param width     - requested vector width.
-    /// @param pCallInst - call with operands (vSrc, pBase, vi32Indices, vi1Mask, i8Scale).
-    /// @return The instruction producing the gathered vector.
-    Instruction*
-    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
-    {
-        Builder* B           = pThis->B;
-        auto     vSrc        = pCallInst->getArgOperand(0);
-        auto     pBase       = pCallInst->getArgOperand(1);
-        auto     vi32Indices = pCallInst->getArgOperand(2);
-        auto     vi1Mask     = pCallInst->getArgOperand(3);
-        auto     i8Scale     = pCallInst->getArgOperand(4);
-
-        pBase              = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
-#if LLVM_VERSION_MAJOR >= 11
-#if LLVM_VERSION_MAJOR >= 12
-        FixedVectorType* pVectorType = cast<FixedVectorType>(vSrc->getType());
-#else
-        VectorType* pVectorType = cast<VectorType>(vSrc->getType());
-#endif
-        uint32_t    numElem     = pVectorType->getNumElements();
-        auto        srcTy       = pVectorType->getElementType();
-#else
-        uint32_t numElem   = vSrc->getType()->getVectorNumElements();
-        auto     srcTy     = vSrc->getType()->getVectorElementType();
-#endif
-        // 32-bit scale, shared by the AVX emulation and the AVX512 intrinsic call.
-        auto     i32Scale  = B->Z_EXT(i8Scale, B->mInt32Ty);
-
-        Value*   v32Gather = nullptr;
-        if (arch == AVX)
-        {
-            // Full emulation for AVX
-            // Store source on stack to provide a valid address to load from inactive lanes
-            auto pStack = B->STACKSAVE();
-            auto pTmp   = B->ALLOCA(vSrc->getType());
-            B->STORE(vSrc, pTmp);
-
-            v32Gather        = UndefValue::get(vSrc->getType());
-#if LLVM_VERSION_MAJOR <= 10
-            auto vi32Scale   = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
-#elif LLVM_VERSION_MAJOR == 11
-            auto vi32Scale   = ConstantVector::getSplat(ElementCount(numElem, false), cast<ConstantInt>(i32Scale));
-#else
-            auto vi32Scale   = ConstantVector::getSplat(ElementCount::get(numElem, false), cast<ConstantInt>(i32Scale));
-#endif
-            auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
-
-            for (uint32_t i = 0; i < numElem; ++i)
-            {
-                auto i32Offset          = B->VEXTRACT(vi32Offsets, B->C(i));
-                auto pLoadAddress       = B->GEP(pBase, i32Offset);
-                pLoadAddress            = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
-                auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
-                auto i1Mask             = B->VEXTRACT(vi1Mask, B->C(i));
-                // Inactive lanes load their own source element back from the stack copy.
-                auto pValidAddress      = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
-                auto val                = B->LOAD(pValidAddress);
-                v32Gather               = B->VINSERT(v32Gather, val, B->C(i));
-            }
-
-            B->STACKRESTORE(pStack);
-        }
-        else if (arch == AVX2 || (arch == AVX512 && width == W256))
-        {
-            Function* pX86IntrinFunc = nullptr;
-            if (srcTy == B->mFP32Ty)
-            {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
-                                                           Intrinsic::x86_avx2_gather_d_ps_256);
-            }
-            else if (srcTy == B->mInt32Ty)
-            {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
-                                                           Intrinsic::x86_avx2_gather_d_d_256);
-            }
-            else if (srcTy == B->mDoubleTy)
-            {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
-                                                           Intrinsic::x86_avx2_gather_d_q_256);
-            }
-            else
-            {
-                SWR_ASSERT(false, "Unsupported vector element type for gather.");
-            }
-
-            if (width == W256)
-            {
-                auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
-                v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
-            }
-            else if (width == W512)
-            {
-                // Double pump 4-wide for 64bit elements
-#if LLVM_VERSION_MAJOR >= 12
-                if (cast<FixedVectorType>(vSrc->getType())->getElementType() == B->mDoubleTy)
-#elif LLVM_VERSION_MAJOR >= 11
-                if (cast<VectorType>(vSrc->getType())->getElementType() == B->mDoubleTy)
-#else
-                if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
-#endif
-                {
-                    auto v64Mask = pThis->VectorMask(vi1Mask);
-                    // Named distinctly so it does not shadow the outer numElem.
-#if LLVM_VERSION_MAJOR >= 12
-                    uint32_t numMaskElem = cast<FixedVectorType>(v64Mask->getType())->getNumElements();
-#elif LLVM_VERSION_MAJOR >= 11
-                    uint32_t numMaskElem = cast<VectorType>(v64Mask->getType())->getNumElements();
-#else
-                    uint32_t numMaskElem = v64Mask->getType()->getVectorNumElements();
-#endif
-                    v64Mask = B->S_EXT(v64Mask, getVectorType(B->mInt64Ty, numMaskElem));
-                    v64Mask = B->BITCAST(v64Mask, vSrc->getType());
-
-                    Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
-                    Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));
-
-                    Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
-                    Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));
-
-                    Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
-                    Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));
-
-#if LLVM_VERSION_MAJOR >= 12
-                    uint32_t numElemSrc0  = cast<FixedVectorType>(src0->getType())->getNumElements();
-                    uint32_t numElemMask0 = cast<FixedVectorType>(mask0->getType())->getNumElements();
-                    uint32_t numElemSrc1  = cast<FixedVectorType>(src1->getType())->getNumElements();
-                    uint32_t numElemMask1 = cast<FixedVectorType>(mask1->getType())->getNumElements();
-#elif LLVM_VERSION_MAJOR >= 11
-                    uint32_t numElemSrc0  = cast<VectorType>(src0->getType())->getNumElements();
-                    uint32_t numElemMask0 = cast<VectorType>(mask0->getType())->getNumElements();
-                    uint32_t numElemSrc1  = cast<VectorType>(src1->getType())->getNumElements();
-                    uint32_t numElemMask1 = cast<VectorType>(mask1->getType())->getNumElements();
-#else
-                    uint32_t numElemSrc0  = src0->getType()->getVectorNumElements();
-                    uint32_t numElemMask0 = mask0->getType()->getVectorNumElements();
-                    uint32_t numElemSrc1  = src1->getType()->getVectorNumElements();
-                    uint32_t numElemMask1 = mask1->getType()->getVectorNumElements();
-#endif
-                    // Each half's source and mask are bitcast to i64 vectors before the call.
-                    src0 = B->BITCAST(src0, getVectorType(B->mInt64Ty, numElemSrc0));
-                    mask0 = B->BITCAST(mask0, getVectorType(B->mInt64Ty, numElemMask0));
-                    Value* gather0 =
-                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
-                    src1 = B->BITCAST(src1, getVectorType(B->mInt64Ty, numElemSrc1));
-                    mask1 = B->BITCAST(mask1, getVectorType(B->mInt64Ty, numElemMask1));
-                    Value* gather1 =
-                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
-                    v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
-                    v32Gather = B->BITCAST(v32Gather, vSrc->getType());
-                }
-                else
-                {
-                    // Double pump 8-wide for 32bit elements
-                    auto v32Mask = pThis->VectorMask(vi1Mask);
-                    v32Mask      = B->BITCAST(v32Mask, vSrc->getType());
-                    Value* src0  = B->EXTRACT_16(vSrc, 0);
-                    Value* src1  = B->EXTRACT_16(vSrc, 1);
-
-                    Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
-                    Value* indices1 = B->EXTRACT_16(vi32Indices, 1);
-
-                    Value* mask0 = B->EXTRACT_16(v32Mask, 0);
-                    Value* mask1 = B->EXTRACT_16(v32Mask, 1);
-
-                    Value* gather0 =
-                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
-                    Value* gather1 =
-                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
-
-                    v32Gather = B->JOIN_16(gather0, gather1);
-                }
-            }
-        }
-        else if (arch == AVX512)
-        {
-            // The i1 lane mask is bitcast to a 16-bit (32-bit elements) or 8-bit (doubles)
-            // integer mask for the AVX512 gather intrinsics.
-            Value*    iMask = nullptr;
-            Function* pX86IntrinFunc = nullptr;
-            if (srcTy == B->mFP32Ty)
-            {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
-                                                           Intrinsic::x86_avx512_gather_dps_512);
-                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
-            }
-            else if (srcTy == B->mInt32Ty)
-            {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
-                                                           Intrinsic::x86_avx512_gather_dpi_512);
-                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
-            }
-            else if (srcTy == B->mDoubleTy)
-            {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
-                                                           Intrinsic::x86_avx512_gather_dpd_512);
-                iMask          = B->BITCAST(vi1Mask, B->mInt8Ty);
-            }
-            else
-            {
-                SWR_ASSERT(false, "Unsupported vector element type for gather.");
-            }
-
-            // Reuses the zero-extended scale computed at the top of the function.
-            v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
-        }
-
-        // No branch above assigns a result for an unexpected arch/width combination.
-        SWR_ASSERT(v32Gather != nullptr, "Gather lowering produced no value.");
-        return cast<Instruction>(v32Gather);
-    }
-    // Lowers a scatter call. Operands of pCallInst, in order: base pointer, i1 lane mask,
-    // 32-bit indices, source vector, scale.
-    //
-    // Without AVX512 the store is delegated to the C helper held in pThis->mPfnScatter256; a
-    // 512-wide scatter becomes two 8-lane helper calls. With AVX512 the native scatter
-    // intrinsic for the requested width is emitted.
-    //
-    // Always returns nullptr: only the emitted calls matter.
-    Instruction*
-    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
-    {
-        Builder* B           = pThis->B;
-        auto     pBase       = pCallInst->getArgOperand(0);
-        auto     vi1Mask     = pCallInst->getArgOperand(1);
-        auto     vi32Indices = pCallInst->getArgOperand(2);
-        auto     v32Src      = pCallInst->getArgOperand(3);
-        auto     i32Scale    = pCallInst->getArgOperand(4);
-
-        if (arch != AVX512)
-        {
-            // Call into C function to do the scatter. This has significantly better compile perf
-            // compared to jitting scatter loops for every scatter
-            if (width == W256)
-            {
-                auto mask = B->BITCAST(vi1Mask, B->mInt8Ty);
-                B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale});
-            }
-            else
-            {
-                // Need to break up 512 wide scatter to two 256 wide
-                auto maskLo = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
-                auto indicesLo =
-                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
-                auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
-
-                auto mask = B->BITCAST(maskLo, B->mInt8Ty);
-                B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale});
-
-                auto maskHi = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
-                auto indicesHi =
-                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
-                auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
-
-                mask = B->BITCAST(maskHi, B->mInt8Ty);
-                B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale});
-            }
-            return nullptr;
-        }
-
-        if (width == W256)
-        {
-            // No direct intrinsic supported in llvm to scatter 8 elem with 32bit indices, but we
-            // can use the scatter of 8 elements with 64bit indices
-            Function* pX86IntrinFunc = Intrinsic::getDeclaration(
-                B->JM()->mpCurrentModule, Intrinsic::x86_avx512_scatter_qps_512);
-
-            auto   vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty);
-            Value* iMask          = B->BITCAST(vi1Mask, B->mInt8Ty); // 8 lanes -> 8 mask bits
-            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale});
-        }
-        else if (width == W512)
-        {
-            Function* pX86IntrinFunc = Intrinsic::getDeclaration(
-                B->JM()->mpCurrentModule, Intrinsic::x86_avx512_scatter_dps_512);
-
-            Value* iMask = B->BITCAST(vi1Mask, B->mInt16Ty); // 16 lanes -> 16 mask bits
-            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale});
-        }
-        else
-        {
-            // Previously an unknown width fell through and silently dropped the scatter;
-            // report it the same way the other lowerings in this file do.
-            SWR_ASSERT(false, "Unimplemented vector width.");
-        }
-        return nullptr;
-    }
-
-    // AVX512 has no vroundps (it is available in kncni), so rounding is emulated with the
-    // 256-bit AVX instruction. A 512-wide request is double pumped: each 8-lane half is
-    // rounded separately and the two results are joined again.
-    Instruction*
-    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
-    {
-        SWR_ASSERT(arch == AVX512);
-
-        Builder* pBuilder = pThis->B;
-
-        Value* pSrc = pCallInst->getOperand(0);
-        assert(pSrc);
-        Value* pRoundMode = pCallInst->getOperand(1);
-        assert(pRoundMode);
-
-        // Declared up front for every width, exactly as before.
-        Function* pRound256 = Intrinsic::getDeclaration(pBuilder->JM()->mpCurrentModule,
-                                                        Intrinsic::x86_avx_round_ps_256);
-
-        switch (width)
-        {
-        case W256:
-            return cast<Instruction>(pBuilder->CALL2(pRound256, pSrc, pRoundMode));
-
-        case W512:
-        {
-            auto pLoIn = pBuilder->EXTRACT_16(pSrc, 0);
-            auto pHiIn = pBuilder->EXTRACT_16(pSrc, 1);
-
-            auto pLoOut = pBuilder->CALL2(pRound256, pLoIn, pRoundMode);
-            auto pHiOut = pBuilder->CALL2(pRound256, pHiIn, pRoundMode);
-
-            return cast<Instruction>(pBuilder->JOIN_16(pLoOut, pHiOut));
-        }
-
-        default:
-            SWR_ASSERT(false, "Unimplemented vector width.");
-            return nullptr;
-        }
-    }
-
-    // Lowers a packed double -> packed float conversion when targeting AVX512.
-    //
-    // Operand 0 of pCallInst is the vector to narrow. Returns the instruction producing the
-    // narrowed vector, or nullptr for an unsupported width.
-    Instruction*
-    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
-    {
-        SWR_ASSERT(arch == AVX512);
-
-        auto B    = pThis->B;
-        auto vSrc = pCallInst->getOperand(0);
-
-        if (width == W256)
-        {
-            // Truncate the source operand itself. The old code passed the Function* returned
-            // by getDeclaration (for an unrelated round intrinsic) to FP_TRUNC, which is not a
-            // floating-point value at all. The destination type is taken from the call being
-            // lowered so the result is a vector of the same shape its users expect, rather
-            // than the scalar mFP32Ty.
-            return cast<Instruction>(B->FP_TRUNC(vSrc, pCallInst->getType()));
-        }
-        else if (width == W512)
-        {
-            // 512 can use intrinsic
-            // NOTE(review): LLVM declares llvm.x86.avx512.mask.cvtpd2ps.512 with pass-through,
-            // mask and rounding operands in addition to the source - confirm that this
-            // single-operand call is valid for the CALL helper used here.
-            auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
-                                                     Intrinsic::x86_avx512_mask_cvtpd2ps_512);
-            return cast<Instruction>(B->CALL(pfnFunc, vSrc));
-        }
-        else
-        {
-            SWR_ASSERT(false, "Unimplemented vector width.");
-        }
-
-        return nullptr;
-    }
-
-    // No support for hsub in AVX512
-    //
-    // Horizontal subtract of packed floats. With a = operand 0 and b = operand 1, every
-    // 128-bit lane of the result holds {a0 - a1, a2 - a3, b0 - b1, b2 - b3} taken from the
-    // same lane of the inputs.
-    Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
-    {
-        SWR_ASSERT(arch == AVX512);
-
-        auto B    = pThis->B;
-        auto src0 = pCallInst->getOperand(0);
-        auto src1 = pCallInst->getOperand(1);
-
-        // 256b hsub can just use avx intrinsic
-        if (width == W256)
-        {
-            auto pX86IntrinFunc =
-                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
-            return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
-        }
-        else if (width == W512)
-        {
-            // 512b hsub can be accomplished with shuf/sub combo.
-            //
-            // The operands are 16 lanes wide here, so shuffle indices 0-15 select from src0
-            // and 16-31 from src1. The previous 8-entry masks were the 8-lane pattern: they
-            // produced only 8 lanes and never read src1. Each group of four indices below
-            // fills one 128-bit lane of the result.
-            auto evenLanes = B->C({0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30});
-            auto oddLanes  = B->C({1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31});
-
-            auto minuend    = B->VSHUFFLE(src0, src1, evenLanes);
-            auto subtrahend = B->VSHUFFLE(src0, src1, oddLanes);
-
-            // The lanes are floats (the 256b path uses hsub_ps), so this must be a
-            // floating-point subtract; an integer SUB is not valid on float vectors.
-            return cast<Instruction>(B->FSUB(minuend, subtrahend));
-        }
-        else
-        {
-            SWR_ASSERT(false, "Unimplemented vector width.");
-            return nullptr;
-        }
-    }
-
-    // Double pump input using the intrinsic passed in 'intrin'. This blindly extracts lower and
-    // upper 256 from each vector argument and calls the 256 wide intrinsic, then merges the
-    // results to 512 wide.
-    //
-    // pThis     - lowering pass; only its Builder (pThis->B) is used
-    // arch      - not referenced in the body
-    // width     - must be W512 (asserted)
-    // pCallInst - call being lowered; its arguments are forwarded, vectors split in half
-    // intrin    - ID of the intrinsic to invoke once per half
-    //
-    // Returns the shuffle that concatenates the two half results.
-    Instruction* DOUBLE_EMU(LowerX86*     pThis,
-                            TargetArch    arch,
-                            TargetWidth   width,
-                            CallInst*     pCallInst,
-                            Intrinsic::ID intrin)
-    {
-        auto B = pThis->B;
-        SWR_ASSERT(width == W512);
-        Value*    result[2];
-        Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
-        // One pass per half: i == 0 builds the first call, i == 1 the second.
-        for (uint32_t i = 0; i < 2; ++i)
-        {
-            SmallVector<Value*, 8> args;
-            for (auto& arg : pCallInst->arg_operands())
-            {
-                auto argType = arg.get()->getType();
-                if (argType->isVectorTy())
-                {
-                    // The accessors for element count / element type moved between LLVM
-                    // releases; all three variants read the same two properties.
-#if LLVM_VERSION_MAJOR >= 12
-                    uint32_t vecWidth  = cast<FixedVectorType>(argType)->getNumElements();
-                    auto     elemTy    = cast<FixedVectorType>(argType)->getElementType();
-#elif LLVM_VERSION_MAJOR >= 11
-                    uint32_t vecWidth  = cast<VectorType>(argType)->getNumElements();
-                    auto     elemTy    = cast<VectorType>(argType)->getElementType();
-#else
-                    uint32_t vecWidth  = argType->getVectorNumElements();
-                    auto     elemTy    = argType->getVectorElementType();
-#endif
-                    // Select half i of the argument: vecWidth / 2 lanes starting at
-                    // i * vecWidth / 2. NOTE(review): presumably CInc(base, count) yields the
-                    // index vector {base, base + 1, ...} - confirm against the Builder.
-                    Value*   lanes     = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
-                    Value*   argToPush = B->VSHUFFLE(arg.get(), B->VUNDEF(elemTy, vecWidth), lanes);
-                    args.push_back(argToPush);
-                }
-                else
-                {
-                    // Non-vector arguments are passed unchanged to both calls.
-                    args.push_back(arg.get());
-                }
-            }
-            result[i] = B->CALLA(pX86IntrinFunc, args);
-        }
-        // Lane count of the merged result: the sum of both halves for vector results.
-        uint32_t vecWidth;
-        if (result[0]->getType()->isVectorTy())
-        {
-            assert(result[1]->getType()->isVectorTy());
-#if LLVM_VERSION_MAJOR >= 12
-            vecWidth = cast<FixedVectorType>(result[0]->getType())->getNumElements() +
-                       cast<FixedVectorType>(result[1]->getType())->getNumElements();
-#elif LLVM_VERSION_MAJOR >= 11
-            vecWidth = cast<VectorType>(result[0]->getType())->getNumElements() +
-                       cast<VectorType>(result[1]->getType())->getNumElements();
-#else
-            vecWidth = result[0]->getType()->getVectorNumElements() +
-                       result[1]->getType()->getVectorNumElements();
-#endif
-        }
-        else
-        {
-            // NOTE(review): non-vector results are counted as one lane each and still fed to
-            // VSHUFFLE below - confirm this path is ever taken with a scalar-returning intrinsic.
-            vecWidth = 2;
-        }
-        // Indices 0 .. vecWidth-1 across both inputs: concatenates result[0] and result[1].
-        Value* lanes = B->CInc<int>(0, vecWidth);
-        return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
-    }
-
-} // namespace SwrJit
-
-using namespace SwrJit;
-
-// Pass registration for LowerX86 using LLVM's INITIALIZE_PASS_BEGIN / INITIALIZE_PASS_END
-// macros. Nothing is listed between BEGIN and END, i.e. no dependencies are declared here.
-INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
-INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h
deleted file mode 100644
index e0bb75cdec9..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file passes.h
- *
- * @brief Include file for llvm passes
- *
- ******************************************************************************/
-#pragma once
-
-#include "JitManager.h"
-#include "builder.h"
-
-namespace SwrJit
-{
-    using namespace llvm;
-
-    /// Factory for the LowerX86 function pass.
-    /// @param b - Builder handed to the pass.
-    /// NOTE(review): the definition is not in this header; presumably the caller (or the
-    /// pass manager it is added to) owns the returned pass - confirm.
-    FunctionPass* createLowerX86Pass(Builder* b);
-} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
deleted file mode 100644
index dcb051c3b53..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file jit_api.h
- *
- * @brief Platform independent JIT interface
- *
- * Notes:
- *
- ******************************************************************************/
-#pragma once
-#include "common/os.h"
-#include "core/utils.h"
-
-#include "fetch_jit.h"
-#include "streamout_jit.h"
-#include "blend_jit.h"
-
-#include <stdlib.h>
-
-// Platform glue for the JIT entry points. On Windows EXCEPTION_PRINT_STACK passes its
-// argument straight through and JITCALL selects __stdcall; elsewhere JITCALL is empty and
-// EXCEPTION_PRINT_STACK is not defined by this header.
-#if defined(_WIN32)
-#define EXCEPTION_PRINT_STACK(ret) ret
-#define JITCALL __stdcall
-#else
-#define JITCALL
-#endif
-
-
-struct ShaderInfo;
-
-//////////////////////////////////////////////////////////////////////////
-/// Jit Compile Info Input
-/// Passed by const reference to JitCompileShader.
-//////////////////////////////////////////////////////////////////////////
-struct JIT_COMPILE_INPUT
-{
-    SWR_SHADER_TYPE type; ///< Kind of shader being compiled.
-    uint32_t        crc;  ///< NOTE(review): presumably a checksum identifying the IR - confirm.
-
-    const void* pIR; ///< Pointer to LLVM IR text.
-    size_t      irLength; ///< NOTE(review): presumably the size of the pIR buffer in bytes - confirm.
-
-    bool enableJitSampler; ///< NOTE(review): consumer not visible here - confirm semantics.
-
-};
-
-
-extern "C" {
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Create JIT context.
-/// @return Handle to pass as hJitContext to the other Jit* entry points.
-HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch, const char* core);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Destroy JIT context.
-void JITCALL JitDestroyContext(HANDLE hJitContext);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief JIT compile shader.
-/// @param hJitContext - Jit Context
-/// @param input  - Input containing LLVM IR and other information
-/// @return Information about the JIT shader (there is no separate output parameter).
-ShaderInfo* JITCALL JitCompileShader(HANDLE hJitContext, const JIT_COMPILE_INPUT& input);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief NOTE(review): presumably looks up a shader by name - confirm.
-/// @param hJitContext - Jit Context
-/// @param name - name identifying the shader
-ShaderInfo* JITCALL JitGetShader(HANDLE hJitContext, const char* name);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief JIT destroy shader.
-/// @param hJitContext - Jit Context
-/// @param pShaderInfo  - pointer to shader object; taken by reference to the pointer.
-void JITCALL JitDestroyShader(HANDLE hJitContext, ShaderInfo*& pShaderInfo);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief JIT compiles fetch shader
-/// @param hJitContext - Jit Context
-/// @param state   - Fetch state to build function from
-PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitContext, const FETCH_COMPILE_STATE& state);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief JIT compiles streamout shader
-/// @param hJitContext - Jit Context
-/// @param state   - SO state to build function from
-PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitContext, const STREAMOUT_COMPILE_STATE& state);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief JIT compiles blend shader
-/// @param hJitContext - Jit Context
-/// @param state   - blend state to build function from
-PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitContext, const BLEND_COMPILE_STATE& state);
-
-}
-
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp b/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp
deleted file mode 100644
index e54e23fc904..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2017-2020 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file jit_pch.hpp
- *
- * @brief Pre-compiled header for jitter
- *
- * Notes:
- *
- ******************************************************************************/
-
-#pragma once
-
-#if defined(_MSC_VER)
-#pragma warning(disable : 4146 4244 4267 4800 4996)
-#endif
-
-#include <llvm/Config/llvm-config.h>
-
-#if LLVM_VERSION_MAJOR < 7
-// llvm 3.7+ reuses "DEBUG" as an enum value
-#pragma push_macro("DEBUG")
-#undef DEBUG
-#endif
-
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
-#if LLVM_VERSION_MAJOR >= 10
-#include "llvm/IR/IntrinsicsX86.h"
-#endif
-#include "llvm/ExecutionEngine/ObjectCache.h"
-
-#include "llvm/IR/Verifier.h"
-#include "llvm/ExecutionEngine/MCJIT.h"
-#include "llvm/Support/FileSystem.h"
-#define LLVM_F_NONE sys::fs::F_None
-
-#include "llvm/Analysis/Passes.h"
-
-#include "llvm/IR/LegacyPassManager.h"
-using FunctionPassManager = llvm::legacy::FunctionPassManager;
-using PassManager         = llvm::legacy::PassManager;
-
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Scalar.h"
-#if LLVM_VERSION_MAJOR >= 7
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/InstCombine/InstCombine.h"
-#endif
-#include "llvm/Support/Host.h"
-#include "llvm/Support/DynamicLibrary.h"
-
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/LoopInfo.h"
-
-#include "llvm/Transforms/Utils/Cloning.h"
-
-#if defined(_WIN32)
-#include "llvm/ADT/Triple.h"
-#endif
-#include "llvm/IR/Function.h"
-
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/SourceMgr.h"
-
-#include "llvm/Analysis/CFGPrinter.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Config/llvm-config.h"
-
-#include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/Bitcode/BitcodeReader.h"
-
-#if LLVM_USE_INTEL_JITEVENTS
-#include "llvm/ExecutionEngine/JITEventListener.h"
-#endif
-
-// Compatibility shims across the LLVM 5 boundary: the sync scope and function-index
-// constants, and the way a function AttributeSet is built from an AttrBuilder, are spelled
-// differently on each side. Both branches expose the same three names.
-#if LLVM_VERSION_MAJOR >= 5
-static const auto                Sync_CrossThread     = llvm::SyncScope::System;
-static const auto                Attrib_FunctionIndex = llvm::AttributeList::FunctionIndex;
-static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext&       ctx,
-                                                  const llvm::AttrBuilder& b)
-{
-    return llvm::AttributeSet::get(ctx, b);
-}
-#else
-static const auto                Sync_CrossThread     = llvm::SynchronizationScope::CrossThread;
-static const auto                Attrib_FunctionIndex = llvm::AttributeSet::FunctionIndex;
-static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext&       ctx,
-                                                  const llvm::AttrBuilder& b)
-{
-    // Here the attribute index has to be supplied explicitly.
-    return llvm::AttributeSet::get(ctx, Attrib_FunctionIndex, b);
-}
-#endif
-
-// Version-independent wrapper around llvm::VectorType::get. From LLVM 11 on, the call takes
-// an extra trailing bool, for which false is passed.
-static inline llvm::VectorType* getVectorType(llvm::Type *ElementType, unsigned NumElements)
-{
-#if LLVM_VERSION_MAJOR >= 11
-    return llvm::VectorType::get(ElementType, NumElements, false);
-#else
-    return llvm::VectorType::get(ElementType, NumElements);
-#endif
-}
-
-#if LLVM_VERSION_MAJOR < 7
-#pragma pop_macro("DEBUG")
-#endif
-
-// Aliases whose underlying type depends on the LLVM version in use.
-#if LLVM_VERSION_MAJOR > 10
-using IntrinsicID = unsigned;
-using AlignType   = llvm::Align;
-#else
-using IntrinsicID = llvm::Intrinsic::ID;
-using AlignType   = unsigned;
-#endif
-
-#include <deque>
-#include <list>
-#include <unordered_map>
-#include <unordered_set>
-#include <iostream>
-#include <sstream>
-#include <type_traits>
-#include <cstdint>
-#include <vector>
-#include <tuple>
-#include <mutex>
-
-#include "common/os.h"
-
-// Object-file extension for the current platform (".obj" on Windows, ".o" elsewhere).
-// NOTE(review): presumably used when naming JIT object files - confirm at the use sites.
-#if defined(_WIN32)
-#define JIT_OBJ_EXT ".obj"
-#else
-#define JIT_OBJ_EXT ".o"
-#endif // _WIN32
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/meson.build b/src/gallium/drivers/swr/rasterizer/jitter/meson.build
deleted file mode 100644
index 295dc2fccb5..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/meson.build
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright © 2017-2018 Intel Corporation
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-# Locate LLVM's IRBuilder.h: taken from the llvm subproject when the dependency is
-# internal, otherwise derived from the include directory reported by the dependency.
-if dep_llvm.type_name() == 'internal'
-  _irbuilder_h = subproject('llvm').get_variable('irbuilder_h')
-else
-  _llvm_includedir = dep_llvm.get_variable(configtool : 'includedir', cmake : 'LLVM_INCLUDE_DIR')
-  _irbuilder_h = join_paths(_llvm_includedir, 'llvm', 'IR', 'IRBuilder.h')
-endif
-
-# gen_builder.hpp: the generator script is run with IRBuilder.h as --input and --gen_h.
-# This is the only one of the three targets marked build_by_default.
-gen_builder_hpp = custom_target(
-  'gen_builder.hpp',
-  input : [
-    swr_gen_llvm_ir_macros_py, _irbuilder_h,
-  ],
-  output : 'gen_builder.hpp',
-  command : [
-    prog_python, '@INPUT0@', '--input', '@INPUT1@', '--output', '@OUTPUT@',
-    '--gen_h', '--output-dir', '@OUTDIR@'
-  ],
-  depend_files : swr_gen_builder_depends,
-  build_by_default : true,
-)
-
-# gen_builder_meta.hpp: generator script run with --gen_meta_h; no IRBuilder.h input.
-# The script is referenced by relative path here rather than through
-# swr_gen_llvm_ir_macros_py.
-gen_builder_meta_hpp = custom_target(
-  'gen_builder_meta.hpp',
-  input : '../codegen/gen_llvm_ir_macros.py',
-  output : 'gen_builder_meta.hpp',
-  command : [
-    prog_python, '@INPUT0@', '--gen_meta_h', '--output', '@OUTPUT@',
-    '--output-dir', '@OUTDIR@'
-  ],
-  depend_files : swr_gen_builder_depends,
-)
-
-# gen_builder_intrin.hpp: generator script run with --gen_intrin_h; no IRBuilder.h input.
-gen_builder_intrin_hpp = custom_target(
-  'gen_builder_intrin.hpp',
-  input : '../codegen/gen_llvm_ir_macros.py',
-  output : 'gen_builder_intrin.hpp',
-  command : [
-    prog_python, '@INPUT0@', '--gen_intrin_h', '--output', '@OUTPUT@',
-    '--output-dir', '@OUTDIR@'
-  ],
-  depend_files : swr_gen_builder_depends,
-)
-
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp
deleted file mode 100644
index 1c9db0c2d2a..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file DebugOutput.cpp
- *
- * @brief Shader support library implementation for printed Debug output
- *
- * Notes:
- *
- ******************************************************************************/
-#include <stdarg.h>
-#include "common/os.h"
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief called in JIT code, inserted by PRINT
-/// output to both stdout and visual studio debug console
-/// @param fmt - printf-style format string; the variadic arguments must match it.
-extern "C" void CallPrint(const char* fmt, ...)
-{
-    va_list args;
-    va_start(args, fmt);
-    vprintf(fmt, args);
-    // vprintf leaves 'args' indeterminate; it must be ended before the list is walked again.
-    va_end(args);
-
-#if defined(_WIN32)
-    // Restart the argument list for the second consumer instead of reusing the spent one.
-    va_start(args, fmt);
-    char strBuf[1024];
-    vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
-    va_end(args);
-    OutputDebugStringA(strBuf);
-#endif
-}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp
deleted file mode 100644
index 925d57f5d47..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file Scatter.cpp
- *
- * @brief Shader support library implementation for scatter emulation
- *
- * Notes:
- *
- ******************************************************************************/
-#include <stdarg.h>
-#include "common/os.h"
-#include "common/simdlib.hpp"
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Scalar emulation of an 8-lane float scatter.
-/// @param pBase    - base address the scaled indices are added to
-/// @param vIndices - eight 32-bit indices, treated as unsigned
-/// @param vSrc     - eight floats to store
-/// @param mask     - bit i set means lane i is written
-/// @param scale    - multiplier applied to each index, giving a byte offset from pBase
-extern "C" void ScatterPS_256(uint8_t* pBase, SIMD256::Integer vIndices, SIMD256::Float vSrc, uint8_t mask, uint32_t scale)
-{
-    OSALIGN(float, 32) src[8];
-    OSALIGN(uint32_t, 32) indices[8];
-
-    // Spill both vectors so individual lanes can be addressed.
-    SIMD256::store_ps(src, vSrc);
-    SIMD256::store_si((SIMD256::Integer*)indices, vIndices);
-
-    // Visit the set mask bits from lowest to highest, clearing each once handled.
-    unsigned long index;
-    while (_BitScanForward(&index, mask))
-    {
-        mask &= ~(1 << index);
-
-        // Widen the index before scaling: a 32-bit product wraps for large offsets, whereas
-        // the native AVX512 lowering zero-extends the indices to 64 bits first.
-        *(float*)(pBase + static_cast<uint64_t>(indices[index]) * scale) = src[index];
-    }
-}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
deleted file mode 100644
index 72e1261a4b3..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ /dev/null
@@ -1,379 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file streamout_jit.cpp
- *
- * @brief Implementation of the streamout jitter
- *
- * Notes:
- *
- ******************************************************************************/
-#include "jit_pch.hpp"
-#include "builder_gfx_mem.h"
-#include "jit_api.h"
-#include "streamout_jit.h"
-#include "gen_state_llvm.h"
-#include "functionpasses/passes.h"
-
-using namespace llvm;
-using namespace SwrJit;
-
-//////////////////////////////////////////////////////////////////////////
-/// Interface to Jitting a fetch shader
-//////////////////////////////////////////////////////////////////////////
-struct StreamOutJit : public BuilderGfxMem
-{
-    StreamOutJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr){};
-
-    // returns pointer to SWR_STREAMOUT_BUFFER
-    Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
-    {
-        return LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer});
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    // @brief checks if streamout buffer is oob
-    // @return <i1> true/false
-    Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
-    {
-        Value* returnMask = C(false);
-
-        Value* pBuf = getSOBuffer(pSoCtx, buffer);
-
-        // load enable
-        // @todo bool data types should generate <i1> llvm type
-        Value* enabled = TRUNC(LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_enable}), IRB()->getInt1Ty());
-
-        // load buffer size
-        Value* bufferSize = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_bufferSize});
-
-        // load current streamOffset
-        Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
-
-        // load buffer pitch
-        Value* pitch = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch});
-
-        // buffer is considered oob if in use in a decl but not enabled
-        returnMask = OR(returnMask, NOT(enabled));
-
-        // buffer is oob if cannot fit a prims worth of verts
-        Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
-        returnMask       = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
-
-        return returnMask;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
-    //        packing the active mask bits
-    //        ex. bitmask 0011 -> (0, 1, 0, 0)
-    //            bitmask 1000 -> (3, 0, 0, 0)
-    //            bitmask 1100 -> (2, 3, 0, 0)
-    Value* PackMask(uint32_t bitmask)
-    {
-        std::vector<Constant*> indices(4, C(0));
-        unsigned long          index;
-        uint32_t               elem = 0;
-        while (_BitScanForward(&index, bitmask))
-        {
-            indices[elem++] = C((int)index);
-            bitmask &= ~(1 << index);
-        }
-
-        return ConstantVector::get(indices);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    // @brief convert scalar bitmask to <4xfloat> bitmask
-    Value* ToMask(uint32_t bitmask)
-    {
-        std::vector<Constant*> indices;
-        for (uint32_t i = 0; i < 4; ++i)
-        {
-            if (bitmask & (1 << i))
-            {
-                indices.push_back(C(true));
-            }
-            else
-            {
-                indices.push_back(C(false));
-            }
-        }
-        return ConstantVector::get(indices);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    // @brief processes a single decl from the streamout stream. Reads 4 components from the input
-    //        stream and writes N components to the output buffer given the componentMask or if
-    //        a hole, just increments the buffer pointer
-    // @param pStream - pointer to current attribute
-    // @param pOutBuffers - pointers to the current location of each output buffer
-    // @param decl - input decl
-    void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
-    {
-        uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
-        uint32_t packedMask    = (1 << numComponents) - 1;
-        if (!decl.hole)
-        {
-            // increment stream pointer to correct slot
-            Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));
-
-            // load 4 components from stream
-            Type* simd4Ty    = getVectorType(IRB()->getFloatTy(), 4);
-            Type* simd4PtrTy = PointerType::get(simd4Ty, 0);
-            pAttrib          = BITCAST(pAttrib, simd4PtrTy);
-            Value* vattrib   = LOAD(pAttrib);
-
-            // shuffle/pack enabled components
-            Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));
-
-            // store to output buffer
-            // cast SO buffer to i8*, needed by maskstore
-            Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(simd4Ty, 0));
-
-            // cast input to <4xfloat>
-            Value* src = BITCAST(vpackedAttrib, simd4Ty);
-
-            // cast mask to <4xi1>
-            Value* mask = ToMask(packedMask);
-            MASKED_STORE(src, pOut, 4, mask, PointerType::get(simd4Ty, 0), MEM_CLIENT::GFX_MEM_CLIENT_STREAMOUT);
-        }
-
-        // increment SO buffer
-        pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents));
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    // @brief builds a single vertex worth of data for the given stream
-    // @param streamState - state for this stream
-    // @param pCurVertex - pointer to src stream vertex data
-    // @param pOutBuffer - pointers to up to 4 SO buffers
-    void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4])
-    {
-        for (uint32_t d = 0; d < streamState.numDecls; ++d)
-        {
-            const STREAMOUT_DECL& decl = streamState.decl[d];
-            buildDecl(pCurVertex, pOutBuffer, decl);
-        }
-    }
-
-    void buildStream(const STREAMOUT_COMPILE_STATE& state,
-                     const STREAMOUT_STREAM&        streamState,
-                     Value*                         pSoCtx,
-                     BasicBlock*                    returnBB,
-                     Function*                      soFunc)
-    {
-        // get list of active SO buffers
-        std::unordered_set<uint32_t> activeSOBuffers;
-        for (uint32_t d = 0; d < streamState.numDecls; ++d)
-        {
-            const STREAMOUT_DECL& decl = streamState.decl[d];
-            activeSOBuffers.insert(decl.bufferIndex);
-        }
-
-        // always increment numPrimStorageNeeded
-        Value* numPrimStorageNeeded = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded});
-        numPrimStorageNeeded        = ADD(numPrimStorageNeeded, C(1));
-        STORE(numPrimStorageNeeded, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded});
-
-        // check OOB on active SO buffers.  If any buffer is out of bound, don't write
-        // the primitive to any buffer
-        Value* oobMask = C(false);
-        for (uint32_t buffer : activeSOBuffers)
-        {
-            oobMask = OR(oobMask, oob(state, pSoCtx, buffer));
-        }
-
-        BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc);
-
-        // early out if OOB
-        COND_BR(oobMask, returnBB, validBB);
-
-        IRB()->SetInsertPoint(validBB);
-
-        Value* numPrimsWritten = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten});
-        numPrimsWritten        = ADD(numPrimsWritten, C(1));
-        STORE(numPrimsWritten, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten});
-
-        // compute start pointer for each output buffer
-        Value* pOutBuffer[4];
-        Value* pOutBufferStartVertex[4];
-        Value* outBufferPitch[4];
-        for (uint32_t b : activeSOBuffers)
-        {
-            Value* pBuf              = getSOBuffer(pSoCtx, b);
-            Value* pData             = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pBuffer});
-            Value* streamOffset      = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
-            pOutBuffer[b] = GEP(pData, streamOffset, PointerType::get(IRB()->getInt32Ty(), 0));
-            pOutBufferStartVertex[b] = pOutBuffer[b];
-
-            outBufferPitch[b] = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch});
-        }
-
-        // loop over the vertices of the prim
-        Value* pStreamData = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pPrimData});
-        for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)
-        {
-            buildVertex(streamState, pStreamData, pOutBuffer);
-
-            // increment stream and output buffer pointers
-            // stream verts are always 32*4 dwords apart
-            pStreamData = GEP(pStreamData, C(SWR_VTX_NUM_SLOTS * 4));
-
-            // output buffers offset using pitch in buffer state
-            for (uint32_t b : activeSOBuffers)
-            {
-                pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);
-                pOutBuffer[b]            = pOutBufferStartVertex[b];
-            }
-        }
-
-        // update each active buffer's streamOffset
-        for (uint32_t b : activeSOBuffers)
-        {
-            Value* pBuf         = getSOBuffer(pSoCtx, b);
-            Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
-            streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));
-            STORE(streamOffset, pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
-        }
-    }
-
-    Function* Create(const STREAMOUT_COMPILE_STATE& state)
-    {
-        std::stringstream fnName("SO_",
-                                 std::ios_base::in | std::ios_base::out | std::ios_base::ate);
-        fnName << ComputeCRC(0, &state, sizeof(state));
-
-        std::vector<Type*> args{
-            mInt8PtrTy,
-            mInt8PtrTy,
-            PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
-        };
-
-        FunctionType* fTy    = FunctionType::get(IRB()->getVoidTy(), args, false);
-        Function*     soFunc = Function::Create(
-            fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
-
-        soFunc->getParent()->setModuleIdentifier(soFunc->getName());
-
-        // create return basic block
-        BasicBlock* entry    = BasicBlock::Create(JM()->mContext, "entry", soFunc);
-        BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
-
-        IRB()->SetInsertPoint(entry);
-
-        // arguments
-        auto   argitr = soFunc->arg_begin();
-
-        Value* privateContext = &*argitr++;
-        privateContext->setName("privateContext");
-        SetPrivateContext(privateContext);
-
-        mpWorkerData = &*argitr;
-        ++argitr;
-        mpWorkerData->setName("pWorkerData");
-
-        Value* pSoCtx = &*argitr++;
-        pSoCtx->setName("pSoCtx");
-
-        const STREAMOUT_STREAM& streamState = state.stream;
-        buildStream(state, streamState, pSoCtx, returnBB, soFunc);
-
-        BR(returnBB);
-
-        IRB()->SetInsertPoint(returnBB);
-        RET_VOID();
-
-        JitManager::DumpToFile(soFunc, "SoFunc");
-
-        ::FunctionPassManager passes(JM()->mpCurrentModule);
-
-        passes.add(createBreakCriticalEdgesPass());
-        passes.add(createCFGSimplificationPass());
-        passes.add(createEarlyCSEPass());
-        passes.add(createPromoteMemoryToRegisterPass());
-        passes.add(createCFGSimplificationPass());
-        passes.add(createEarlyCSEPass());
-        passes.add(createInstructionCombiningPass());
-#if LLVM_VERSION_MAJOR <= 11
-        passes.add(createConstantPropagationPass());
-#endif
-        passes.add(createSCCPPass());
-        passes.add(createAggressiveDCEPass());
-
-        passes.add(createLowerX86Pass(this));
-
-        passes.run(*soFunc);
-
-        JitManager::DumpToFile(soFunc, "SoFunc_optimized");
-
-
-        return soFunc;
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief JITs from streamout shader IR
-/// @param hJitMgr - JitManager handle
-/// @param func   - LLVM function IR
-/// @return PFN_SO_FUNC - pointer to SOS function
-PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
-{
-    llvm::Function* func    = (llvm::Function*)hFunc;
-    JitManager*     pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
-    PFN_SO_FUNC     pfnStreamOut;
-    pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
-    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
-    // add new IR to the module
-    pJitMgr->mIsModuleFinalized = true;
-
-    pJitMgr->DumpAsm(func, "SoFunc_optimized");
-
-
-    return pfnStreamOut;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief JIT compiles streamout shader
-/// @param hJitMgr - JitManager handle
-/// @param state   - SO state to build function from
-extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE                         hJitMgr,
-                                                   const STREAMOUT_COMPILE_STATE& state)
-{
-    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
-
-    STREAMOUT_COMPILE_STATE soState = state;
-    if (soState.offsetAttribs)
-    {
-        for (uint32_t i = 0; i < soState.stream.numDecls; ++i)
-        {
-            soState.stream.decl[i].attribSlot -= soState.offsetAttribs;
-        }
-    }
-
-    pJitMgr->SetupNewModule();
-
-    StreamOutJit theJit(pJitMgr);
-    HANDLE       hFunc = theJit.Create(soState);
-
-    return JitStreamoutFunc(hJitMgr, hFunc);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h
deleted file mode 100644
index d76fcdd5742..00000000000
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file streamout_jit.h
- *
- * @brief Definition of the streamout jitter
- *
- * Notes:
- *
- ******************************************************************************/
-#pragma once
-
-#include "common/formats.h"
-#include "core/state.h"
-
-//////////////////////////////////////////////////////////////////////////
-/// STREAMOUT_DECL - Stream decl
-//////////////////////////////////////////////////////////////////////////
-struct STREAMOUT_DECL
-{
-    // Buffer that stream maps to.
-    DWORD bufferIndex;
-
-    // attribute to stream
-    uint32_t attribSlot;
-
-    // attribute component mask
-    uint32_t componentMask;
-
-    // indicates this decl is a hole
-    bool hole;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// STREAMOUT_STREAM - Stream decls
-//////////////////////////////////////////////////////////////////////////
-struct STREAMOUT_STREAM
-{
-    // number of decls for this stream
-    uint32_t numDecls;
-
-    // array of numDecls decls
-    STREAMOUT_DECL decl[128];
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// State required for streamout jit
-//////////////////////////////////////////////////////////////////////////
-struct STREAMOUT_COMPILE_STATE
-{
-    // number of verts per primitive
-    uint32_t numVertsPerPrim;
-    uint32_t
-        offsetAttribs; ///< attrib offset to subtract from all STREAMOUT_DECL::attribSlot values.
-
-    uint64_t streamMask;
-
-    // stream decls
-    STREAMOUT_STREAM stream;
-
-    bool operator==(const STREAMOUT_COMPILE_STATE& other) const
-    {
-        if (numVertsPerPrim != other.numVertsPerPrim)
-            return false;
-        if (stream.numDecls != other.stream.numDecls)
-            return false;
-
-        for (uint32_t i = 0; i < stream.numDecls; ++i)
-        {
-            if (stream.decl[i].bufferIndex != other.stream.decl[i].bufferIndex)
-                return false;
-            if (stream.decl[i].attribSlot != other.stream.decl[i].attribSlot)
-                return false;
-            if (stream.decl[i].componentMask != other.stream.decl[i].componentMask)
-                return false;
-            if (stream.decl[i].hole != other.stream.decl[i].hole)
-                return false;
-        }
-
-        return true;
-    }
-};
diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
deleted file mode 100644
index 6a528b6a0f2..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
+++ /dev/null
@@ -1,305 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file ClearTile.cpp
-*
-* @brief Functionality for ClearTile. StoreHotTileClear clears a single macro
-*        tile in the destination.
-*
-******************************************************************************/
-#include "common/os.h"
-#include "core/context.h"
-#include "common/formats.h"
-#include "memory/TilingFunctions.h"
-#include "memory/tilingtraits.h"
-#include "memory/Convert.h"
-
-typedef void(*PFN_STORE_TILES_CLEAR)(const float*, SWR_SURFACE_STATE*, UINT, UINT, uint32_t);
-
-//////////////////////////////////////////////////////////////////////////
-/// Clear Raster Tile Function Tables.
-//////////////////////////////////////////////////////////////////////////
-static PFN_STORE_TILES_CLEAR sStoreTilesClearColorTable[NUM_SWR_FORMATS];
-
-static PFN_STORE_TILES_CLEAR sStoreTilesClearDepthTable[NUM_SWR_FORMATS];
-
-//////////////////////////////////////////////////////////////////////////
-/// StoreRasterTileClear
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct StoreRasterTileClear
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores an 8x8 raster tile to the destination surface.
-    /// @param pColor - Pointer to clear color.
-    /// @param pDstSurface - Destination surface state
-    /// @param x, y - Coordinates to raster tile.
-    INLINE static void StoreClear(
-        const uint8_t* dstFormattedColor,
-        UINT dstBytesPerPixel,
-        SWR_SURFACE_STATE* pDstSurface,
-        UINT x, UINT y, // (x, y) pixel coordinate to start of raster tile.
-        uint32_t renderTargetArrayIndex)
-    {
-        // If we're outside of the surface, stop.
-        uint32_t lodWidth = std::max<uint32_t>(pDstSurface->width >> pDstSurface->lod, 1U);
-        uint32_t lodHeight = std::max<uint32_t>(pDstSurface->height >> pDstSurface->lod, 1U);
-        if (x >= lodWidth || y >= lodHeight)
-            return;
-
-        // Compute destination address for raster tile.
-        uint8_t* pDstTile = (uint8_t*)ComputeSurfaceAddress<false, false>(
-                x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
-                pDstSurface->arrayIndex + renderTargetArrayIndex,
-                0, // sampleNum
-                pDstSurface->lod,
-                pDstSurface);
-
-        // start of first row
-        uint8_t* pDst = pDstTile;
-        UINT dstBytesPerRow = 0;
-
-        // For each raster tile pixel in row 0 (rx, 0)
-        for (UINT rx = 0; (rx < KNOB_TILE_X_DIM) && ((x + rx) < lodWidth); ++rx)
-        {
-            memcpy(pDst, dstFormattedColor, dstBytesPerPixel);
-
-            // Increment pointer to next pixel in row.
-            pDst += dstBytesPerPixel;
-            dstBytesPerRow += dstBytesPerPixel;
-        }
-
-        // start of second row
-        pDst = pDstTile + pDstSurface->pitch;
-
-        // For each remaining row in the rest of the raster tile
-        for (UINT ry = 1; (ry < KNOB_TILE_Y_DIM) && ((y + ry) < lodHeight); ++ry)
-        {
-            // copy row
-            memcpy(pDst, pDstTile, dstBytesPerRow);
-
-            // Increment pointer to first pixel in next row.
-            pDst += pDstSurface->pitch;
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// StoreMacroTileClear - Stores a macro tile clear to its raster tiles.
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct StoreMacroTileClear
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores a macrotile to the destination surface.
-    /// @param pColor - Pointer to color to write to pixels.
-    /// @param pDstSurface - Destination surface state
-    /// @param x, y - Coordinates to macro tile
-    static void StoreClear(
-        const float *pColor,
-        SWR_SURFACE_STATE* pDstSurface,
-        UINT x, UINT y, uint32_t renderTargetArrayIndex)
-    {
-        UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8);
-
-        uint8_t dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
-
-        float srcColor[4];
-
-        for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
-        {
-            srcColor[comp] = pColor[FormatTraits<DstFormat>::swizzle(comp)];
-        }
-
-        // using this helper function, but the Tiling Traits is unused inside it so just using a dummy value
-        ConvertPixelFromFloat<DstFormat>(dstFormattedColor, srcColor);
-
-        // Store each raster tile from the hot tile to the destination surface.
-        // TODO:  Put in check for partial coverage on x/y -- SWR_ASSERT if it happens.
-        //        Intent is for this function to only handle full tiles.
-        for (UINT row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-        {
-            for (UINT col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-            {
-                StoreRasterTileClear<SrcFormat, DstFormat>::StoreClear(dstFormattedColor, dstBytesPerPixel, pDstSurface, (x + col), (y + row), renderTargetArrayIndex);
-            }
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Writes clear color to every pixel of a render surface
-/// @param hPrivateContext - Handle to private DC
-/// @param renderTargetIndex - Index to destination render target
-/// @param x, y - Coordinates to raster tile.
-/// @param pClearColor - Pointer to clear color
-void SwrStoreHotTileClear(
-    HANDLE hWorkerPrivateData,
-    SWR_SURFACE_STATE *pDstSurface,
-    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    UINT x,
-    UINT y,
-    uint32_t renderTargetArrayIndex,
-    const float* pClearColor)
-{
-    PFN_STORE_TILES_CLEAR pfnStoreTilesClear = NULL;
-
-    if (renderTargetIndex == SWR_ATTACHMENT_STENCIL)
-    {
-        SWR_ASSERT(pDstSurface->format == R8_UINT);
-        pfnStoreTilesClear = StoreMacroTileClear<R8_UINT, R8_UINT>::StoreClear;
-    }
-    else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH)
-    {
-        pfnStoreTilesClear = sStoreTilesClearDepthTable[pDstSurface->format];
-    }
-    else
-    {
-        pfnStoreTilesClear = sStoreTilesClearColorTable[pDstSurface->format];
-    }
-
-    SWR_ASSERT(pfnStoreTilesClear != NULL);
-
-    // Store a macro tile.
-    /// @todo Once all formats are supported then if check can go away. This is to help us near term to make progress.
-    if (pfnStoreTilesClear != NULL)
-    {
-        pfnStoreTilesClear(pClearColor, pDstSurface, x, y, renderTargetArrayIndex);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
-#define INIT_STORE_TILES_CLEAR_COLOR_TABLE() \
-    memset(sStoreTilesClearColorTable, 0, sizeof(sStoreTilesClearColorTable)); \
-    \
-    sStoreTilesClearColorTable[R32G32B32A32_FLOAT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::StoreClear; \
-    sStoreTilesClearColorTable[R32G32B32A32_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_SINT>::StoreClear; \
-    sStoreTilesClearColorTable[R32G32B32A32_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[R32G32B32X32_FLOAT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::StoreClear; \
-    sStoreTilesClearColorTable[R32G32B32_FLOAT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_FLOAT>::StoreClear; \
-    sStoreTilesClearColorTable[R32G32B32_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_SINT>::StoreClear; \
-    sStoreTilesClearColorTable[R32G32B32_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16B16A16_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16B16A16_SNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16B16A16_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_SINT>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16B16A16_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16B16A16_FLOAT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::StoreClear; \
-    sStoreTilesClearColorTable[R32G32_FLOAT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_FLOAT>::StoreClear; \
-    sStoreTilesClearColorTable[R32G32_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_SINT>::StoreClear; \
-    sStoreTilesClearColorTable[R32G32_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16B16X16_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16B16X16_FLOAT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::StoreClear; \
-    sStoreTilesClearColorTable[B8G8R8A8_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[B8G8R8A8_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[R10G10B10A2_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R10G10B10A2_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[R10G10B10A2_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8B8A8_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8B8A8_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8B8A8_SNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8B8A8_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_SINT>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8B8A8_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16_SNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_SNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_SINT>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16_FLOAT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_FLOAT>::StoreClear; \
-    sStoreTilesClearColorTable[B10G10R10A2_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[B10G10R10A2_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[R11G11B10_FLOAT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreClear; \
-    sStoreTilesClearColorTable[R32_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_SINT>::StoreClear; \
-    sStoreTilesClearColorTable[R32_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[R32_FLOAT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_FLOAT>::StoreClear; \
-    sStoreTilesClearColorTable[A32_FLOAT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, A32_FLOAT>::StoreClear; \
-    sStoreTilesClearColorTable[B8G8R8X8_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[B8G8R8X8_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8B8X8_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8B8X8_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[B10G10R10X2_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[B5G6R5_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G6R5_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[B5G6R5_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[B5G5R5A1_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[B5G5R5A1_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[B4G4R4A4_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[B4G4R4A4_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8_SNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_SNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_SINT>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[R16_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R16_SNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_SNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R16_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_SINT>::StoreClear; \
-    sStoreTilesClearColorTable[R16_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[R16_FLOAT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_FLOAT>::StoreClear; \
-    sStoreTilesClearColorTable[A16_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, A16_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[A16_FLOAT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, A16_FLOAT>::StoreClear; \
-    sStoreTilesClearColorTable[B5G5R5X1_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[B5G5R5X1_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[R8_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R8_SNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_SNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R8_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_SINT>::StoreClear; \
-    sStoreTilesClearColorTable[R8_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[A8_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, A8_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[BC1_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, BC1_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[BC2_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, BC2_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[BC3_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, BC3_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[BC4_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, BC4_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[BC5_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, BC5_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[BC1_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[BC2_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[BC3_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8B8_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8B8_SNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SNORM>::StoreClear; \
-    sStoreTilesClearColorTable[BC4_SNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, BC4_SNORM>::StoreClear; \
-    sStoreTilesClearColorTable[BC5_SNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, BC5_SNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16B16_FLOAT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_FLOAT>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16B16_UNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_UNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16B16_SNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_SNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8B8_UNORM_SRGB]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16B16_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[R16G16B16_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_SINT>::StoreClear; \
-    sStoreTilesClearColorTable[R10G10B10A2_SNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreClear; \
-    sStoreTilesClearColorTable[R10G10B10A2_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreClear; \
-    sStoreTilesClearColorTable[B10G10R10A2_SNORM]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreClear; \
-    sStoreTilesClearColorTable[B10G10R10A2_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[B10G10R10A2_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8B8_UINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UINT>::StoreClear; \
-    sStoreTilesClearColorTable[R8G8B8_SINT]      = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SINT>::StoreClear;
-
-//////////////////////////////////////////////////////////////////////////
-/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
-#define INIT_STORE_TILES_CLEAR_DEPTH_TABLE() \
-    memset(sStoreTilesClearDepthTable, 0, sizeof(sStoreTilesClearDepthTable)); \
-    \
-    sStoreTilesClearDepthTable[R32_FLOAT] = StoreMacroTileClear<R32_FLOAT, R32_FLOAT>::StoreClear; \
-    sStoreTilesClearDepthTable[R32_FLOAT_X8X24_TYPELESS] = StoreMacroTileClear<R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::StoreClear; \
-    sStoreTilesClearDepthTable[R24_UNORM_X8_TYPELESS] = StoreMacroTileClear<R32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreClear; \
-    sStoreTilesClearDepthTable[R16_UNORM] = StoreMacroTileClear<R32_FLOAT, R16_UNORM>::StoreClear;
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Sets up tables for ClearTile
-///        Expands the color table init macro followed by the depth table
-///        init macro; takes no arguments and returns nothing.
-void InitSimClearTilesTable()
-{
-    INIT_STORE_TILES_CLEAR_COLOR_TABLE();
-    INIT_STORE_TILES_CLEAR_DEPTH_TABLE();
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
deleted file mode 100644
index c8c6b30daff..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/Convert.h
+++ /dev/null
@@ -1,730 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file Convert.h
-* 
-* @brief Conversion utility functions
-* 
-******************************************************************************/
-#pragma once
-
-#if defined(_MSC_VER)
-// disable "potential divide by 0"
-#pragma warning(disable: 4723)
-#endif
-
-#include <cmath>
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Widen a 16-bit float bit pattern (1 sign, 5 exponent and 10
-///        mantissa bits) to a 32-bit single precision float.
-/// @param val - 16-bit float encoding; only bits 0..15 are examined
-/// @return float whose bit pattern is the widened encoding
-/// @todo Maybe move this outside of this file into a header?
-static INLINE float ConvertSmallFloatTo32(UINT val)
-{
-    const uint32_t signBit   = ((uint32_t)val & 0x8000) << 16;
-    const uint32_t expField  = (val >> 10) & 0x1f;
-    const uint32_t fracField = val & 0x3ff;
-
-    UINT bits;
-    if (expField == 0x1f)
-    {
-        // Infinity keeps an empty mantissa; every NaN becomes 0x7fc00000.
-        bits = ((fracField == 0) ? 0x7f800000 : 0x7fc00000) | signBit;
-    }
-    else if ((expField | fracField) == 0)
-    {
-        // Signed zero: only the sign bit survives.
-        bits = signBit;
-    }
-    else
-    {
-        uint32_t biasedExp = expField;
-        uint32_t frac      = fracField << 13;
-
-        // Zero was handled above, so a zero exponent here means a denormal
-        // with a non-zero mantissa. Shift until the leading one reaches the
-        // implicit-bit position, lowering the exponent once per extra shift.
-        if (biasedExp == 0)
-        {
-            for (frac <<= 1; frac < (0x400 << 13); frac <<= 1)
-            {
-                biasedExp--;
-            }
-            frac &= (0x3ff << 13);
-        }
-
-        // Rebias from 15 to 127; the mask discards the wrap-around from
-        // decrementing the unsigned exponent below zero.
-        bits = signBit | (((biasedExp - 15 + 127) & 0xff) << 23) | frac;
-    }
-
-    return *(float*)&bits;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Convert an IEEE 754 32-bit single precision float to an
-///        unsigned small float with 5 exponent bits and a variable
-///        number of mantissa bits.
-///        Any input with the sign bit set (the sign is tested first, so
-///        this includes negative NaN and -INF) becomes 0. Positive NaN and
-///        INF keep exponent 0x1F, finite values above the range saturate
-///        to the largest finite encoding, and values below the denormal
-///        range become 0.
-/// @tparam numMantissaBits - mantissa width of the result
-/// @param val - 32-bit float
-/// @return (exponent << numMantissaBits) | mantissa
-/// @todo Maybe move this outside of this file into a header?
-template<UINT numMantissaBits>
-static UINT Convert32ToSmallFloat(float val)
-{
-    // Low float32 mantissa bits that do not fit in the result. The rounding
-    // test must cover all of them; a fixed 13-bit mask is only right for a
-    // 10-bit mantissa and would round narrower formats up too early.
-    const uint32_t droppedBits = 23 - numMantissaBits;
-    const uint32_t droppedMask = (1u << droppedBits) - 1;
-
-    uint32_t sign, exp, mant;
-    uint32_t roundBits;
-
-    // Extract the sign, exponent, and mantissa
-    UINT uf = *(UINT*)&val;
-
-    sign = (uf & 0x80000000) >> 31;
-    exp = (uf & 0x7F800000) >> 23;
-    mant = uf & 0x007FFFFF;
-
-    // 10/11 bit floats are unsigned.  Negative values are clamped to 0.
-    if (sign != 0)
-    {
-        exp = mant = 0;
-    }
-    // Check for out of range
-    else if ((exp == 0xFF) && (mant != 0)) // NaN
-    {
-        exp = 0x1F;
-        mant = 1 << numMantissaBits;
-    }
-    else if ((exp == 0xFF) && (mant == 0)) // INF
-    {
-        exp = 0x1F;
-        mant = 0;
-    }
-    else if (exp > (0x70 + 0x1E)) // Too big to represent
-    {
-        exp = 0x1Eu;
-        mant = (1 << numMantissaBits) - 1;  // 0x3F for 6 bit mantissa.
-    }
-    else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
-    {
-        // Restore the implicit leading one, then shift right once for every
-        // exponent step up to 0x71.
-        mant |= 0x00800000;
-        for (; exp <= 0x70; mant >>= 1, exp++)
-            ;
-        exp = 0;
-        mant = mant >> droppedBits;
-    }
-    else if (exp < 0x66) // Too small to represent -> Zero
-    {
-        exp = 0;
-        mant = 0;
-    }
-    else
-    {
-        // Saves bits that will be shifted off for rounding
-        roundBits = mant & droppedMask;
-        // convert exponent and mantissa to the small float format
-        exp = exp - 0x70u;
-        mant = mant >> droppedBits;
-
-        // Essentially RTZ, but round up if off by only 1 lsb, i.e. when
-        // every dropped bit is set
-        if (roundBits == droppedMask)
-        {
-            mant++;
-            // check for overflow out of the mantissa field
-            // (0x3 << numMantissaBits is 0x60 for a 5 bit mantissa)
-            if ((mant & (0x3 << numMantissaBits)) != 0)
-                exp++;
-            // make sure only the needed bits are used
-            mant &= (1 << numMantissaBits) - 1;
-        }
-    }
-
-    UINT tmpVal = (exp << numMantissaBits) | mant;
-    return tmpVal;
-}
-
-#if KNOB_ARCH == KNOB_ARCH_AVX
-//////////////////////////////////////////////////////////////////////////
-/// @brief Software conversion of an IEEE 754 32-bit single precision
-///        float to a 16-bit float with 1 sign, 5 exponent and 10 mantissa
-///        bits.
-///        NaN is returned as exponent 0x1F, mantissa 0x200 with the sign
-///        bit forced on. Finite values above the 16-bit range saturate to
-///        the largest finite encoding, and values below the denormal range
-///        become a (signed) zero.
-/// @param val - 32-bit float
-/// @return 16-bit float bit pattern
-/// @todo Maybe move this outside of this file into a header?
-static uint16_t Convert32To16Float(float val)
-{
-    const uint32_t raw = *(uint32_t*)&val;
-
-    uint32_t signBit   = raw >> 31;
-    uint32_t expField  = (raw >> 23) & 0xFF;
-    uint32_t fracField = raw & 0x007FFFFF;
-
-    if (std::isnan(val))
-    {
-        // set the sign bit for NANs
-        signBit   = 1;
-        expField  = 0x1F;
-        fracField = 0x200;
-    }
-    else if (std::isinf(val))
-    {
-        expField  = 0x1f;
-        fracField = 0x0;
-    }
-    else if (expField > (0x70 + 0x1E))
-    {
-        // Too big to represent -> max representable value
-        expField  = 0x1E;
-        fracField = 0x3FF;
-    }
-    else if (expField < 0x66)
-    {
-        // Too small to represent -> Zero
-        expField  = 0;
-        fracField = 0;
-    }
-    else if (expField <= 0x70)
-    {
-        // Denorm: restore the implicit one, shift right once per exponent
-        // step up to 0x71, then drop the 13 surplus mantissa bits.
-        fracField = (fracField | 0x00800000) >> ((0x71 - expField) + 13);
-        expField  = 0;
-    }
-    else
-    {
-        // Essentially RTZ, but round up if off by only 1 lsb, i.e. when all
-        // 13 bits about to be shifted off are set.
-        const bool bumpUp = ((fracField & 0x1FFFu) == 0x1FFFu);
-
-        expField -= 0x70;
-        fracField >>= 13;
-
-        if (bumpUp)
-        {
-            ++fracField;
-            // a carry out of the 10 bit mantissa moves into the exponent
-            if ((fracField & 0xC00u) != 0)
-            {
-                ++expField;
-            }
-            fracField &= 0x3FF;
-        }
-    }
-
-    return (uint16_t)((signBit << 15) | (expField << 10) | fracField);
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Retrieve color from hot tile source which is always float.
-///        Each of the format's numComps components is converted according
-///        to FormatTraits<DstFormat>::GetType(comp) and then written to the
-///        r/g/b/a members of the destination FormatT.
-/// @param pDstPixel - Pointer to destination pixel.
-/// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest).
-template<SWR_FORMAT DstFormat>
-static void ConvertPixelFromFloat(
-    uint8_t* pDstPixel,
-    const float srcPixel[4])
-{
-    uint32_t outColor[4] = { 0 };  // typeless bits
-
-    // Store component
-    for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
-    {
-        SWR_TYPE type = FormatTraits<DstFormat>::GetType(comp);
-
-        float src = srcPixel[comp];
-
-        switch (type)
-        {
-        case SWR_TYPE_UNORM:
-        {
-            // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false.
-            src = (src != src) ? 0.0f : src;
-
-            // Clamp [0, 1]
-            src = std::max(src, 0.0f);
-            src = std::min(src, 1.0f);
-
-            // SRGB
-            // Applied to every component except index 3.
-            if (FormatTraits<DstFormat>::isSRGB && comp != 3)
-            {
-                src = (src <= 0.0031308f) ? (12.92f * src) : (1.055f * powf(src, (1.0f / 2.4f)) - 0.055f);
-            }
-
-            // Float scale to integer scale.
-            // scale is the all-ones value for this component's bit count.
-            UINT scale = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1;
-            src = (float)scale * src;
-            src = roundf(src);
-            outColor[comp] = (UINT)src; // Drop fractional part.
-            break;
-        }
-        case SWR_TYPE_SNORM:
-        {
-            SWR_ASSERT(!FormatTraits<DstFormat>::isSRGB);
-
-            // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false.
-            src = (src != src) ? 0.0f : src;
-
-            // Clamp [-1, 1]
-            src = std::max(src, -1.0f);
-            src = std::min(src, 1.0f);
-
-            // Float scale to integer scale.
-            // One bit is reserved for the sign, hence GetBPC(comp) - 1.
-            UINT scale = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1;
-            src = (float)scale * src;
-
-            // Round
-            // Adding +/-0.5 before the truncating cast rounds half away from zero.
-            src += (src >= 0) ? 0.5f : -0.5f;
-
-            INT out = (INT)src;
-
-            // Store the two's complement bits unchanged.
-            outColor[comp] = *(UINT*)&out;
-
-            break;
-        }
-        case SWR_TYPE_UINT:
-        {
-            ///@note The *(UINT*)& is currently necessary as the hot tile appears to always be float.
-            //       However, the number in the hot tile should be unsigned integer. So doing this
-            //       to preserve bits instead of doing a float -> integer conversion.
-            if (FormatTraits<DstFormat>::GetBPC(comp) == 32)
-            {
-                outColor[comp] = *(UINT*)&src;
-            }
-            else
-            {
-                // Narrower components are clamped to the largest value that fits.
-                outColor[comp] = *(UINT*)&src;
-                UINT max = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1;  // 2^numBits - 1
-
-                outColor[comp] = std::min(max, outColor[comp]);
-            }
-            break;
-        }
-        case SWR_TYPE_SINT:
-        {
-            if (FormatTraits<DstFormat>::GetBPC(comp) == 32)
-            {
-                outColor[comp] = *(UINT*)&src;
-            }
-            else
-            {
-                INT out = *(INT*)&src;  // Hot tile format is SINT?
-                INT max = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1;
-                INT min = -1 - max;
-
-                ///@note The output is unsigned integer (bag of bits) and so performing
-                //       the clamping here based on range of output component. Also, manually adding
-                //       the sign bit in the appropriate spot. Maybe a better way?
-                out = std::max(out, min);
-                out = std::min(out, max);
-
-                outColor[comp] = *(UINT*)&out;
-            }
-            break;
-        }
-        case SWR_TYPE_FLOAT:
-        {
-            if (FormatTraits<DstFormat>::GetBPC(comp) == 16)
-            {
-                // Convert from 32-bit float to 16-bit float using _mm_cvtps_ph
-                // @todo 16bit float instruction support is orthogonal to avx support.  need to
-                // add check for F16C support instead.
-#if KNOB_ARCH >= KNOB_ARCH_AVX2
-                __m128 src128 = _mm_set1_ps(src);
-                __m128i srci128 = _mm_cvtps_ph(src128, _MM_FROUND_TRUNC);
-                UINT value = _mm_extract_epi16(srci128, 0);
-#else
-                // Software fallback when the intrinsic path is not compiled in.
-                UINT value = Convert32To16Float(src);
-#endif
-
-                outColor[comp] = value;
-            }
-            else if (FormatTraits<DstFormat>::GetBPC(comp) == 11)
-            {
-                // 11 bit component: 5 exponent + 6 mantissa bits.
-                outColor[comp] = Convert32ToSmallFloat<6>(src);
-            }
-            else if (FormatTraits<DstFormat>::GetBPC(comp) == 10)
-            {
-                // 10 bit component: 5 exponent + 5 mantissa bits.
-                outColor[comp] = Convert32ToSmallFloat<5>(src);
-            }
-            else
-            {
-                // Any other width: pass the float bits through unchanged.
-                outColor[comp] = *(UINT*)&src;
-            }
-
-            break;
-        }
-        default:
-            SWR_INVALID("Invalid type: %d", type);
-            break;
-        }
-    }
-
-    typename FormatTraits<DstFormat>::FormatT* pPixel = (typename FormatTraits<DstFormat>::FormatT*)pDstPixel;
-
-    // The cases below deliberately fall through: starting at numComps, each
-    // case also stores every lower-numbered component.
-    switch (FormatTraits<DstFormat>::numComps)
-    {
-    case 4:
-        pPixel->a = outColor[3];
-    case 3:
-        pPixel->b = outColor[2];
-    case 2:
-        pPixel->g = outColor[1];
-    case 1:
-        pPixel->r = outColor[0];
-        break;
-    default:
-        SWR_INVALID("Invalid # of comps: %d", FormatTraits<DstFormat>::numComps);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Convert pixel in any format to float32
-///        All four outputs are first seeded with
-///        FormatTraits<SrcFormat>::GetDefault(comp); each component present
-///        in the format is then decoded according to its type and stored at
-///        index FormatTraits<SrcFormat>::swizzle(comp).
-/// @param dstPixel - Receives the four converted components. UINT and SINT
-///                   components are stored as raw integer bits in the float
-///                   slot, not as converted float values.
-/// @param pSrc - Pointer to the packed source pixel.
-template<SWR_FORMAT SrcFormat>
-INLINE static void ConvertPixelToFloat(
-    float dstPixel[4],
-    const uint8_t* pSrc)
-{
-    uint32_t srcColor[4];  // typeless bits
-
-    // unpack src pixel
-    typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc;
-
-    // apply format defaults
-    for (uint32_t comp = 0; comp < 4; ++comp)
-    {
-        uint32_t def = FormatTraits<SrcFormat>::GetDefault(comp);
-        dstPixel[comp] = *(float*)&def;
-    }
-
-    // load format data
-    // The cases deliberately fall through: starting at numComps, each case
-    // also loads every lower-numbered component.
-    switch (FormatTraits<SrcFormat>::numComps)
-    {
-    case 4:
-        srcColor[3] = pPixel->a;
-    case 3:
-        srcColor[2] = pPixel->b;
-    case 2:
-        srcColor[1] = pPixel->g;
-    case 1:
-        srcColor[0] = pPixel->r;
-        break;
-    default:
-        SWR_INVALID("Invalid # of comps: %d", FormatTraits<SrcFormat>::numComps);
-    }
-
-    // Convert components
-    for (uint32_t comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
-    {
-        SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp);
-
-        uint32_t src = srcColor[comp];
-
-        switch (type)
-        {
-        case SWR_TYPE_UNORM:
-        {
-            float dst;
-            if (FormatTraits<SrcFormat>::isSRGB && comp != 3)
-            {
-                // sRGB components (every index except 3) are decoded through
-                // the lookup table, whose entries hold float bit patterns.
-                dst = *(float*)&srgb8Table[src];
-            }
-            else
-            {
-                // component sizes > 16 must use fp divide to maintain ulp requirements
-                if (FormatTraits<SrcFormat>::GetBPC(comp) > 16)
-                {
-                    dst = (float)src / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1);
-                }
-                else
-                {
-                    const float scale = (1.0f / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1));
-                    dst = (float)src * scale;
-                }
-            }
-            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst;
-            break;
-        }
-        case SWR_TYPE_SNORM:
-        {
-            SWR_ASSERT(!FormatTraits<SrcFormat>::isSRGB);
-
-            // Sign-extend the raw field, then scale by 1 / (2^(bpc-1) - 1).
-            float dst;
-            switch (FormatTraits<SrcFormat>::GetBPC(comp))
-            {
-            case 8:
-                dst = (float)((int8_t)src);
-                break;
-            case 16:
-                dst = (float)((int16_t)src);
-                break;
-            case 32:
-                dst = (float)((int32_t)src);
-                break;
-            default:
-                assert(0 && "attempted to load from SNORM with unsupported bpc");
-                dst = 0.0f;
-                break;
-            }
-            dst = dst * (1.0f / ((1 << (FormatTraits<SrcFormat>::GetBPC(comp) - 1)) - 1));
-
-            // The most negative code, -2^(bpc-1), scales to slightly below
-            // -1 and must read back as exactly -1.0. Clamping handles that
-            // for every width, unlike comparing the raw bits against one
-            // fixed constant.
-            if (dst < -1.0f)
-            {
-                dst = -1.0f;
-            }
-            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst;
-            break;
-        }
-        case SWR_TYPE_UINT:
-        {
-            // Integer bits are passed through in the float slot.
-            uint32_t dst = (uint32_t)src;
-            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
-            break;
-        }
-        case SWR_TYPE_SINT:
-        {
-            // Sign-extend to int, then pass the bits through in the float slot.
-            int dst;
-            switch (FormatTraits<SrcFormat>::GetBPC(comp))
-            {
-            case 8:
-                dst = (int8_t)src;
-                break;
-            case 16:
-                dst = (int16_t)src;
-                break;
-            case 32:
-                dst = (int32_t)src;
-                break;
-            default:
-                assert(0 && "attempted to load from SINT with unsupported bpc");
-                dst = 0;
-                break;
-            }
-            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
-            break;
-        }
-        case SWR_TYPE_FLOAT:
-        {
-            float dst;
-            if (FormatTraits<SrcFormat>::GetBPC(comp) == 16)
-            {
-#if KNOB_ARCH >= KNOB_ARCH_AVX2
-                // Convert from 16-bit float to 32-bit float using _mm_cvtph_ps
-                // @todo 16bit float instruction support is orthogonal to avx support.  need to
-                // add check for F16C support instead.
-                __m128i src128 = _mm_set1_epi32(src);
-                __m128 res = _mm_cvtph_ps(src128);
-                _mm_store_ss(&dst, res);
-#else
-                dst = ConvertSmallFloatTo32(src);
-#endif
-            }
-            else if (FormatTraits<SrcFormat>::GetBPC(comp) == 11)
-            {
-                // 6 mantissa bits: shift left by 4 to line up with the
-                // 10 bit mantissa layout ConvertSmallFloatTo32 expects.
-                dst = ConvertSmallFloatTo32(src << 4);
-            }
-            else if (FormatTraits<SrcFormat>::GetBPC(comp) == 10)
-            {
-                // 5 mantissa bits: shift left by 5 for the same reason.
-                dst = ConvertSmallFloatTo32(src << 5);
-            }
-            else
-            {
-                // Any other width: the bits already are a 32-bit float.
-                dst = *(float*)&src;
-            }
-
-            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst;
-            break;
-        }
-        default:
-            SWR_INVALID("Invalid type: %d", type);
-            break;
-        }
-    }
-}
-
-// non-templated version of conversion functions
-INLINE static void ConvertPixelFromFloat(
-    SWR_FORMAT format,
-    uint8_t* pDst,
-    const float srcPixel[4])
-{
-    switch (format)
-    {
-    case R32G32B32A32_FLOAT: ConvertPixelFromFloat<R32G32B32A32_FLOAT>(pDst, srcPixel); break;
-    case R32G32B32A32_SINT: ConvertPixelFromFloat<R32G32B32A32_SINT>(pDst, srcPixel); break;
-    case R32G32B32A32_UINT: ConvertPixelFromFloat<R32G32B32A32_UINT>(pDst, srcPixel); break;
-    case R32G32B32X32_FLOAT: ConvertPixelFromFloat<R32G32B32X32_FLOAT>(pDst, srcPixel); break;
-    case R32G32B32A32_SSCALED: ConvertPixelFromFloat<R32G32B32A32_SSCALED>(pDst, srcPixel); break;
-    case R32G32B32A32_USCALED: ConvertPixelFromFloat<R32G32B32A32_USCALED>(pDst, srcPixel); break;
-    case R32G32B32_FLOAT: ConvertPixelFromFloat<R32G32B32_FLOAT>(pDst, srcPixel); break;
-    case R32G32B32_SINT: ConvertPixelFromFloat<R32G32B32_SINT>(pDst, srcPixel); break;
-    case R32G32B32_UINT: ConvertPixelFromFloat<R32G32B32_UINT>(pDst, srcPixel); break;
-    case R32G32B32_SSCALED: ConvertPixelFromFloat<R32G32B32_SSCALED>(pDst, srcPixel); break;
-    case R32G32B32_USCALED: ConvertPixelFromFloat<R32G32B32_USCALED>(pDst, srcPixel); break;
-    case R16G16B16A16_UNORM: ConvertPixelFromFloat<R16G16B16A16_UNORM>(pDst, srcPixel); break;
-    case R16G16B16A16_SNORM: ConvertPixelFromFloat<R16G16B16A16_SNORM>(pDst, srcPixel); break;
-    case R16G16B16A16_SINT: ConvertPixelFromFloat<R16G16B16A16_SINT>(pDst, srcPixel); break;
-    case R16G16B16A16_UINT: ConvertPixelFromFloat<R16G16B16A16_UINT>(pDst, srcPixel); break;
-    case R16G16B16A16_FLOAT: ConvertPixelFromFloat<R16G16B16A16_FLOAT>(pDst, srcPixel); break;
-    case R32G32_FLOAT: ConvertPixelFromFloat<R32G32_FLOAT>(pDst, srcPixel); break;
-    case R32G32_SINT: ConvertPixelFromFloat<R32G32_SINT>(pDst, srcPixel); break;
-    case R32G32_UINT: ConvertPixelFromFloat<R32G32_UINT>(pDst, srcPixel); break;
-    case R32_FLOAT_X8X24_TYPELESS: ConvertPixelFromFloat<R32_FLOAT_X8X24_TYPELESS>(pDst, srcPixel); break;
-    case X32_TYPELESS_G8X24_UINT: ConvertPixelFromFloat<X32_TYPELESS_G8X24_UINT>(pDst, srcPixel); break;
-    case L32A32_FLOAT: ConvertPixelFromFloat<L32A32_FLOAT>(pDst, srcPixel); break;
-    case R16G16B16X16_UNORM: ConvertPixelFromFloat<R16G16B16X16_UNORM>(pDst, srcPixel); break;
-    case R16G16B16X16_FLOAT: ConvertPixelFromFloat<R16G16B16X16_FLOAT>(pDst, srcPixel); break;
-    case L32X32_FLOAT: ConvertPixelFromFloat<L32X32_FLOAT>(pDst, srcPixel); break;
-    case I32X32_FLOAT: ConvertPixelFromFloat<I32X32_FLOAT>(pDst, srcPixel); break;
-    case R16G16B16A16_SSCALED: ConvertPixelFromFloat<R16G16B16A16_SSCALED>(pDst, srcPixel); break;
-    case R16G16B16A16_USCALED: ConvertPixelFromFloat<R16G16B16A16_USCALED>(pDst, srcPixel); break;
-    case R32G32_SSCALED: ConvertPixelFromFloat<R32G32_SSCALED>(pDst, srcPixel); break;
-    case R32G32_USCALED: ConvertPixelFromFloat<R32G32_USCALED>(pDst, srcPixel); break;
-    case B8G8R8A8_UNORM: ConvertPixelFromFloat<B8G8R8A8_UNORM>(pDst, srcPixel); break;
-    case B8G8R8A8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8A8_UNORM_SRGB>(pDst, srcPixel); break;
-    case R10G10B10A2_UNORM: ConvertPixelFromFloat<R10G10B10A2_UNORM>(pDst, srcPixel); break;
-    case R10G10B10A2_UNORM_SRGB: ConvertPixelFromFloat<R10G10B10A2_UNORM_SRGB>(pDst, srcPixel); break;
-    case R10G10B10A2_UINT: ConvertPixelFromFloat<R10G10B10A2_UINT>(pDst, srcPixel); break;
-    case R8G8B8A8_UNORM: ConvertPixelFromFloat<R8G8B8A8_UNORM>(pDst, srcPixel); break;
-    case R8G8B8A8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8A8_UNORM_SRGB>(pDst, srcPixel); break;
-    case R8G8B8A8_SNORM: ConvertPixelFromFloat<R8G8B8A8_SNORM>(pDst, srcPixel); break;
-    case R8G8B8A8_SINT: ConvertPixelFromFloat<R8G8B8A8_SINT>(pDst, srcPixel); break;
-    case R8G8B8A8_UINT: ConvertPixelFromFloat<R8G8B8A8_UINT>(pDst, srcPixel); break;
-    case R16G16_UNORM: ConvertPixelFromFloat<R16G16_UNORM>(pDst, srcPixel); break;
-    case R16G16_SNORM: ConvertPixelFromFloat<R16G16_SNORM>(pDst, srcPixel); break;
-    case R16G16_SINT: ConvertPixelFromFloat<R16G16_SINT>(pDst, srcPixel); break;
-    case R16G16_UINT: ConvertPixelFromFloat<R16G16_UINT>(pDst, srcPixel); break;
-    case R16G16_FLOAT: ConvertPixelFromFloat<R16G16_FLOAT>(pDst, srcPixel); break;
-    case B10G10R10A2_UNORM: ConvertPixelFromFloat<B10G10R10A2_UNORM>(pDst, srcPixel); break;
-    case B10G10R10A2_UNORM_SRGB: ConvertPixelFromFloat<B10G10R10A2_UNORM_SRGB>(pDst, srcPixel); break;
-    case R11G11B10_FLOAT: ConvertPixelFromFloat<R11G11B10_FLOAT>(pDst, srcPixel); break;
-    case R10G10B10_FLOAT_A2_UNORM: ConvertPixelFromFloat<R10G10B10_FLOAT_A2_UNORM>(pDst, srcPixel); break;
-    case R32_SINT: ConvertPixelFromFloat<R32_SINT>(pDst, srcPixel); break;
-    case R32_UINT: ConvertPixelFromFloat<R32_UINT>(pDst, srcPixel); break;
-    case R32_FLOAT: ConvertPixelFromFloat<R32_FLOAT>(pDst, srcPixel); break;
-    case R24_UNORM_X8_TYPELESS: ConvertPixelFromFloat<R24_UNORM_X8_TYPELESS>(pDst, srcPixel); break;
-    case X24_TYPELESS_G8_UINT: ConvertPixelFromFloat<X24_TYPELESS_G8_UINT>(pDst, srcPixel); break;
-    case L32_UNORM: ConvertPixelFromFloat<L32_UNORM>(pDst, srcPixel); break;
-    case L16A16_UNORM: ConvertPixelFromFloat<L16A16_UNORM>(pDst, srcPixel); break;
-    case I24X8_UNORM: ConvertPixelFromFloat<I24X8_UNORM>(pDst, srcPixel); break;
-    case L24X8_UNORM: ConvertPixelFromFloat<L24X8_UNORM>(pDst, srcPixel); break;
-    case I32_FLOAT: ConvertPixelFromFloat<I32_FLOAT>(pDst, srcPixel); break;
-    case L32_FLOAT: ConvertPixelFromFloat<L32_FLOAT>(pDst, srcPixel); break;
-    case A32_FLOAT: ConvertPixelFromFloat<A32_FLOAT>(pDst, srcPixel); break;
-    case B8G8R8X8_UNORM: ConvertPixelFromFloat<B8G8R8X8_UNORM>(pDst, srcPixel); break;
-    case B8G8R8X8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8X8_UNORM_SRGB>(pDst, srcPixel); break;
-    case R8G8B8X8_UNORM: ConvertPixelFromFloat<R8G8B8X8_UNORM>(pDst, srcPixel); break;
-    case R8G8B8X8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8X8_UNORM_SRGB>(pDst, srcPixel); break;
-    case R9G9B9E5_SHAREDEXP: ConvertPixelFromFloat<R9G9B9E5_SHAREDEXP>(pDst, srcPixel); break;
-    case B10G10R10X2_UNORM: ConvertPixelFromFloat<B10G10R10X2_UNORM>(pDst, srcPixel); break;
-    case L16A16_FLOAT: ConvertPixelFromFloat<L16A16_FLOAT>(pDst, srcPixel); break;
-    case R10G10B10X2_USCALED: ConvertPixelFromFloat<R10G10B10X2_USCALED>(pDst, srcPixel); break;
-    case R8G8B8A8_SSCALED: ConvertPixelFromFloat<R8G8B8A8_SSCALED>(pDst, srcPixel); break;
-    case R8G8B8A8_USCALED: ConvertPixelFromFloat<R8G8B8A8_USCALED>(pDst, srcPixel); break;
-    case R16G16_SSCALED: ConvertPixelFromFloat<R16G16_SSCALED>(pDst, srcPixel); break;
-    case R16G16_USCALED: ConvertPixelFromFloat<R16G16_USCALED>(pDst, srcPixel); break;
-    case R32_SSCALED: ConvertPixelFromFloat<R32_SSCALED>(pDst, srcPixel); break;
-    case R32_USCALED: ConvertPixelFromFloat<R32_USCALED>(pDst, srcPixel); break;
-    case B5G6R5_UNORM: ConvertPixelFromFloat<B5G6R5_UNORM>(pDst, srcPixel); break;
-    case B5G6R5_UNORM_SRGB: ConvertPixelFromFloat<B5G6R5_UNORM_SRGB>(pDst, srcPixel); break;
-    case B5G5R5A1_UNORM: ConvertPixelFromFloat<B5G5R5A1_UNORM>(pDst, srcPixel); break;
-    case B5G5R5A1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5A1_UNORM_SRGB>(pDst, srcPixel); break;
-    case B4G4R4A4_UNORM: ConvertPixelFromFloat<B4G4R4A4_UNORM>(pDst, srcPixel); break;
-    case B4G4R4A4_UNORM_SRGB: ConvertPixelFromFloat<B4G4R4A4_UNORM_SRGB>(pDst, srcPixel); break;
-    case R8G8_UNORM: ConvertPixelFromFloat<R8G8_UNORM>(pDst, srcPixel); break;
-    case R8G8_SNORM: ConvertPixelFromFloat<R8G8_SNORM>(pDst, srcPixel); break;
-    case R8G8_SINT: ConvertPixelFromFloat<R8G8_SINT>(pDst, srcPixel); break;
-    case R8G8_UINT: ConvertPixelFromFloat<R8G8_UINT>(pDst, srcPixel); break;
-    case R16_UNORM: ConvertPixelFromFloat<R16_UNORM>(pDst, srcPixel); break;
-    case R16_SNORM: ConvertPixelFromFloat<R16_SNORM>(pDst, srcPixel); break;
-    case R16_SINT: ConvertPixelFromFloat<R16_SINT>(pDst, srcPixel); break;
-    case R16_UINT: ConvertPixelFromFloat<R16_UINT>(pDst, srcPixel); break;
-    case R16_FLOAT: ConvertPixelFromFloat<R16_FLOAT>(pDst, srcPixel); break;
-    case I16_UNORM: ConvertPixelFromFloat<I16_UNORM>(pDst, srcPixel); break;
-    case L16_UNORM: ConvertPixelFromFloat<L16_UNORM>(pDst, srcPixel); break;
-    case A16_UNORM: ConvertPixelFromFloat<A16_UNORM>(pDst, srcPixel); break;
-    case L8A8_UNORM: ConvertPixelFromFloat<L8A8_UNORM>(pDst, srcPixel); break;
-    case I16_FLOAT: ConvertPixelFromFloat<I16_FLOAT>(pDst, srcPixel); break;
-    case L16_FLOAT: ConvertPixelFromFloat<L16_FLOAT>(pDst, srcPixel); break;
-    case A16_FLOAT: ConvertPixelFromFloat<A16_FLOAT>(pDst, srcPixel); break;
-    case L8A8_UNORM_SRGB: ConvertPixelFromFloat<L8A8_UNORM_SRGB>(pDst, srcPixel); break;
-    case B5G5R5X1_UNORM: ConvertPixelFromFloat<B5G5R5X1_UNORM>(pDst, srcPixel); break;
-    case B5G5R5X1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5X1_UNORM_SRGB>(pDst, srcPixel); break;
-    case R8G8_SSCALED: ConvertPixelFromFloat<R8G8_SSCALED>(pDst, srcPixel); break;
-    case R8G8_USCALED: ConvertPixelFromFloat<R8G8_USCALED>(pDst, srcPixel); break;
-    case R16_SSCALED: ConvertPixelFromFloat<R16_SSCALED>(pDst, srcPixel); break;
-    case R16_USCALED: ConvertPixelFromFloat<R16_USCALED>(pDst, srcPixel); break;
-    case A1B5G5R5_UNORM: ConvertPixelFromFloat<A1B5G5R5_UNORM>(pDst, srcPixel); break;
-    case A4B4G4R4_UNORM: ConvertPixelFromFloat<A4B4G4R4_UNORM>(pDst, srcPixel); break;
-    case L8A8_UINT: ConvertPixelFromFloat<L8A8_UINT>(pDst, srcPixel); break;
-    case L8A8_SINT: ConvertPixelFromFloat<L8A8_SINT>(pDst, srcPixel); break;
-    case R8_UNORM: ConvertPixelFromFloat<R8_UNORM>(pDst, srcPixel); break;
-    case R8_SNORM: ConvertPixelFromFloat<R8_SNORM>(pDst, srcPixel); break;
-    case R8_SINT: ConvertPixelFromFloat<R8_SINT>(pDst, srcPixel); break;
-    case R8_UINT: ConvertPixelFromFloat<R8_UINT>(pDst, srcPixel); break;
-    case A8_UNORM: ConvertPixelFromFloat<A8_UNORM>(pDst, srcPixel); break;
-    case I8_UNORM: ConvertPixelFromFloat<I8_UNORM>(pDst, srcPixel); break;
-    case L8_UNORM: ConvertPixelFromFloat<L8_UNORM>(pDst, srcPixel); break;
-    case R8_SSCALED: ConvertPixelFromFloat<R8_SSCALED>(pDst, srcPixel); break;
-    case R8_USCALED: ConvertPixelFromFloat<R8_USCALED>(pDst, srcPixel); break;
-    case L8_UNORM_SRGB: ConvertPixelFromFloat<L8_UNORM_SRGB>(pDst, srcPixel); break;
-    case L8_UINT: ConvertPixelFromFloat<L8_UINT>(pDst, srcPixel); break;
-    case L8_SINT: ConvertPixelFromFloat<L8_SINT>(pDst, srcPixel); break;
-    case I8_UINT: ConvertPixelFromFloat<I8_UINT>(pDst, srcPixel); break;
-    case I8_SINT: ConvertPixelFromFloat<I8_SINT>(pDst, srcPixel); break;
-    case YCRCB_SWAPUVY: ConvertPixelFromFloat<YCRCB_SWAPUVY>(pDst, srcPixel); break;
-    case BC1_UNORM: ConvertPixelFromFloat<BC1_UNORM>(pDst, srcPixel); break;
-    case BC2_UNORM: ConvertPixelFromFloat<BC2_UNORM>(pDst, srcPixel); break;
-    case BC3_UNORM: ConvertPixelFromFloat<BC3_UNORM>(pDst, srcPixel); break;
-    case BC4_UNORM: ConvertPixelFromFloat<BC4_UNORM>(pDst, srcPixel); break;
-    case BC5_UNORM: ConvertPixelFromFloat<BC5_UNORM>(pDst, srcPixel); break;
-    case BC1_UNORM_SRGB: ConvertPixelFromFloat<BC1_UNORM_SRGB>(pDst, srcPixel); break;
-    case BC2_UNORM_SRGB: ConvertPixelFromFloat<BC2_UNORM_SRGB>(pDst, srcPixel); break;
-    case BC3_UNORM_SRGB: ConvertPixelFromFloat<BC3_UNORM_SRGB>(pDst, srcPixel); break;
-    case YCRCB_SWAPUV: ConvertPixelFromFloat<YCRCB_SWAPUV>(pDst, srcPixel); break;
-    case R8G8B8_UNORM: ConvertPixelFromFloat<R8G8B8_UNORM>(pDst, srcPixel); break;
-    case R8G8B8_SNORM: ConvertPixelFromFloat<R8G8B8_SNORM>(pDst, srcPixel); break;
-    case R8G8B8_SSCALED: ConvertPixelFromFloat<R8G8B8_SSCALED>(pDst, srcPixel); break;
-    case R8G8B8_USCALED: ConvertPixelFromFloat<R8G8B8_USCALED>(pDst, srcPixel); break;
-    case BC4_SNORM: ConvertPixelFromFloat<BC4_SNORM>(pDst, srcPixel); break;
-    case BC5_SNORM: ConvertPixelFromFloat<BC5_SNORM>(pDst, srcPixel); break;
-    case R16G16B16_FLOAT: ConvertPixelFromFloat<R16G16B16_FLOAT>(pDst, srcPixel); break;
-    case R16G16B16_UNORM: ConvertPixelFromFloat<R16G16B16_UNORM>(pDst, srcPixel); break;
-    case R16G16B16_SNORM: ConvertPixelFromFloat<R16G16B16_SNORM>(pDst, srcPixel); break;
-    case R16G16B16_SSCALED: ConvertPixelFromFloat<R16G16B16_SSCALED>(pDst, srcPixel); break;
-    case R16G16B16_USCALED: ConvertPixelFromFloat<R16G16B16_USCALED>(pDst, srcPixel); break;
-    case BC6H_SF16: ConvertPixelFromFloat<BC6H_SF16>(pDst, srcPixel); break;
-    case BC7_UNORM: ConvertPixelFromFloat<BC7_UNORM>(pDst, srcPixel); break;
-    case BC7_UNORM_SRGB: ConvertPixelFromFloat<BC7_UNORM_SRGB>(pDst, srcPixel); break;
-    case BC6H_UF16: ConvertPixelFromFloat<BC6H_UF16>(pDst, srcPixel); break;
-    case R8G8B8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8_UNORM_SRGB>(pDst, srcPixel); break;
-    case R16G16B16_UINT: ConvertPixelFromFloat<R16G16B16_UINT>(pDst, srcPixel); break;
-    case R16G16B16_SINT: ConvertPixelFromFloat<R16G16B16_SINT>(pDst, srcPixel); break;
-    case R10G10B10A2_SNORM: ConvertPixelFromFloat<R10G10B10A2_SNORM>(pDst, srcPixel); break;
-    case R10G10B10A2_USCALED: ConvertPixelFromFloat<R10G10B10A2_USCALED>(pDst, srcPixel); break;
-    case R10G10B10A2_SSCALED: ConvertPixelFromFloat<R10G10B10A2_SSCALED>(pDst, srcPixel); break;
-    case R10G10B10A2_SINT: ConvertPixelFromFloat<R10G10B10A2_SINT>(pDst, srcPixel); break;
-    case B10G10R10A2_SNORM: ConvertPixelFromFloat<B10G10R10A2_SNORM>(pDst, srcPixel); break;
-    case B10G10R10A2_USCALED: ConvertPixelFromFloat<B10G10R10A2_USCALED>(pDst, srcPixel); break;
-    case B10G10R10A2_SSCALED: ConvertPixelFromFloat<B10G10R10A2_SSCALED>(pDst, srcPixel); break;
-    case B10G10R10A2_UINT: ConvertPixelFromFloat<B10G10R10A2_UINT>(pDst, srcPixel); break;
-    case B10G10R10A2_SINT: ConvertPixelFromFloat<B10G10R10A2_SINT>(pDst, srcPixel); break;
-    case R8G8B8_UINT: ConvertPixelFromFloat<R8G8B8_UINT>(pDst, srcPixel); break;
-    case R8G8B8_SINT: ConvertPixelFromFloat<R8G8B8_SINT>(pDst, srcPixel); break;
-    case RAW: ConvertPixelFromFloat<RAW>(pDst, srcPixel); break;
-    default:
-        SWR_INVALID("Invalid format: %d", format);
-        break;
-    }
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/InitMemory.cpp b/src/gallium/drivers/swr/rasterizer/memory/InitMemory.cpp
deleted file mode 100644
index 3a19bbac70e..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/InitMemory.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file InitMemory.cpp
-*
-* @brief Provide access to tiles table initialization functions
-*
-******************************************************************************/
-
-#include "memory/InitMemory.h"
-#include "memory/LoadTile.h"
-#include "memory/StoreTile.h"
-#include "InitMemory.h"
-
-void InitSimLoadTilesTable();
-void InitSimStoreTilesTable();
-void InitSimClearTilesTable();
-
-void InitTilesTable()
-{
-    InitSimLoadTilesTable();
-    InitSimStoreTilesTable();
-    InitSimClearTilesTable();
-}
-
-
-void SwrGetTileIterface(SWR_TILE_INTERFACE &out_funcs)
-{
-    out_funcs.pfnSwrLoadHotTile = SwrLoadHotTile;
-    out_funcs.pfnSwrStoreHotTileToSurface = SwrStoreHotTileToSurface;
-}
-\ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/memory/InitMemory.h b/src/gallium/drivers/swr/rasterizer/memory/InitMemory.h
deleted file mode 100644
index a3ed7b3cbdb..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/InitMemory.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file InitMemory.h
-*
-* @brief Provide access to tiles table initialization functions
-*
-******************************************************************************/
-
-#pragma once
-
-#include "common/os.h"
-#include "memory/SurfaceState.h"
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Loads a full hottile from a render surface
-/// @param hPrivateContext - Handle to private DC
-/// @param dstFormat - Format for hot tile.
-/// @param renderTargetIndex - Index to src render target
-/// @param x, y - Coordinates to raster tile.
-/// @param pDstHotTile - Pointer to Hot Tile
-SWR_FUNC(void,
-         SwrLoadHotTile,
-         HANDLE                      hWorkerPrivateData,
-         const SWR_SURFACE_STATE*    pSrcSurface,
-         BucketManager*              pBucketManager,
-         SWR_FORMAT                  dstFormat,
-         SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-         uint32_t                    x,
-         uint32_t                    y,
-         uint32_t                    renderTargetArrayIndex,
-         uint8_t*                    pDstHotTile);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Deswizzles and stores a full hottile to a render surface
-/// @param hPrivateContext - Handle to private DC
-/// @param srcFormat - Format for hot tile.
-/// @param renderTargetIndex - Index to destination render target
-/// @param x, y - Coordinates to raster tile.
-/// @param pSrcHotTile - Pointer to Hot Tile
-SWR_FUNC(void,
-         SwrStoreHotTileToSurface,
-         HANDLE                      hWorkerPrivateData,
-         SWR_SURFACE_STATE*          pDstSurface,
-         BucketManager*              pBucketManager,
-         SWR_FORMAT                  srcFormat,
-         SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-         uint32_t                    x,
-         uint32_t                    y,
-         uint32_t                    renderTargetArrayIndex,
-         uint8_t*                    pSrcHotTile);
-
-struct SWR_TILE_INTERFACE {
-    PFNSwrLoadHotTile           pfnSwrLoadHotTile;
-    PFNSwrStoreHotTileToSurface pfnSwrStoreHotTileToSurface;
-};
-
-extern "C"
-{
-    SWR_VISIBLE void SWR_API InitTilesTable();
-
-    typedef void(SWR_API* PFNSwrGetTileInterface)(SWR_TILE_INTERFACE& out_funcs);
-    SWR_VISIBLE void SWR_API SwrGetTileIterface(SWR_TILE_INTERFACE &out_funcs);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp
deleted file mode 100644
index a26d45d130f..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file LoadTile.cpp
-* 
-* @brief Functionality for Load
-* 
-******************************************************************************/
-#include "LoadTile.h"
-
-// on demand buckets for load tiles
-static std::vector<int> sBuckets(NUM_SWR_FORMATS, -1);
-static std::mutex sBucketMutex;
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Loads a full hottile from a render surface
-/// @param hPrivateContext - Handle to private DC
-/// @param dstFormat - Format for hot tile.
-/// @param renderTargetIndex - Index to src render target
-/// @param x, y - Coordinates to raster tile.
-/// @param pDstHotTile - Pointer to Hot Tile
-void SwrLoadHotTile(
-    HANDLE hWorkerPrivateData,
-    const SWR_SURFACE_STATE *pSrcSurface,
-    BucketManager* pBucketMgr,
-    SWR_FORMAT dstFormat,
-    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
-    uint8_t *pDstHotTile)
-{
-    PFN_LOAD_TILES pfnLoadTiles = NULL;
-
-    // don't need to load null surfaces
-    if (pSrcSurface->type == SURFACE_NULL)
-    {
-        return;
-    }
-
-    // force 0 if requested renderTargetArrayIndex is OOB
-    if (renderTargetArrayIndex >= pSrcSurface->depth)
-    {
-        renderTargetArrayIndex = 0;
-    }
-
-    if (renderTargetIndex < SWR_ATTACHMENT_DEPTH)
-    {
-        switch (pSrcSurface->tileMode)
-        {
-        case SWR_TILE_NONE:
-            pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_NONE[pSrcSurface->format];
-            break;
-        case SWR_TILE_MODE_YMAJOR:
-            pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format];
-            break;
-        case SWR_TILE_MODE_XMAJOR:
-            pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[pSrcSurface->format];
-            break;
-        case SWR_TILE_MODE_WMAJOR:
-            SWR_ASSERT(pSrcSurface->format == R8_UINT);
-            pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R8_UINT, R8_UINT>::Load;
-            break;
-        default:
-            SWR_INVALID("Unsupported tiling mode");
-            break;
-        }
-    }
-    else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH)
-    {
-        // Currently depth can map to linear and tile-y.
-        switch (pSrcSurface->tileMode)
-        {
-        case SWR_TILE_NONE:
-            pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_NONE[pSrcSurface->format];
-            break;
-        case SWR_TILE_MODE_YMAJOR:
-            pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format];
-            break;
-        default:
-            SWR_INVALID("Unsupported tiling mode");
-            break;
-        }
-    }
-    else
-    {
-        SWR_ASSERT(renderTargetIndex == SWR_ATTACHMENT_STENCIL);
-        SWR_ASSERT(pSrcSurface->format == R8_UINT);
-        switch (pSrcSurface->tileMode)
-        {
-        case SWR_TILE_NONE:
-            pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_NONE, 8>, R8_UINT, R8_UINT>::Load;
-            break;
-        case SWR_TILE_MODE_WMAJOR:
-            pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R8_UINT, R8_UINT>::Load;
-            break;
-        default:
-            SWR_INVALID("Unsupported tiling mode");
-            break;
-        }
-    }
-
-    if (pfnLoadTiles == nullptr)
-    {
-        SWR_INVALID("Unsupported format for load tile");
-        return;
-    }
-
-    // Load a macro tile.
-#ifdef KNOB_ENABLE_RDTSC
-    if (sBuckets[pSrcSurface->format] == -1)
-    {
-        // guard sBuckets update since storetiles is called by multiple threads
-        sBucketMutex.lock();
-        if (sBuckets[pSrcSurface->format] == -1)
-        {
-            const SWR_FORMAT_INFO& info = GetFormatInfo(pSrcSurface->format);
-            BUCKET_DESC desc{ info.name, "", false, 0xffffffff };
-            sBuckets[pSrcSurface->format] = pBucketMgr->RegisterBucket(desc);
-        }
-        sBucketMutex.unlock();
-    }
-#endif
-
-#ifdef KNOB_ENABLE_RDTSC
-    pBucketMgr->StartBucket(sBuckets[pSrcSurface->format]);
-#endif
-    pfnLoadTiles(pSrcSurface, pDstHotTile, x, y, renderTargetArrayIndex);
-#ifdef KNOB_ENABLE_RDTSC
-    pBucketMgr->StopBucket(sBuckets[pSrcSurface->format]);
-#endif
-}
-
-
-void InitSimLoadTilesTable()
-{
-    InitLoadTilesTable_Linear();
-    InitLoadTilesTable_XMajor();
-    InitLoadTilesTable_YMajor();
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.h b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.h
deleted file mode 100644
index f74c3fdf4b0..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.h
+++ /dev/null
@@ -1,354 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file LoadTile.h
-* 
-* @brief Functionality for Load
-* 
-******************************************************************************/
-#include "common/os.h"
-#include "common/formats.h"
-#include "core/context.h"
-#include "core/rdtsc_core.h"
-#include "memory/TilingFunctions.h"
-#include "memory/tilingtraits.h"
-#include "memory/Convert.h"
-
-typedef void(*PFN_LOAD_TILES)(const SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t);
-typedef void(*PFN_LOAD_RASTER_TILES)(const SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t, uint32_t);
-
-//////////////////////////////////////////////////////////////////////////
-/// Load Raster Tile Function Tables.
-//////////////////////////////////////////////////////////////////////////
-extern PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS];
-extern PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS];
-
-extern PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS];
-extern PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS];
-
-extern PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS];
-
-void InitLoadTilesTable_Linear();
-void InitLoadTilesTable_XMajor();
-void InitLoadTilesTable_YMajor();
-
-//////////////////////////////////////////////////////////////////////////
-/// LoadRasterTile
-//////////////////////////////////////////////////////////////////////////
-template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct LoadRasterTile
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Retrieve color from hot tile source which is always float.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param x, y - Coordinates to raster tile.
-    /// @param output - output color
-    INLINE static void SetSwizzledDstColor(
-        const float srcColor[4],
-        uint32_t x, uint32_t y,
-        uint8_t* pDst)
-    {
-        typedef SimdTile_16<DstFormat, SrcFormat> SimdT;
-
-        SimdT* pDstSimdTiles = (SimdT*)pDst;
-
-        // Compute which simd tile we're accessing within 8x8 tile.
-        //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
-        uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
-
-        SimdT* pSimdTile = &pDstSimdTiles[simdIndex];
-
-        uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
-
-        pSimdTile->SetSwizzledColor(simdOffset, srcColor);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Loads an 8x8 raster tile from the src surface.
-    /// @param pSrcSurface - Src surface state
-    /// @param pDst - Destination hot tile pointer
-    /// @param x, y - Coordinates to raster tile.
-    INLINE static void Load(
-        const SWR_SURFACE_STATE* pSrcSurface,
-        uint8_t* pDst,
-        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
-    {
-        uint32_t lodWidth = (pSrcSurface->width == 1) ? 1 : pSrcSurface->width >> pSrcSurface->lod;
-        uint32_t lodHeight = (pSrcSurface->height == 1) ? 1 : pSrcSurface->height >> pSrcSurface->lod;
-
-        // For each raster tile pixel (rx, ry)
-        for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
-        {
-            for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
-            {
-                if (((x + rx) < lodWidth) &&
-                    ((y + ry) < lodHeight))
-                {
-                    uint8_t* pSrc = (uint8_t*)ComputeSurfaceAddress<false, true>(x + rx, y + ry, pSrcSurface->arrayIndex + renderTargetArrayIndex,
-                            pSrcSurface->arrayIndex + renderTargetArrayIndex, sampleNum,
-                            pSrcSurface->lod, pSrcSurface);
-
-                    float srcColor[4];
-                    ConvertPixelToFloat<SrcFormat>(srcColor, pSrc);
-
-                    // store pixel to hottile
-                    SetSwizzledDstColor(srcColor, rx, ry, pDst);
-                }
-            }
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// LoadMacroTile - Loads a macro tile which consists of raster tiles.
-//////////////////////////////////////////////////////////////////////////
-template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct LoadMacroTile
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Load a macrotile to the destination surface.
-    /// @param pSrc - Pointer to macro tile.
-    /// @param pDstSurface - Destination surface state
-    /// @param x, y - Coordinates to macro tile
-    static void Load(
-        const SWR_SURFACE_STATE* pSrcSurface,
-        uint8_t *pDstHotTile,
-        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
-    {
-        PFN_LOAD_RASTER_TILES loadRasterTileFn;
-        loadRasterTileFn = LoadRasterTile<TTraits, SrcFormat, DstFormat>::Load;
-
-        // Load each raster tile from the hot tile to the destination surface.
-        for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-        {
-            for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-            {
-                for (uint32_t sampleNum = 0; sampleNum < pSrcSurface->numSamples; sampleNum++)
-                {
-                    loadRasterTileFn(pSrcSurface, pDstHotTile, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
-                    pDstHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<DstFormat>::bpp / 8);
-                }
-            }
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// InitLoadTileColorTable - Helper function for setting up the tables.
-template<SWR_TILE_MODE TTileMode>
-static INLINE void InitLoadTileColorTable(PFN_LOAD_TILES (&table)[NUM_SWR_FORMATS])
-{
-    memset(table, 0, sizeof(table));
-   
-    table[R32G32B32A32_FLOAT]              = LoadMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[R32G32B32A32_SINT]               = LoadMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_SINT, R32G32B32A32_FLOAT>::Load;
-    table[R32G32B32A32_UINT]               = LoadMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_UINT, R32G32B32A32_FLOAT>::Load;
-    table[R32G32B32X32_FLOAT]              = LoadMacroTile<TilingTraits<TTileMode, 128>, R32G32B32X32_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[R32G32B32A32_SSCALED]            = LoadMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[R32G32B32A32_USCALED]            = LoadMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[R32G32B32_FLOAT]                 = LoadMacroTile<TilingTraits<TTileMode, 96>, R32G32B32_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[R32G32B32_SINT]                  = LoadMacroTile<TilingTraits<TTileMode, 96>, R32G32B32_SINT, R32G32B32A32_FLOAT>::Load;
-    table[R32G32B32_UINT]                  = LoadMacroTile<TilingTraits<TTileMode, 96>, R32G32B32_UINT, R32G32B32A32_FLOAT>::Load;
-    table[R32G32B32_SSCALED]               = LoadMacroTile<TilingTraits<TTileMode, 96>, R32G32B32_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[R32G32B32_USCALED]               = LoadMacroTile<TilingTraits<TTileMode, 96>, R32G32B32_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16A16_UNORM]              = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16A16_SNORM]              = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_SNORM, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16A16_SINT]               = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_SINT, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16A16_UINT]               = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_UINT, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16A16_FLOAT]              = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[R32G32_FLOAT]                    = LoadMacroTile<TilingTraits<TTileMode, 64>, R32G32_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[R32G32_SINT]                     = LoadMacroTile<TilingTraits<TTileMode, 64>, R32G32_SINT, R32G32B32A32_FLOAT>::Load;
-    table[R32G32_UINT]                     = LoadMacroTile<TilingTraits<TTileMode, 64>, R32G32_UINT, R32G32B32A32_FLOAT>::Load;
-    table[R32_FLOAT_X8X24_TYPELESS]        = LoadMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT_X8X24_TYPELESS, R32G32B32A32_FLOAT>::Load;
-    table[X32_TYPELESS_G8X24_UINT]         = LoadMacroTile<TilingTraits<TTileMode, 64>, X32_TYPELESS_G8X24_UINT, R32G32B32A32_FLOAT>::Load;
-    table[L32A32_FLOAT]                    = LoadMacroTile<TilingTraits<TTileMode, 64>, L32A32_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16X16_UNORM]              = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16X16_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16X16_FLOAT]              = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16X16_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[L32X32_FLOAT]                    = LoadMacroTile<TilingTraits<TTileMode, 64>, L32X32_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[I32X32_FLOAT]                    = LoadMacroTile<TilingTraits<TTileMode, 64>, I32X32_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16A16_SSCALED]            = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16A16_USCALED]            = LoadMacroTile<TilingTraits<TTileMode, 64>, R16G16B16A16_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[R32G32_SSCALED]                  = LoadMacroTile<TilingTraits<TTileMode, 64>, R32G32_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[R32G32_USCALED]                  = LoadMacroTile<TilingTraits<TTileMode, 64>, R32G32_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[B8G8R8A8_UNORM]                  = LoadMacroTile<TilingTraits<TTileMode, 32>, B8G8R8A8_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[B8G8R8A8_UNORM_SRGB]             = LoadMacroTile<TilingTraits<TTileMode, 32>, B8G8R8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[R10G10B10A2_UNORM]               = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[R10G10B10A2_UNORM_SRGB]          = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[R10G10B10A2_UINT]                = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_UINT, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8A8_UNORM]                  = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8A8_UNORM_SRGB]             = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8A8_SNORM]                  = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_SNORM, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8A8_SINT]                   = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_SINT, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8A8_UINT]                   = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_UINT, R32G32B32A32_FLOAT>::Load;
-    table[R16G16_UNORM]                    = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[R16G16_SNORM]                    = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_SNORM, R32G32B32A32_FLOAT>::Load;
-    table[R16G16_SINT]                     = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_SINT, R32G32B32A32_FLOAT>::Load;
-    table[R16G16_UINT]                     = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_UINT, R32G32B32A32_FLOAT>::Load;
-    table[R16G16_FLOAT]                    = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[B10G10R10A2_UNORM]               = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[B10G10R10A2_UNORM_SRGB]          = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[R11G11B10_FLOAT]                 = LoadMacroTile<TilingTraits<TTileMode, 32>, R11G11B10_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[R10G10B10_FLOAT_A2_UNORM]        = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10_FLOAT_A2_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[R32_SINT]                        = LoadMacroTile<TilingTraits<TTileMode, 32>, R32_SINT, R32G32B32A32_FLOAT>::Load;
-    table[R32_UINT]                        = LoadMacroTile<TilingTraits<TTileMode, 32>, R32_UINT, R32G32B32A32_FLOAT>::Load;
-    table[R32_FLOAT]                       = LoadMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[R24_UNORM_X8_TYPELESS]           = LoadMacroTile<TilingTraits<TTileMode, 32>, R24_UNORM_X8_TYPELESS, R32G32B32A32_FLOAT>::Load;
-    table[X24_TYPELESS_G8_UINT]            = LoadMacroTile<TilingTraits<TTileMode, 32>, X24_TYPELESS_G8_UINT, R32G32B32A32_FLOAT>::Load;
-    table[L32_UNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 32>, L32_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[L16A16_UNORM]                    = LoadMacroTile<TilingTraits<TTileMode, 32>, L16A16_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[I24X8_UNORM]                     = LoadMacroTile<TilingTraits<TTileMode, 32>, I24X8_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[L24X8_UNORM]                     = LoadMacroTile<TilingTraits<TTileMode, 32>, L24X8_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[I32_FLOAT]                       = LoadMacroTile<TilingTraits<TTileMode, 32>, I32_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[L32_FLOAT]                       = LoadMacroTile<TilingTraits<TTileMode, 32>, L32_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[A32_FLOAT]                       = LoadMacroTile<TilingTraits<TTileMode, 32>, A32_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[B8G8R8X8_UNORM]                  = LoadMacroTile<TilingTraits<TTileMode, 32>, B8G8R8X8_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[B8G8R8X8_UNORM_SRGB]             = LoadMacroTile<TilingTraits<TTileMode, 32>, B8G8R8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8X8_UNORM]                  = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8X8_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8X8_UNORM_SRGB]             = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[R9G9B9E5_SHAREDEXP]              = LoadMacroTile<TilingTraits<TTileMode, 32>, R9G9B9E5_SHAREDEXP, R32G32B32A32_FLOAT>::Load;
-    table[B10G10R10X2_UNORM]               = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10X2_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[L16A16_FLOAT]                    = LoadMacroTile<TilingTraits<TTileMode, 32>, L16A16_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[R10G10B10X2_USCALED]             = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10X2_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8A8_SSCALED]                = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8A8_USCALED]                = LoadMacroTile<TilingTraits<TTileMode, 32>, R8G8B8A8_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[R16G16_SSCALED]                  = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[R16G16_USCALED]                  = LoadMacroTile<TilingTraits<TTileMode, 32>, R16G16_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[R32_SSCALED]                     = LoadMacroTile<TilingTraits<TTileMode, 32>, R32_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[R32_USCALED]                     = LoadMacroTile<TilingTraits<TTileMode, 32>, R32_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[B5G6R5_UNORM]                    = LoadMacroTile<TilingTraits<TTileMode, 16>, B5G6R5_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[B5G6R5_UNORM_SRGB]               = LoadMacroTile<TilingTraits<TTileMode, 16>, B5G6R5_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[B5G5R5A1_UNORM]                  = LoadMacroTile<TilingTraits<TTileMode, 16>, B5G5R5A1_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[B5G5R5A1_UNORM_SRGB]             = LoadMacroTile<TilingTraits<TTileMode, 16>, B5G5R5A1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[B4G4R4A4_UNORM]                  = LoadMacroTile<TilingTraits<TTileMode, 16>, B4G4R4A4_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[B4G4R4A4_UNORM_SRGB]             = LoadMacroTile<TilingTraits<TTileMode, 16>, B4G4R4A4_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[R8G8_UNORM]                      = LoadMacroTile<TilingTraits<TTileMode, 16>, R8G8_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[R8G8_SNORM]                      = LoadMacroTile<TilingTraits<TTileMode, 16>, R8G8_SNORM, R32G32B32A32_FLOAT>::Load;
-    table[R8G8_SINT]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, R8G8_SINT, R32G32B32A32_FLOAT>::Load;
-    table[R8G8_UINT]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, R8G8_UINT, R32G32B32A32_FLOAT>::Load;
-    table[R16_UNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[R16_SNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_SNORM, R32G32B32A32_FLOAT>::Load;
-    table[R16_SINT]                        = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_SINT, R32G32B32A32_FLOAT>::Load;
-    table[R16_UINT]                        = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_UINT, R32G32B32A32_FLOAT>::Load;
-    table[R16_FLOAT]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[I16_UNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, I16_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[L16_UNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, L16_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[A16_UNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, A16_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[L8A8_UNORM]                      = LoadMacroTile<TilingTraits<TTileMode, 16>, L8A8_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[I16_FLOAT]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, I16_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[L16_FLOAT]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, L16_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[A16_FLOAT]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, A16_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[L8A8_UNORM_SRGB]                 = LoadMacroTile<TilingTraits<TTileMode, 16>, L8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[B5G5R5X1_UNORM]                  = LoadMacroTile<TilingTraits<TTileMode, 16>, B5G5R5X1_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[B5G5R5X1_UNORM_SRGB]             = LoadMacroTile<TilingTraits<TTileMode, 16>, B5G5R5X1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[R8G8_SSCALED]                    = LoadMacroTile<TilingTraits<TTileMode, 16>, R8G8_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[R8G8_USCALED]                    = LoadMacroTile<TilingTraits<TTileMode, 16>, R8G8_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[R16_SSCALED]                     = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[R16_USCALED]                     = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[A1B5G5R5_UNORM]                  = LoadMacroTile<TilingTraits<TTileMode, 16>, A1B5G5R5_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[A4B4G4R4_UNORM]                  = LoadMacroTile<TilingTraits<TTileMode, 16>, A4B4G4R4_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[L8A8_UINT]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, L8A8_UINT, R32G32B32A32_FLOAT>::Load;
-    table[L8A8_SINT]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, L8A8_SINT, R32G32B32A32_FLOAT>::Load;
-    table[R8_UNORM]                        = LoadMacroTile<TilingTraits<TTileMode, 8>, R8_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[R8_SNORM]                        = LoadMacroTile<TilingTraits<TTileMode, 8>, R8_SNORM, R32G32B32A32_FLOAT>::Load;
-    table[R8_SINT]                         = LoadMacroTile<TilingTraits<TTileMode, 8>, R8_SINT, R32G32B32A32_FLOAT>::Load;
-    table[R8_UINT]                         = LoadMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R32G32B32A32_FLOAT>::Load;
-    table[A8_UNORM]                        = LoadMacroTile<TilingTraits<TTileMode, 8>, A8_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[I8_UNORM]                        = LoadMacroTile<TilingTraits<TTileMode, 8>, I8_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[L8_UNORM]                        = LoadMacroTile<TilingTraits<TTileMode, 8>, L8_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[R8_SSCALED]                      = LoadMacroTile<TilingTraits<TTileMode, 8>, R8_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[R8_USCALED]                      = LoadMacroTile<TilingTraits<TTileMode, 8>, R8_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[L8_UNORM_SRGB]                   = LoadMacroTile<TilingTraits<TTileMode, 8>, L8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[L8_UINT]                         = LoadMacroTile<TilingTraits<TTileMode, 8>, L8_UINT, R32G32B32A32_FLOAT>::Load;
-    table[L8_SINT]                         = LoadMacroTile<TilingTraits<TTileMode, 8>, L8_SINT, R32G32B32A32_FLOAT>::Load;
-    table[I8_UINT]                         = LoadMacroTile<TilingTraits<TTileMode, 8>, I8_UINT, R32G32B32A32_FLOAT>::Load;
-    table[I8_SINT]                         = LoadMacroTile<TilingTraits<TTileMode, 8>, I8_SINT, R32G32B32A32_FLOAT>::Load;
-    table[YCRCB_SWAPUVY]                   = LoadMacroTile<TilingTraits<TTileMode, 32>, YCRCB_SWAPUVY, R32G32B32A32_FLOAT>::Load;
-    table[BC1_UNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 64>, BC1_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[BC2_UNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 128>, BC2_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[BC3_UNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 128>, BC3_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[BC4_UNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 64>, BC4_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[BC5_UNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 128>, BC5_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[BC1_UNORM_SRGB]                  = LoadMacroTile<TilingTraits<TTileMode, 64>, BC1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[BC2_UNORM_SRGB]                  = LoadMacroTile<TilingTraits<TTileMode, 128>, BC2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[BC3_UNORM_SRGB]                  = LoadMacroTile<TilingTraits<TTileMode, 128>, BC3_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[YCRCB_SWAPUV]                    = LoadMacroTile<TilingTraits<TTileMode, 32>, YCRCB_SWAPUV, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8_UNORM]                    = LoadMacroTile<TilingTraits<TTileMode, 24>, R8G8B8_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8_SNORM]                    = LoadMacroTile<TilingTraits<TTileMode, 24>, R8G8B8_SNORM, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8_SSCALED]                  = LoadMacroTile<TilingTraits<TTileMode, 24>, R8G8B8_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8_USCALED]                  = LoadMacroTile<TilingTraits<TTileMode, 24>, R8G8B8_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[BC4_SNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 64>, BC4_SNORM, R32G32B32A32_FLOAT>::Load;
-    table[BC5_SNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 128>, BC5_SNORM, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16_FLOAT]                 = LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_FLOAT, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16_UNORM]                 = LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16_SNORM]                 = LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_SNORM, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16_SSCALED]               = LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16_USCALED]               = LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[BC6H_SF16]                       = LoadMacroTile<TilingTraits<TTileMode, 128>, BC6H_SF16, R32G32B32A32_FLOAT>::Load;
-    table[BC7_UNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 128>, BC7_UNORM, R32G32B32A32_FLOAT>::Load;
-    table[BC7_UNORM_SRGB]                  = LoadMacroTile<TilingTraits<TTileMode, 128>, BC7_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[BC6H_UF16]                       = LoadMacroTile<TilingTraits<TTileMode, 128>, BC6H_UF16, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8_UNORM_SRGB]               = LoadMacroTile<TilingTraits<TTileMode, 24>, R8G8B8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16_UINT]                  = LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_UINT, R32G32B32A32_FLOAT>::Load;
-    table[R16G16B16_SINT]                  = LoadMacroTile<TilingTraits<TTileMode, 48>, R16G16B16_SINT, R32G32B32A32_FLOAT>::Load;
-    table[R10G10B10A2_SNORM]               = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_SNORM, R32G32B32A32_FLOAT>::Load;
-    table[R10G10B10A2_USCALED]             = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[R10G10B10A2_SSCALED]             = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[R10G10B10A2_SINT]                = LoadMacroTile<TilingTraits<TTileMode, 32>, R10G10B10A2_SINT, R32G32B32A32_FLOAT>::Load;
-    table[B10G10R10A2_SNORM]               = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_SNORM, R32G32B32A32_FLOAT>::Load;
-    table[B10G10R10A2_USCALED]             = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_USCALED, R32G32B32A32_FLOAT>::Load;
-    table[B10G10R10A2_SSCALED]             = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_SSCALED, R32G32B32A32_FLOAT>::Load;
-    table[B10G10R10A2_UINT]                = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_UINT, R32G32B32A32_FLOAT>::Load;
-    table[B10G10R10A2_SINT]                = LoadMacroTile<TilingTraits<TTileMode, 32>, B10G10R10A2_SINT, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8_UINT]                     = LoadMacroTile<TilingTraits<TTileMode, 24>, R8G8B8_UINT, R32G32B32A32_FLOAT>::Load;
-    table[R8G8B8_SINT]                     = LoadMacroTile<TilingTraits<TTileMode, 24>, R8G8B8_SINT, R32G32B32A32_FLOAT>::Load;
-    table[RAW]                             = LoadMacroTile<TilingTraits<TTileMode, 8>, RAW, R32G32B32A32_FLOAT>::Load;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// InitLoadTileDepthTable - Helper function for setting up the depth load table.
-template<SWR_TILE_MODE TTileMode>
-static INLINE void InitLoadTileDepthTable(PFN_LOAD_TILES(&table)[NUM_SWR_FORMATS])
-{
-    memset(table, 0, sizeof(table)); // table is an array reference, so this clears every entry
-    // Only the four formats below get a loader; every other entry stays zeroed.
-   table[R32_FLOAT]                       = LoadMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Load;
-   table[R32_FLOAT_X8X24_TYPELESS]        = LoadMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT_X8X24_TYPELESS, R32_FLOAT>::Load;
-   table[R24_UNORM_X8_TYPELESS]           = LoadMacroTile<TilingTraits<TTileMode, 32>, R24_UNORM_X8_TYPELESS, R32_FLOAT>::Load;
-   table[R16_UNORM]                       = LoadMacroTile<TilingTraits<TTileMode, 16>, R16_UNORM, R32_FLOAT>::Load;
-}
-
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Loads a full hottile from a render surface (pSrcSurface)
-/// @param hWorkerPrivateData - Handle; its meaning is not described in this header.
-/// @param dstFormat - Format for hot tile.
-/// @param renderTargetIndex - Index to src render target
-/// @param x, y - Coordinates to raster tile.
-/// @param pDstHotTile - Pointer to Hot Tile
-void SwrLoadHotTile(
-        HANDLE hWorkerPrivateData,
-        const SWR_SURFACE_STATE *pSrcSurface, // pointer to const: the source surface state is not modified through it
-        BucketManager* pBucketMgr, // NOTE(review): presumably for profiling buckets, as in SwrStoreHotTileToSurface - confirm
-        SWR_FORMAT dstFormat,
-        SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, // NOTE(review): renderTargetArrayIndex presumably selects an array slice - confirm
-        uint8_t *pDstHotTile);
diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_Linear.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile_Linear.cpp
deleted file mode 100644
index 5f53b5b6b56..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_Linear.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file LoadTile_Linear.cpp
-* 
-* @brief Load tile function tables for linear (SWR_TILE_NONE) surfaces.
-* 
-******************************************************************************/
-#include "LoadTile.h"
-
-PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; // color load functions, one slot per format
-PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; // depth load functions, one slot per format
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Sets up the SWR_TILE_NONE color and depth tables for LoadTile
-void InitLoadTilesTable_Linear()
-{
-    InitLoadTileColorTable<SWR_TILE_NONE>(sLoadTilesColorTable_SWR_TILE_NONE);
-    InitLoadTileDepthTable<SWR_TILE_NONE>(sLoadTilesDepthTable_SWR_TILE_NONE);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileX.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileX.cpp
deleted file mode 100644
index 8e76655ff11..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileX.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file LoadTile_TileX.cpp
-* 
-* @brief Load tile function table for X-major tiled (SWR_TILE_MODE_XMAJOR) surfaces.
-* 
-******************************************************************************/
-#include "LoadTile.h"
-
-PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS]; // color load functions, one slot per format
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Sets up the SWR_TILE_MODE_XMAJOR color table for LoadTile (this file defines no depth table)
-void InitLoadTilesTable_XMajor()
-{
-    InitLoadTileColorTable<SWR_TILE_MODE_XMAJOR>(sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileY.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileY.cpp
deleted file mode 100644
index c136392eb78..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileY.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file LoadTile_TileY.cpp
-* 
-* @brief Load tile function tables for Y-major tiled (SWR_TILE_MODE_YMAJOR) surfaces.
-* 
-******************************************************************************/
-#include "LoadTile.h"
-
-PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; // color load functions, one slot per format
-PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; // depth load functions, one slot per format
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Sets up the SWR_TILE_MODE_YMAJOR color and depth tables for LoadTile
-void InitLoadTilesTable_YMajor()
-{
-    InitLoadTileColorTable<SWR_TILE_MODE_YMAJOR>(sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR);
-    InitLoadTileDepthTable<SWR_TILE_MODE_YMAJOR>(sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp
deleted file mode 100644
index 9fee13a045a..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file StoreTile.cpp
-* 
-* @brief Functionality for Store.
-* 
-******************************************************************************/
-#include "StoreTile.h"
-//////////////////////////////////////////////////////////////////////////
-/// Store Raster Tile Function Tables, indexed [tile mode][surface format].
-//////////////////////////////////////////////////////////////////////////
-PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {};
-PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {};
-PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {};
-
-// on demand buckets for store tiles: one bucket id per format, -1 until registered
-static std::mutex sBucketMutex; // held while a bucket id is written into sBuckets
-static std::vector<int32_t> sBuckets(NUM_SWR_FORMATS, -1);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Stores a full hot tile to a render surface using the store table
-/// @param hWorkerPrivateData, srcFormat - Not referenced by this function.
-/// @param pDstSurface - Destination surface; SURFACE_NULL makes this a no-op.
-/// @param pBucketMgr - Bucket manager, used only under KNOB_ENABLE_RDTSC.
-/// @param renderTargetIndex - Attachment; selects color/depth/stencil table.
-/// @param x, y, renderTargetArrayIndex, pSrcHotTile - Given to the store fn.
-void SwrStoreHotTileToSurface(
-    HANDLE hWorkerPrivateData,
-    SWR_SURFACE_STATE *pDstSurface,
-    BucketManager* pBucketMgr,
-    SWR_FORMAT srcFormat,
-    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
-    uint8_t *pSrcHotTile)
-{
-    if (pDstSurface->type == SURFACE_NULL)
-    {
-        return;
-    }
-
-    // force 0 if requested renderTargetArrayIndex is OOB
-    if (renderTargetArrayIndex >= pDstSurface->depth)
-    {
-        renderTargetArrayIndex = 0;
-    }
-
-    // Attachments up to COLOR7 use the color table, DEPTH uses the depth
-    // table, and everything else falls through to the stencil table.
-    PFN_STORE_TILES pfnStoreTiles = nullptr;
-    if (renderTargetIndex <= SWR_ATTACHMENT_COLOR7)
-    {
-        pfnStoreTiles = sStoreTilesTableColor[pDstSurface->tileMode][pDstSurface->format];
-    }
-    else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH)
-    {
-        pfnStoreTiles = sStoreTilesTableDepth[pDstSurface->tileMode][pDstSurface->format];
-    }
-    else
-    {
-        pfnStoreTiles = sStoreTilesTableStencil[pDstSurface->tileMode][pDstSurface->format];
-    }
-
-    // A null entry means no store function for this tile mode / format.
-    if(nullptr == pfnStoreTiles)
-    {
-        SWR_INVALID("Invalid pixel format / tile mode for store tiles");
-        return;
-    }
-
-#ifdef KNOB_ENABLE_RDTSC
-    {
-        // sBuckets is shared by all threads calling store tiles, so the check
-        // and the on-demand registration both run with sBucketMutex held. An
-        // unlocked pre-check would race with a registration on another thread.
-        std::lock_guard<std::mutex> lock(sBucketMutex);
-        if (sBuckets[pDstSurface->format] == -1)
-        {
-            const SWR_FORMAT_INFO& info = GetFormatInfo(pDstSurface->format);
-            BUCKET_DESC desc{info.name, "", false, 0xffffffff};
-            sBuckets[pDstSurface->format] = pBucketMgr->RegisterBucket(desc);
-        }
-    }
-    pBucketMgr->StartBucket(sBuckets[pDstSurface->format]);
-#endif
-
-    // Store a macro tile
-    pfnStoreTiles(pSrcHotTile, pDstSurface, x, y, renderTargetArrayIndex);
-
-#ifdef KNOB_ENABLE_RDTSC
-    pBucketMgr->StopBucket(sBuckets[pDstSurface->format]);
-#endif
-}
-
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Sets up tables for StoreTile: clears color/depth, then runs each per-tile-mode init
-void InitSimStoreTilesTable()
-{
-    memset(sStoreTilesTableColor, 0, sizeof(sStoreTilesTableColor));
-    memset(sStoreTilesTableDepth, 0, sizeof(sStoreTilesTableDepth));
-    // NOTE(review): sStoreTilesTableStencil is not cleared here, unlike color/depth - confirm this is intended.
-    InitStoreTilesTable_Linear_1();
-    InitStoreTilesTable_Linear_2();
-    InitStoreTilesTable_TileX_1();
-    InitStoreTilesTable_TileX_2();
-    InitStoreTilesTable_TileY_1();
-    InitStoreTilesTable_TileY_2();
-    InitStoreTilesTable_TileW();
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
deleted file mode 100644
index 1b7698cc5b8..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
+++ /dev/null
@@ -1,2051 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file StoreTile.h
-*
-* @brief Functionality for Store.
-*
-******************************************************************************/
-#pragma once
-
-#include "common/os.h"
-#include "common/formats.h"
-#include "core/context.h"
-#include "core/rdtsc_core.h"
-#include "core/format_conversion.h"
-
-#include "memory/TilingFunctions.h"
-#include "memory/Convert.h"
-#include "memory/SurfaceState.h"
-#include "core/multisample.h"
-
-#include <array>
-#include <sstream>
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-// Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
-typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
-
-//////////////////////////////////////////////////////////////////////////
-/// Store Raster Tile Function Tables.
-//////////////////////////////////////////////////////////////////////////
-extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
-extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
-extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
-
-void InitStoreTilesTable_Linear_1();
-void InitStoreTilesTable_Linear_2();
-void InitStoreTilesTable_TileX_1();
-void InitStoreTilesTable_TileX_2();
-void InitStoreTilesTable_TileY_1();
-void InitStoreTilesTable_TileY_2();
-void InitStoreTilesTable_TileW();
-void InitStoreTilesTable();
-
-//////////////////////////////////////////////////////////////////////////
-/// StorePixels
-/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
-/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
-/// @param ppDsts   - Array of destination pointers.  Each pointer is
-///                   to a single row of at most 16B.
-/// @tparam NumDests - Number of destination pointers.  Each pair of
-///                    pointers is for a 16-byte column of two rows.
-//////////////////////////////////////////////////////////////////////////
-template <size_t PixelSize, size_t NumDests>
-struct StorePixels
-{
-    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// StorePixels (32-bit pixel specialization)
-/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
-/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
-/// @param ppDsts   - Array of destination pointers.  Each pointer is
-///                   to a single row of at most 16B.
-/// @tparam NumDests - Number of destination pointers.  Each pair of
-///                    pointers is for a 16-byte column of two rows.
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct StorePixels<8, 2>
-{
-    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
-    {
-        // Each 4-pixel row is 4 bytes.
-        const uint16_t* pPixSrc = (const uint16_t*)pSrc;
-
-        // Unswizzle from SWR-Z order
-        uint16_t* pRow = (uint16_t*)ppDsts[0];
-        pRow[0] = pPixSrc[0];
-        pRow[1] = pPixSrc[2];
-
-        pRow = (uint16_t*)ppDsts[1];
-        pRow[0] = pPixSrc[1];
-        pRow[1] = pPixSrc[3];
-    }
-};
-
-template <>
-struct StorePixels<8, 4>
-{
-    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
-    {
-        // 8 x 2 bytes = 16 bytes, 16 pixels
-        const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);
-
-        uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);
-
-        // Unswizzle from SWR-Z order
-        ppDsts16[0][0] = pSrc16[0];     // 0 1
-        ppDsts16[0][1] = pSrc16[2];     // 4 5
-
-        ppDsts16[1][0] = pSrc16[1];     // 2 3
-        ppDsts16[1][1] = pSrc16[3];     // 6 7
-
-        ppDsts16[2][0] = pSrc16[4];     // 8 9
-        ppDsts16[2][1] = pSrc16[6];     // C D
-
-        ppDsts16[3][0] = pSrc16[5];     // A B
-        ppDsts16[3][1] = pSrc16[7];     // E F
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// StorePixels (32-bit pixel specialization)
-/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
-/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
-/// @param ppDsts   - Array of destination pointers.  Each pointer is
-///                   to a single row of at most 16B.
-/// @tparam NumDests - Number of destination pointers.  Each pair of
-///                    pointers is for a 16-byte column of two rows.
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct StorePixels<16, 2>
-{
-    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
-    {
-        // Each 4-pixel row is 8 bytes.
-        const uint32_t* pPixSrc = (const uint32_t*)pSrc;
-
-        // Unswizzle from SWR-Z order
-        uint32_t* pRow = (uint32_t*)ppDsts[0];
-        pRow[0] = pPixSrc[0];
-        pRow[1] = pPixSrc[2];
-
-        pRow = (uint32_t*)ppDsts[1];
-        pRow[0] = pPixSrc[1];
-        pRow[1] = pPixSrc[3];
-    }
-};
-
-template <>
-struct StorePixels<16, 4>
-{
-    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
-    {
-        // 8 x 4 bytes = 32 bytes, 16 pixels
-        const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);
-
-        uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);
-
-        // Unswizzle from SWR-Z order
-        ppDsts32[0][0] = pSrc32[0];     // 0 1
-        ppDsts32[0][1] = pSrc32[2];     // 4 5
-
-        ppDsts32[1][0] = pSrc32[1];     // 2 3
-        ppDsts32[1][1] = pSrc32[3];     // 6 7
-
-        ppDsts32[2][0] = pSrc32[4];     // 8 9
-        ppDsts32[2][1] = pSrc32[6];     // C D
-
-        ppDsts32[3][0] = pSrc32[5];     // A B
-        ppDsts32[3][1] = pSrc32[7];     // E F
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// StorePixels (32-bit pixel specialization)
-/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
-/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
-/// @param ppDsts   - Array of destination pointers.  Each pointer is
-///                   to a single row of at most 16B.
-/// @tparam NumDests - Number of destination pointers.  Each pair of
-///                    pointers is for a 16-byte column of two rows.
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct StorePixels<32, 2>
-{
-    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
-    {
-        // Each 4-pixel row is 16-bytes
-        simd4scalari *pZRow01 = (simd4scalari*)pSrc;
-        simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
-        simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
-
-        simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
-        simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
-
-        SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
-        SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
-    }
-};
-
-template <>
-struct StorePixels<32, 4>
-{
-    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
-    {
-        // 4 x 16 bytes = 64 bytes, 16 pixels
-        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
-
-        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
-
-        // Unswizzle from SWR-Z order
-        simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]);                        // 0 1 2 3
-        simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]);                        // 4 5 6 7
-        simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]);                        // 8 9 A B
-        simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]);                        // C D E F
-
-        SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1));   // 0 1 4 5
-        SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1));   // 2 3 6 7
-        SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3));   // 8 9 C D
-        SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3));   // A B E F
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// StorePixels (32-bit pixel specialization)
-/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
-/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
-/// @param ppDsts   - Array of destination pointers.  Each pointer is
-///                   to a single row of at most 16B.
-/// @tparam NumDests - Number of destination pointers.  Each pair of
-///                    pointers is for a 16-byte column of two rows.
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct StorePixels<64, 4>
-{
-    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
-    {
-        // Each 4-pixel row is 32 bytes.
-        const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
-
-        // order of pointers match SWR-Z layout
-        simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
-        *pvDsts[0] = pPixSrc[0];
-        *pvDsts[1] = pPixSrc[1];
-        *pvDsts[2] = pPixSrc[2];
-        *pvDsts[3] = pPixSrc[3];
-    }
-};
-
-template <>
-struct StorePixels<64, 8>
-{
-    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
-    {
-        // 8 x 16 bytes = 128 bytes, 16 pixels
-        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
-
-        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
-
-        // order of pointers match SWR-Z layout
-        *ppDsts128[0] = pSrc128[0];     // 0 1
-        *ppDsts128[1] = pSrc128[1];     // 2 3
-        *ppDsts128[2] = pSrc128[2];     // 4 5
-        *ppDsts128[3] = pSrc128[3];     // 6 7
-        *ppDsts128[4] = pSrc128[4];     // 8 9
-        *ppDsts128[5] = pSrc128[5];     // A B
-        *ppDsts128[6] = pSrc128[6];     // C D
-        *ppDsts128[7] = pSrc128[7];     // E F
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// StorePixels (32-bit pixel specialization)
-/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
-/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
-/// @param ppDsts   - Array of destination pointers.  Each pointer is
-///                   to a single row of at most 16B.
-/// @tparam NumDests - Number of destination pointers.  Each pair of
-///                    pointers is for a 16-byte column of two rows.
-//////////////////////////////////////////////////////////////////////////
-template <>
-struct StorePixels<128, 8>
-{
-    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
-    {
-        // Each 4-pixel row is 64 bytes.
-        const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
-
-        // Unswizzle from SWR-Z order
-        simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
-        *pvDsts[0] = pPixSrc[0];
-        *pvDsts[1] = pPixSrc[2];
-        *pvDsts[2] = pPixSrc[1];
-        *pvDsts[3] = pPixSrc[3];
-        *pvDsts[4] = pPixSrc[4];
-        *pvDsts[5] = pPixSrc[6];
-        *pvDsts[6] = pPixSrc[5];
-        *pvDsts[7] = pPixSrc[7];
-    }
-};
-
-template <>
-struct StorePixels<128, 16>
-{
-    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
-    {
-        // 16 x 16 bytes = 256 bytes, 16 pixels
-        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
-
-        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
-
-        for (uint32_t i = 0; i < 16; i += 4)
-        {
-            *ppDsts128[i + 0] = pSrc128[i + 0];
-            *ppDsts128[i + 1] = pSrc128[i + 2];
-            *ppDsts128[i + 2] = pSrc128[i + 1];
-            *ppDsts128[i + 3] = pSrc128[i + 3];
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct ConvertPixelsSOAtoAOS
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Converts a SIMD from the Hot Tile to the destination format
-    ///        and converts from SOA to AOS.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param pDst - Pointer to destination surface or deswizzling buffer.
-    template <size_t NumDests>
-    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
-    {
-        static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
-
-        OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES] = {0};
-        OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES] = {0};
-
-        // Convert from SrcFormat --> DstFormat
-        simd16vector src;
-        LoadSOA<SrcFormat>(pSrc, src);
-        StoreSOA<DstFormat>(src, soaTile);
-
-        // Convert from SOA --> AOS
-        FormatTraits<DstFormat>::TransposeT::Transpose_simd16(soaTile, aosTile);
-
-        // Store data into destination
-        StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
-/// Specialization for no format conversion
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT Format>
-struct ConvertPixelsSOAtoAOS<Format, Format>
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Converts a SIMD from the Hot Tile to the destination format
-    ///        and converts from SOA to AOS.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param pDst - Pointer to destination surface or deswizzling buffer.
-    template <size_t NumDests>
-    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
-    {
-        static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
-
-        OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
-
-        // Convert from SOA --> AOS
-        FormatTraits<Format>::TransposeT::Transpose_simd16(pSrc, aosTile);
-
-        // Store data into destination
-        StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
-//////////////////////////////////////////////////////////////////////////
-template<>
-struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Converts a SIMD from the Hot Tile to the destination format
-    ///        and converts from SOA to AOS.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param pDst - Pointer to destination surface or deswizzling buffer.
-    template <size_t NumDests>
-    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
-    {
-        static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
-        static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
-
-        static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
-
-        OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
-
-        // Load hot-tile
-        simd16vector src, dst;
-        LoadSOA<SrcFormat>(pSrc, src);
-
-        // deswizzle
-        dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
-        dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
-        dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
-
-        // clamp
-        dst.x = Clamp<DstFormat>(dst.x, 0);
-        dst.y = Clamp<DstFormat>(dst.y, 1);
-        dst.z = Clamp<DstFormat>(dst.z, 2);
-
-        // normalize
-        dst.x = Normalize<DstFormat>(dst.x, 0);
-        dst.y = Normalize<DstFormat>(dst.y, 1);
-        dst.z = Normalize<DstFormat>(dst.z, 2);
-
-        // pack
-        simd16scalari packed = _simd16_castps_si(dst.x);
-
-        SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
-        SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
-
-        packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
-        packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
-
-        // pack low 16 bits of each 32 bit lane to low 128 bits of dst
-        uint32_t *pPacked = (uint32_t*)&packed;
-        uint16_t *pAosTile = (uint16_t*)&aosTile[0];
-        for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
-        {
-            *pAosTile++ = *pPacked++;
-        }
-
-        // Store data into destination
-        StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
-//////////////////////////////////////////////////////////////////////////
-template<>
-struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
-{
-    static const SWR_FORMAT SrcFormat = R32_FLOAT;
-    static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Converts a SIMD from the Hot Tile to the destination format
-    ///        and converts from SOA to AOS.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param pDst - Pointer to destination surface or deswizzling buffer.
-    template <size_t NumDests>
-    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
-    {
-        simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
-
-        // clamp
-        const simd16scalar zero = _simd16_setzero_ps();
-        const simd16scalar ones = _simd16_set1_ps(1.0f);
-
-        comp = _simd16_max_ps(comp, zero);
-        comp = _simd16_min_ps(comp, ones);
-
-        // normalize
-        comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
-
-        simd16scalari temp = _simd16_cvtps_epi32(comp);
-
-        // swizzle
-        temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
-
-        // merge/store data into destination but don't overwrite the X8 bits
-        simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
-        simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
-
-        simd16scalari dest = _simd16_setzero_si();
-
-        dest = _simd16_insert_si(dest, destlo, 0);
-        dest = _simd16_insert_si(dest, desthi, 1);
-
-        simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
-
-        dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
-
-        _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
-        _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
-    }
-};
-
-template<SWR_FORMAT DstFormat>
-INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
-{
-    // swizzle rgba -> bgra while we load
-    simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
-    simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
-    simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
-    simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa
-
-    // clamp
-    const simd16scalar zero = _simd16_setzero_ps();
-    const simd16scalar ones = _simd16_set1_ps(1.0f);
-
-    comp0 = _simd16_max_ps(comp0, zero);
-    comp0 = _simd16_min_ps(comp0, ones);
-
-    comp1 = _simd16_max_ps(comp1, zero);
-    comp1 = _simd16_min_ps(comp1, ones);
-
-    comp2 = _simd16_max_ps(comp2, zero);
-    comp2 = _simd16_min_ps(comp2, ones);
-
-    comp3 = _simd16_max_ps(comp3, zero);
-    comp3 = _simd16_min_ps(comp3, ones);
-
-    // gamma-correct only rgb
-    if (FormatTraits<DstFormat>::isSRGB)
-    {
-        comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
-        comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
-        comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
-    }
-
-    // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
-    comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
-    comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
-    comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
-    comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
-
-    // moving to 16 wide integer vector types
-    simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
-    simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
-    simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
-    simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa
-
-    // SOA to AOS conversion
-    src1 = _simd16_slli_epi32(src1,  8);
-    src2 = _simd16_slli_epi32(src2, 16);
-    src3 = _simd16_slli_epi32(src3, 24);
-
-    simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3));  // 0 1 2 3 4 5 6 7 8 9 A B C D E F
-
-    // de-swizzle conversion
-#if 1
-    simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
-    simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F
-
-    final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F
-
-#else
-    final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
-
-#endif
-    // store 8x2 memory order:
-    //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
-    //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
-    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
-    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
-}
-
-template<SWR_FORMAT DstFormat>
-INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
-{
-    static const uint32_t offset = sizeof(simdscalar);
-
-    // swizzle rgba -> bgra while we load
-    simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
-    simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
-    simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
-    simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
-
-    // clamp
-    vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
-    vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
-
-    vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
-    vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
-
-    vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
-    vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
-
-    vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
-    vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
-
-    if (FormatTraits<DstFormat>::isSRGB)
-    {
-        // Gamma-correct only rgb
-        vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
-        vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
-        vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
-    }
-
-    // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
-    vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
-    vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
-    vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
-    vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
-
-    // moving to 8 wide integer vector types
-    simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
-    simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
-    simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
-    simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
-
-#if KNOB_ARCH <= KNOB_ARCH_AVX
-
-    // splitting into two sets of 4 wide integer vector types
-    // because AVX doesn't have instructions to support this operation at 8 wide
-    simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
-    simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
-    simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
-    simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
-
-    simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
-    simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
-    simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
-    simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
-
-    srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
-    srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
-    srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
-    srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
-    srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
-    srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
-
-    srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
-    srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
-
-    srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
-    srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
-
-    srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
-    srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
-
-    // unpack into rows that get the tiling order correct
-    simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);  // abgrabgrabgrabgrabgrabgrabgrabgr
-    simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
-
-    simdscalari final = _mm256_castsi128_si256(vRow00);
-    final = _mm256_insertf128_si256(final, vRow10, 1);
-
-#else
-
-    // logic is as above, only wider
-    src1 = _mm256_slli_si256(src1, 1);
-    src2 = _mm256_slli_si256(src2, 2);
-    src3 = _mm256_slli_si256(src3, 3);
-
-    src0 = _mm256_or_si256(src0, src1);
-    src2 = _mm256_or_si256(src2, src3);
-
-    simdscalari final = _mm256_or_si256(src0, src2);
-
-    // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
-    final = _mm256_permute4x64_epi64(final, 0xD8);
-#endif
-
-    _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
-}
-
-template<SWR_FORMAT DstFormat>
-INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
-{
-    // swizzle rgba -> bgra while we load
-    simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
-    simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
-    simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
-
-    // clamp
-    const simd16scalar zero = _simd16_setzero_ps();
-    const simd16scalar ones = _simd16_set1_ps(1.0f);
-
-    comp0 = _simd16_max_ps(comp0, zero);
-    comp0 = _simd16_min_ps(comp0, ones);
-
-    comp1 = _simd16_max_ps(comp1, zero);
-    comp1 = _simd16_min_ps(comp1, ones);
-
-    comp2 = _simd16_max_ps(comp2, zero);
-    comp2 = _simd16_min_ps(comp2, ones);
-
-    // gamma-correct only rgb
-    if (FormatTraits<DstFormat>::isSRGB)
-    {
-        comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
-        comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
-        comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
-    }
-
-    // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
-    comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
-    comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
-    comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
-
-    // moving to 16 wide integer vector types
-    simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
-    simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
-    simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
-
-    // SOA to AOS conversion
-    src1 = _simd16_slli_epi32(src1,  8);
-    src2 = _simd16_slli_epi32(src2, 16);
-
-    simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2);                       // 0 1 2 3 4 5 6 7 8 9 A B C D E F
-
-    // de-swizzle conversion
-#if 1
-    simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
-    simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F
-
-    final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F
-
-#else
-    final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
-
-#endif
-    // store 8x2 memory order:
-    //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
-    //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
-    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
-    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
-}
-
-template<SWR_FORMAT DstFormat>
-INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) // 8-wide overload: packs channels 0..2 of 8 SOA float pixels into 32-bit AOS pixels; the alpha plane of pSrc is never read
-{
-    static const uint32_t offset = sizeof(simdscalar); // byte stride between consecutive channel planes in pSrc
-
-    // load channels 0..2 through DstFormat's swizzle (this is what reorders rgba -> bgra for the B8G8R8* destinations)
-    simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
-    simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
-    simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
-    // clamp each channel to [0.0f, 1.0f]
-    vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
-    vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
-
-    vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
-    vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
-
-    vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
-    vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
-
-    if (FormatTraits<DstFormat>::isSRGB)
-    {
-        // Gamma-correct only rgb (uses the R32G32B32A32_FLOAT traits, i.e. the source format's conversion)
-        vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
-        vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
-        vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
-    }
-
-    // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
-    vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
-    vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
-    vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
-
-    // moving to 8 wide integer vector types
-    simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
-    simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
-    simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
-
-#if KNOB_ARCH <= KNOB_ARCH_AVX
-
-    // splitting into two sets of 4 wide integer vector types
-    // because AVX doesn't have instructions to support this operation at 8 wide
-    simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
-    simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
-    simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
-
-    simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
-    simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
-    simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
-
-    srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
-    srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
-    srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
-    srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
-
-    srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
-
-    srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
-
-    srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
-    srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
-
-    // unpack into rows that get the tiling order correct
-    simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);  // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
-    simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
-
-    simdscalari final = _mm256_castsi128_si256(vRow00);
-    final = _mm256_insertf128_si256(final, vRow10, 1);
-
-#else
-
-    // logic is as above, only wider (the byte shifts operate within each 128-bit lane)
-    src1 = _mm256_slli_si256(src1, 1);
-    src2 = _mm256_slli_si256(src2, 2);
-
-    src0 = _mm256_or_si256(src0, src1);
-
-    simdscalari final = _mm256_or_si256(src0, src2);
-
-    // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
-    final = _mm256_permute4x64_epi64(final, 0xD8);
-
-#endif
-
-    _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final); // presumably upper 128 bits -> pDst1, lower 128 bits -> pDst (as with _mm256_storeu2_m128i) - confirm
-}
-
-template<>
-struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM> // R32G32B32A32_FLOAT SOA -> B8G8R8A8_UNORM AOS, delegated to FlatConvert
-{
-    template <size_t NumDests>
-    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) // NOTE(review): reads ppDsts[0..3], so NumDests is assumed to be >= 4
-    {
-        FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
-    }
-};
-
-template<>
-struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM> // R32G32B32A32_FLOAT SOA -> B8G8R8X8_UNORM AOS, delegated to FlatConvertNoAlpha
-{
-    template <size_t NumDests>
-    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) // NOTE(review): reads ppDsts[0..3], so NumDests is assumed to be >= 4
-    {
-        FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); // four-destination overload, not the two-destination 8-wide one
-    }
-};
-
-template<>
-struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > // R32G32B32A32_FLOAT SOA -> B8G8R8A8_UNORM_SRGB AOS, delegated to FlatConvert
-{
-    template <size_t NumDests>
-    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) // NOTE(review): reads ppDsts[0..3], so NumDests is assumed to be >= 4
-    {
-        FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
-    }
-};
-
-template<>
-struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > // R32G32B32A32_FLOAT SOA -> B8G8R8X8_UNORM_SRGB AOS, delegated to FlatConvertNoAlpha
-{
-    template <size_t NumDests>
-    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) // NOTE(review): reads ppDsts[0..3], so NumDests is assumed to be >= 4
-    {
-        FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); // four-destination overload, not the two-destination 8-wide one
-    }
-};
-
-template<>
-struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM > // R32G32B32A32_FLOAT SOA -> R8G8B8A8_UNORM AOS, delegated to FlatConvert
-{
-    template <size_t NumDests>
-    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) // NOTE(review): reads ppDsts[0..3], so NumDests is assumed to be >= 4
-    {
-        FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
-    }
-};
-
-template<>
-struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > // R32G32B32A32_FLOAT SOA -> R8G8B8X8_UNORM AOS, delegated to FlatConvertNoAlpha
-{
-    template <size_t NumDests>
-    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) // NOTE(review): reads ppDsts[0..3], so NumDests is assumed to be >= 4
-    {
-        FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); // four-destination overload, not the two-destination 8-wide one
-    }
-};
-
-template<>
-struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB > // R32G32B32A32_FLOAT SOA -> R8G8B8A8_UNORM_SRGB AOS, delegated to FlatConvert
-{
-    template <size_t NumDests>
-    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) // NOTE(review): reads ppDsts[0..3], so NumDests is assumed to be >= 4
-    {
-        FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
-    }
-};
-
-template<>
-struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > // R32G32B32A32_FLOAT SOA -> R8G8B8X8_UNORM_SRGB AOS, delegated to FlatConvertNoAlpha
-{
-    template <size_t NumDests>
-    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) // NOTE(review): reads ppDsts[0..3], so NumDests is assumed to be >= 4
-    {
-        FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); // four-destination overload, not the two-destination 8-wide one
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// StoreRasterTile - generic path: converts and writes a raster tile one pixel at a time
-//////////////////////////////////////////////////////////////////////////
-template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct StoreRasterTile
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Retrieve color from hot tile source which is always float.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param x, y - Pixel coordinates within the raster tile (Store/Resolve pass rx, ry).
-    /// @param outputColor - receives the four color components read for pixel (x, y)
-    INLINE static void GetSwizzledSrcColor(
-        uint8_t* pSrc,
-        uint32_t x, uint32_t y,
-        float outputColor[4])
-    {
-        typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
-
-        SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc); // the raster tile is viewed as an array of simd16 tiles
-
-        // Compute which simd tile we're accessing within 8x8 tile.
-        //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
-        uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
-
-        SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
-
-        uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM); // row-major lane index of the pixel inside its simd16 tile
-
-        pSimdTile->GetSwizzledColor(simdOffset, outputColor);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores an 8x8 raster tile to the destination surface.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param pDstSurface - Destination surface state
-    /// @param x, y - Coordinates to raster tile. sampleNum is forwarded to ComputeSurfaceAddress; renderTargetArrayIndex is added to the surface's arrayIndex.
-    INLINE static void Store(
-        uint8_t *pSrc,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
-    {
-        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); // dimensions of the selected mip level, never below 1
-        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
-
-        // For each raster tile pixel (rx, ry)
-        for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
-        {
-            for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
-            {
-                // Skip pixels that fall outside the selected mip level.
-                if (((x + rx) < lodWidth) &&
-                    ((y + ry) < lodHeight))
-                {
-                    float srcColor[4];
-                    GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
-
-                    uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
-                        pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
-                        sampleNum, pDstSurface->lod, pDstSurface);
-                    {
-                        ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
-                    }
-                }
-            }
-        }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Resolves an 8x8 raster tile to the resolve destination surface.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param pDstSurface - Destination surface state
-    /// @param x, y - Coordinates to raster tile. renderTargetArrayIndex is added to the resolve surface's arrayIndex.
-    /// @param sampleOffset - Byte offset between consecutive samples' tile data in pSrc
-    INLINE static void Resolve(
-        uint8_t *pSrc,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t sampleOffset, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
-    {
-        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
-        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
-
-        float oneOverNumSamples = 1.0f / pDstSurface->numSamples; // NOTE(review): divides by numSamples - presumably callers guarantee it is >= 1
-
-        // For each raster tile pixel (rx, ry)
-        for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
-        {
-            for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
-            {
-                // Perform bounds checking. NOTE(review): bounds come from pDstSurface but the write below targets pResolveSurface - presumably both have matching dimensions; confirm.
-                if (((x + rx) < lodWidth) &&
-                        ((y + ry) < lodHeight))
-                {
-                    // Sum across samples
-                    float resolveColor[4] = {0};
-                    for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
-                    {
-                        float sampleColor[4] = {0};
-                        uint8_t *pSampleSrc = pSrc + sampleOffset * sampleNum;
-                        GetSwizzledSrcColor(pSampleSrc, rx, ry, sampleColor);
-                        resolveColor[0] += sampleColor[0];
-                        resolveColor[1] += sampleColor[1];
-                        resolveColor[2] += sampleColor[2];
-                        resolveColor[3] += sampleColor[3];
-                    }
-
-                    // Divide by numSamples to average
-                    resolveColor[0] *= oneOverNumSamples;
-                    resolveColor[1] *= oneOverNumSamples;
-                    resolveColor[2] *= oneOverNumSamples;
-                    resolveColor[3] *= oneOverNumSamples;
-
-                    // Use the resolve surface state, which is reached through pDstSurface->xpAuxBaseAddress; sample 0 of it is written
-                    SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress;
-                    uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
-                        pResolveSurface->arrayIndex + renderTargetArrayIndex, pResolveSurface->arrayIndex + renderTargetArrayIndex,
-                        0, pResolveSurface->lod, pResolveSurface);
-                    {
-                        ConvertPixelFromFloat<DstFormat>(pDst, resolveColor);
-                    }
-                }
-            }
-        }
-    }
-
-};
-
-template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
-{}; // primary template: adds nothing, so it inherits the generic per-pixel Store/Resolve from StoreRasterTile
-
-//////////////////////////////////////////////////////////////////////////
-/// OptStoreRasterTile - SWR_TILE_NONE specialization for 8bpp (destination rows addressed by surface pitch)
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
-{
-    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
-    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
-    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores an 8x8 raster tile to the destination surface.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param pDstSurface - Destination surface state
-    /// @param x, y - Coordinates to raster tile. sampleNum is forwarded to ComputeSurfaceAddress; renderTargetArrayIndex is added to the surface's arrayIndex.
-    INLINE static void Store(
-        uint8_t *pSrc,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
-    {
-        // Punt non-full tiles to generic store (it bounds-checks every pixel; the code below does not)
-        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
-        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
-
-        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
-        {
-            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
-        }
-
-        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
-            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
-        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; // destination bytes covered by one simd16 tile row
-        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; // down one simd16 tile, minus the horizontal advance the inner loop already applied
-
-        uint8_t* ppDsts[] =
-        {
-            pDst,                                           // row 0, col 0
-            pDst + pDstSurface->pitch,                      // row 1, col 0
-            pDst + dx / 2,                                  // row 0, col 1
-            pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
-        };
-
-        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
-        {
-            for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
-            {
-                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
-                pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; // advance to the next simd16 tile in the source
-
-                ppDsts[0] += dx;
-                ppDsts[1] += dx;
-                ppDsts[2] += dx;
-                ppDsts[3] += dx;
-            }
-
-            ppDsts[0] += dy;
-            ppDsts[1] += dy;
-            ppDsts[2] += dy;
-            ppDsts[3] += dy;
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// OptStoreRasterTile - SWR_TILE_NONE specialization for 16bpp (destination rows addressed by surface pitch)
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
-{
-    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
-    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
-    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores an 8x8 raster tile to the destination surface.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param pDstSurface - Destination surface state
-    /// @param x, y - Coordinates to raster tile. sampleNum is forwarded to ComputeSurfaceAddress; renderTargetArrayIndex is added to the surface's arrayIndex.
-    INLINE static void Store(
-        uint8_t *pSrc,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
-    {
-        // Punt non-full tiles to generic store (it bounds-checks every pixel; the code below does not)
-        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
-        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
-
-        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
-        {
-            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
-        }
-
-        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
-            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
-        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; // destination bytes covered by one simd16 tile row
-        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; // down one simd16 tile, minus the horizontal advance the inner loop already applied
-
-        uint8_t* ppDsts[] =
-        {
-            pDst,                                           // row 0, col 0
-            pDst + pDstSurface->pitch,                      // row 1, col 0
-            pDst + dx / 2,                                  // row 0, col 1
-            pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
-        };
-
-        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
-        {
-            for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
-            {
-                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
-                pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; // advance to the next simd16 tile in the source
-
-                ppDsts[0] += dx;
-                ppDsts[1] += dx;
-                ppDsts[2] += dx;
-                ppDsts[3] += dx;
-            }
-
-            ppDsts[0] += dy;
-            ppDsts[1] += dy;
-            ppDsts[2] += dy;
-            ppDsts[3] += dy;
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// OptStoreRasterTile - SWR_TILE_NONE specialization for 32bpp (destination rows addressed by surface pitch)
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
-{
-    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
-    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
-    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores an 8x8 raster tile to the destination surface.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param pDstSurface - Destination surface state
-    /// @param x, y - Coordinates to raster tile. sampleNum is forwarded to ComputeSurfaceAddress; renderTargetArrayIndex is added to the surface's arrayIndex.
-    INLINE static void Store(
-        uint8_t *pSrc,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
-    {
-        // Punt non-full tiles to generic store (it bounds-checks every pixel; the code below does not)
-        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
-        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
-
-        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
-        {
-            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
-        }
-
-        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
-            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
-        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; // destination bytes covered by one simd16 tile row
-        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; // down one simd16 tile, minus the horizontal advance the inner loop already applied
-
-        uint8_t* ppDsts[] =
-        {
-            pDst,                                           // row 0, col 0
-            pDst + pDstSurface->pitch,                      // row 1, col 0
-            pDst + dx / 2,                                  // row 0, col 1
-            pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
-        };
-
-        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
-        {
-            for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
-            {
-                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
-                pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; // advance to the next simd16 tile in the source
-
-                ppDsts[0] += dx;
-                ppDsts[1] += dx;
-                ppDsts[2] += dx;
-                ppDsts[3] += dx;
-            }
-
-            ppDsts[0] += dy;
-            ppDsts[1] += dy;
-            ppDsts[2] += dy;
-            ppDsts[3] += dy;
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// OptStoreRasterTile - SWR_TILE_NONE specialization for 64bpp (destination rows addressed by surface pitch)
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
-{
-    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
-    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
-    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
-    static const size_t MAX_DST_COLUMN_BYTES = 16; // widest destination span handed to ConvertPixelsSOAtoAOS per pointer
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores an 8x8 raster tile to the destination surface.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param pDstSurface - Destination surface state
-    /// @param x, y - Coordinates to raster tile. sampleNum is forwarded to ComputeSurfaceAddress; renderTargetArrayIndex is added to the surface's arrayIndex.
-    INLINE static void Store(
-        uint8_t *pSrc,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
-    {
-        // Punt non-full tiles to generic store (it bounds-checks every pixel; the code below does not)
-        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
-        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
-
-        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
-        {
-            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
-        }
-
-        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
-            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
-        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; // destination bytes per simd16 tile row; only consumed by the static_assert below
-        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; // down one simd16 tile; no horizontal term because there is no inner x loop
-
-        // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
-        static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
-
-        uint8_t *ppDsts[] =
-        {
-            pDst,                                                               // row 0, col 0
-            pDst + pDstSurface->pitch,                                          // row 1, col 0
-            pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
-            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
-            pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
-            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
-            pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
-            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3                // row 1, col 3
-        };
-
-        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
-        {
-            // Raster tile width is same as simd16 tile width
-            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
-
-            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
-            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; // advance to the next simd16 tile in the source
-
-            for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
-            {
-                ppDsts[i] += dy;
-            }
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// OptStoreRasterTile - SWR_TILE_NONE specialization for 128bpp (destination rows addressed by surface pitch)
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
-{
-    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
-    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
-    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
-    static const size_t MAX_DST_COLUMN_BYTES = 16; // widest destination span handed to ConvertPixelsSOAtoAOS per pointer
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores an 8x8 raster tile to the destination surface.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param pDstSurface - Destination surface state
-    /// @param x, y - Coordinates to raster tile. sampleNum is forwarded to ComputeSurfaceAddress; renderTargetArrayIndex is added to the surface's arrayIndex.
-    INLINE static void Store(
-        uint8_t *pSrc,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
-    {
-        // Punt non-full tiles to generic store (it bounds-checks every pixel; the code below does not)
-        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
-        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
-
-        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
-        {
-            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
-        }
-
-        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
-            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
-        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; // destination bytes per simd16 tile row; only consumed by the static_assert below
-        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; // down one simd16 tile; no horizontal term because there is no inner x loop
-
-        // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
-        static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
-
-        uint8_t* ppDsts[] =
-        {
-            pDst,                                                               // row 0, col 0
-            pDst + pDstSurface->pitch,                                          // row 1, col 0
-            pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
-            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
-            pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
-            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
-            pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
-            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3,               // row 1, col 3
-            pDst + MAX_DST_COLUMN_BYTES * 4,                                    // row 0, col 4
-            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4,               // row 1, col 4
-            pDst + MAX_DST_COLUMN_BYTES * 5,                                    // row 0, col 5
-            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5,               // row 1, col 5
-            pDst + MAX_DST_COLUMN_BYTES * 6,                                    // row 0, col 6
-            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6,               // row 1, col 6
-            pDst + MAX_DST_COLUMN_BYTES * 7,                                    // row 0, col 7
-            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7,               // row 1, col 7
-        };
-
-        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
-        {
-            // Raster tile width is same as simd16 tile width
-            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
-
-            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
-            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; // advance to the next simd16 tile in the source
-
-            for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
-            {
-                ppDsts[i] += dy;
-            }
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 8bpp
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
-{
-    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
-    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores an 8x8 raster tile to the destination surface.
-    /// @param pSrc - Pointer to raster tile.
-    /// @param pDstSurface - Destination surface state
-    /// @param x, y - Coordinates to raster tile. sampleNum is forwarded to ComputeSurfaceAddress; renderTargetArrayIndex is added to the surface's arrayIndex.
-    INLINE static void Store(
-        uint8_t *pSrc,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
-    {
-        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
-
-        // Punt non-full tiles to generic store (it bounds-checks every pixel; the code below does not)
-        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
-        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
-
-        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
-        {
-            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
-        }
-
-        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
-        // We can compute the offsets to each column within the raster tile once and increment from these.
-        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
-        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
-            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
-        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; // rows are a fixed 16B apart here, so the surface pitch is not involved
-
-        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
-        uint8_t *ppDsts[] =
-        {
-            pDst, // row 0, first half
-            pDst + DestRowWidthBytes, // row 1 (next 16B row), first half
-            pDst + DestRowWidthBytes / 4, // row 0, second half (4 bytes in)
-            pDst + DestRowWidthBytes + DestRowWidthBytes / 4 // row 1, second half
-        };
-
-        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
-        {
-            // Raster tile width is same as simd16 tile width
-            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
-
-            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
-            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; // advance to the next simd16 tile in the source
-
-            ppDsts[0] += dy;
-            ppDsts[1] += dy;
-            ppDsts[2] += dy;
-            ppDsts[3] += dy;
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 16bpp
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
-{
-    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile; // fallback used for partial tiles
-    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; // source bits-per-pixel expressed in bytes
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores an 8x8 raster tile to the destination surface; tiles that extend past the LOD's width/height go to GenericStoreTile::Store.
-    /// @param pSrc - Pointer to raster tile; advanced by KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL after each Convert() call.
-    /// @param pDstSurface - Destination surface state (width, height, lod and arrayIndex are read here).
-    /// @param x, y - Coordinates of the raster tile; compared against the LOD's width/height and passed to ComputeSurfaceAddress().
-    INLINE static void Store(
-        uint8_t *pSrc,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // sampleNum, renderTargetArrayIndex: forwarded to ComputeSurfaceAddress() / GenericStoreTile::Store()
-    {
-        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
-
-        // Punt non-full tiles to generic store. The LOD dimensions are the base dimensions shifted by lod, clamped to at least 1.
-        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
-        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
-
-        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
-        {
-            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
-        }
-
-        // TileY is a column-major tiling mode where each 4KB tile consists of 8 columns of 32 x 16B rows.
-        // We can compute the offsets to each column within the raster tile once and increment from these.
-        // There will be four 8x2 simd tiles in an 8x8 raster tile, one per iteration of the loop below.
-        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
-            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
-        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; // bytes to step down by one simd16 tile (SIMD16_TILE_Y_DIM rows of 16B)
-
-        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
-        uint8_t *ppDsts[] =
-        {
-            pDst,                                             // row 0, first 8 bytes
-            pDst + DestRowWidthBytes,                         // row 1, first 8 bytes
-            pDst + DestRowWidthBytes / 2,                     // row 0, +8 bytes (next 4 pixels at 16bpp)
-            pDst + DestRowWidthBytes + DestRowWidthBytes / 2  // row 1, +8 bytes
-        };
-
-        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
-        {
-            // Raster tile width is same as simd16 tile width, so a single Convert() covers the full tile width
-            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
-
-            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
-            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; // next simd16 tile in the source
-
-            ppDsts[0] += dy;
-            ppDsts[1] += dy;
-            ppDsts[2] += dy;
-            ppDsts[3] += dy;
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// OptStoreRasterTile - SWR_TILE_MODE_XMAJOR specialization for 32bpp
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
-{
-    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; // fallback used for partial tiles
-    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; // source bits-per-pixel expressed in bytes
-    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; // destination bits-per-pixel expressed in bytes
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores an 8x8 raster tile to the destination surface; tiles that extend past the LOD's width/height go to GenericStoreTile::Store.
-    /// @param pSrc - Pointer to raster tile; advanced by KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL after each Convert() call.
-    /// @param pDstSurface - Destination surface state (width, height, lod and arrayIndex are read here).
-    /// @param x, y - Coordinates of the raster tile; compared against the LOD's width/height and passed to ComputeSurfaceAddress().
-    INLINE static void Store(
-        uint8_t *pSrc,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // sampleNum, renderTargetArrayIndex: forwarded to ComputeSurfaceAddress() / GenericStoreTile::Store()
-    {
-        static const uint32_t DestRowWidthBytes = 512;                   // 512B rows
-
-        // Punt non-full tiles to generic store. The LOD dimensions are the base dimensions shifted by lod, clamped to at least 1.
-        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
-        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
-
-        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
-        {
-            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
-        }
-
-        // TileX is a row-major tiling mode where each 4KB tile consists of 8 x 512B rows.
-        // We can compute the offsets to each row/column within the raster tile once and increment from these.
-        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
-            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
-        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; // destination bytes spanned by one simd16 tile along a row
-        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; // step down one simd16 tile and undo the x advance accumulated by the inner loop
-
-        uint8_t* ppDsts[] =
-        {
-            pDst,                                           // row 0, col 0
-            pDst + DestRowWidthBytes,                       // row 1, col 0
-            pDst + dx / 2,                                  // row 0, col 1
-            pDst + DestRowWidthBytes + dx / 2               // row 1, col 1
-        };
-
-        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
-        {
-            for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
-            {
-                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
-                pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; // next simd16 tile in the source
-
-                ppDsts[0] += dx;
-                ppDsts[1] += dx;
-                ppDsts[2] += dx;
-                ppDsts[3] += dx;
-            }
-
-            ppDsts[0] += dy;
-            ppDsts[1] += dy;
-            ppDsts[2] += dy;
-            ppDsts[3] += dy;
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 32bpp
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
-{
-    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; // fallback used for partial tiles
-    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; // source bits-per-pixel expressed in bytes
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores an 8x8 raster tile to the destination surface; tiles that extend past the LOD's width/height go to GenericStoreTile::Store.
-    /// @param pSrc - Pointer to raster tile; advanced by KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL after each Convert() call.
-    /// @param pDstSurface - Destination surface state (width, height, lod and arrayIndex are read here).
-    /// @param x, y - Coordinates of the raster tile; compared against the LOD's width/height and passed to ComputeSurfaceAddress().
-    INLINE static void Store(
-        uint8_t *pSrc,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // sampleNum, renderTargetArrayIndex: forwarded to ComputeSurfaceAddress() / GenericStoreTile::Store()
-    {
-        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
-        static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
-
-        // Punt non-full tiles to generic store. The LOD dimensions are the base dimensions shifted by lod, clamped to at least 1.
-        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
-        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
-
-        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
-        {
-            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
-        }
-
-        // TileY is a column-major tiling mode where each 4KB tile consists of 8 columns of 32 x 16B rows.
-        // We can compute the offsets to each column within the raster tile once and increment from these.
-        // There will be four 8x2 simd tiles in an 8x8 raster tile, one per iteration of the loop below.
-        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
-            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
-        // We have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation).
-        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; // bytes to step down by one simd16 tile within a column
-
-        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
-        uint8_t *ppDsts[] =
-        {
-            pDst,                                           // row 0, col 0
-            pDst + DestRowWidthBytes,                       // row 1, col 0
-            pDst + DestColumnBytes,                         // row 0, col 1
-            pDst + DestRowWidthBytes + DestColumnBytes      // row 1, col 1
-        };
-
-        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
-        {
-            // Raster tile width is same as simd16 tile width, so a single Convert() covers the full tile width
-            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
-
-            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
-            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; // next simd16 tile in the source
-
-            ppDsts[0] += dy;
-            ppDsts[1] += dy;
-            ppDsts[2] += dy;
-            ppDsts[3] += dy;
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 64bpp
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
-{
-    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile; // fallback used for partial tiles
-    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; // source bits-per-pixel expressed in bytes
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores an 8x8 raster tile to the destination surface; tiles that extend past the LOD's width/height go to GenericStoreTile::Store.
-    /// @param pSrc - Pointer to raster tile; advanced by KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL after each Convert() call.
-    /// @param pDstSurface - Destination surface state (width, height, lod and arrayIndex are read here).
-    /// @param x, y - Coordinates of the raster tile; compared against the LOD's width/height and passed to ComputeSurfaceAddress().
-    INLINE static void Store(
-        uint8_t *pSrc,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // sampleNum, renderTargetArrayIndex: forwarded to ComputeSurfaceAddress() / GenericStoreTile::Store()
-    {
-        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
-        static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
-
-        // Punt non-full tiles to generic store. The LOD dimensions are the base dimensions shifted by lod, clamped to at least 1.
-        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
-        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
-
-        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
-        {
-            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
-        }
-
-        // TileY is a column-major tiling mode where each 4KB tile consists of 8 columns of 32 x 16B rows.
-        // We can compute the offsets to each column within the raster tile once and increment from these.
-        // There will be four 8x2 simd tiles in an 8x8 raster tile, one per iteration of the loop below.
-        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
-            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
-        // We have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation).
-        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; // bytes to step down by one simd16 tile within a column
-
-        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
-        uint8_t *ppDsts[] =
-        {
-            pDst,                                           // row 0, col 0
-            pDst + DestRowWidthBytes,                       // row 1, col 0
-            pDst + DestColumnBytes,                         // row 0, col 1
-            pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
-            pDst + DestColumnBytes * 2,                     // row 0, col 2
-            pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
-            pDst + DestColumnBytes * 3,                     // row 0, col 3
-            pDst + DestRowWidthBytes + DestColumnBytes * 3  // row 1, col 3
-        };
-
-        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
-        {
-            // Raster tile width is same as simd16 tile width, so a single Convert() covers the full tile width
-            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
-
-            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
-            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; // next simd16 tile in the source
-
-            for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) // advance every destination pointer by the same row step
-            {
-                ppDsts[i] += dy;
-            }
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
-{
-    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile; // fallback used for partial tiles
-    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; // source bits-per-pixel expressed in bytes
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores an 8x8 raster tile to the destination surface; tiles that extend past the LOD's width/height go to GenericStoreTile::Store.
-    /// @param pSrc - Pointer to raster tile; advanced by KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL after each Convert() call.
-    /// @param pDstSurface - Destination surface state (width, height, lod and arrayIndex are read here).
-    /// @param x, y - Coordinates of the raster tile; compared against the LOD's width/height and passed to ComputeSurfaceAddress().
-    INLINE static void Store(
-        uint8_t *pSrc,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // sampleNum, renderTargetArrayIndex: forwarded to ComputeSurfaceAddress() / GenericStoreTile::Store()
-    {
-        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
-        static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
-
-        // Punt non-full tiles to generic store. The LOD dimensions are the base dimensions shifted by lod, clamped to at least 1.
-        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
-        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
-
-        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
-        {
-            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
-        }
-
-        // TileY is a column-major tiling mode where each 4KB tile consists of 8 columns of 32 x 16B rows.
-        // We can compute the offsets to each column within the raster tile once and increment from these.
-        // There will be four 8x2 simd tiles in an 8x8 raster tile, one per iteration of the loop below.
-        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
-            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
-        // We have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation).
-        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; // bytes to step down by one simd16 tile within a column
-
-        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
-        uint8_t *ppDsts[] =
-        {
-            pDst,                                           // row 0, col 0
-            pDst + DestRowWidthBytes,                       // row 1, col 0
-            pDst + DestColumnBytes,                         // row 0, col 1
-            pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
-            pDst + DestColumnBytes * 2,                     // row 0, col 2
-            pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
-            pDst + DestColumnBytes * 3,                     // row 0, col 3
-            pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
-            pDst + DestColumnBytes * 4,                     // row 0, col 4
-            pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
-            pDst + DestColumnBytes * 5,                     // row 0, col 5
-            pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
-            pDst + DestColumnBytes * 6,                     // row 0, col 6
-            pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
-            pDst + DestColumnBytes * 7,                     // row 0, col 7
-            pDst + DestRowWidthBytes + DestColumnBytes * 7  // row 1, col 7
-        };
-
-        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
-        {
-            // Raster tile width is same as simd16 tile width, so a single Convert() covers the full tile width
-            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
-
-            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
-            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; // next simd16 tile in the source
-
-            for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1) // advance every destination pointer by the same row step
-            {
-                ppDsts[i] += dy;
-            }
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// StoreMacroTile - Stores a macro tile which consists of raster tiles.
-//////////////////////////////////////////////////////////////////////////
-template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
-struct StoreMacroTile
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores a macrotile to the destination surface using safe implementation (always StoreRasterTile::Store, never the optimized path).
-    /// @param pSrcHotTile - Pointer to macro tile; advanced by one raster tile's worth of source bytes per sample stored.
-    /// @param pDstSurface - Destination surface state (numSamples bounds the per-tile sample loop).
-    /// @param x, y - Coordinates to macro tile; each raster tile is stored at (x + col, y + row).
-    static void StoreGeneric(
-        uint8_t *pSrcHotTile,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) // renderTargetArrayIndex: forwarded unchanged to each raster-tile store
-    {
-        PFN_STORE_TILES_INTERNAL pfnStore;
-        pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
-
-        // Store each raster tile from the hot tile to the destination surface.
-        for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-        {
-            for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-            {
-                for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
-                {
-                    pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
-                    pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); // skip one raster tile of source data
-                }
-            }
-        }
-
-    }
-
-    typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t); // signature shared by StoreRasterTile::Store and OptStoreRasterTile::Store
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Stores a macrotile to the destination surface, choosing per sample between the generic and the optimized raster-tile store.
-    /// @param pSrcHotTile - Pointer to macro tile; advanced by one raster tile's worth of source bytes per sample stored.
-    /// @param pDstSurface - Destination surface state (numSamples, tileMode, bInterleavedSamples, xpAuxBaseAddress are read here).
-    /// @param x, y - Coordinates to macro tile; each raster tile is stored at (x + col, y + row).
-    static void Store(
-        uint8_t *pSrcHotTile,
-        SWR_SURFACE_STATE* pDstSurface,
-        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) // renderTargetArrayIndex: added to pDstSurface->arrayIndex for addressing and forwarded to each store
-    {
-        PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES]; // one store function per sample; NOTE(review): assumes numSamples <= SWR_MAX_NUM_MULTISAMPLES - confirm
-
-        for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
-        {
-            size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
-                0,
-                0,
-                pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
-                pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
-                sampleNum,
-                pDstSurface->lod,
-                pDstSurface);
-
-            // Force the generic store-tile when the surface is tiled (non-linear) and this sample's LOD base address is not 4KB-aligned, or when samples are interleaved.
-            bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
-                (pDstSurface->bInterleavedSamples);
-
-            pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
-        }
-
-        // Save the original hot tile pointer for the resolve pass below; pSrcHotTile itself is advanced by the store loop.
-        uint8_t *pResolveSrcHotTile = pSrcHotTile;
-
-        // Store each raster tile from the hot tile to the destination surface.
-        for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-        {
-            for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-            {
-                for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
-                {
-                    pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
-                    pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); // skip one raster tile of source data
-                }
-            }
-        }
-
-        if (pDstSurface->xpAuxBaseAddress) // second pass, only when an aux base address is set
-        {
-            uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); // source bytes occupied by one sample's raster tile
-            // Call Resolve once per raster tile, walking the hot tile again from its original start.
-            for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-            {
-                for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-                {
-                    StoreRasterTile<TTraits, SrcFormat, DstFormat>::Resolve(pResolveSrcHotTile, pDstSurface, (x + col), (y + row), sampleOffset, renderTargetArrayIndex);
-                    pResolveSrcHotTile += sampleOffset * pDstSurface->numSamples; // skip all samples of this raster tile
-                }
-            }
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// InitStoreTilesTableColor_Half1 - Helper for setting up the tables: fills table[TTileMode][dstFormat] with StoreMacroTile entry points for the color formats listed below. The source format is always R32G32B32A32_FLOAT; the TilingTraits bpp argument varies per destination format.
-template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
-void InitStoreTilesTableColor_Half1(
-    PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
-{
-    table[TTileMode][R32G32B32A32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
-    table[TTileMode][R32G32B32A32_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
-    table[TTileMode][R32G32B32A32_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
-    table[TTileMode][R32G32B32X32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
-    table[TTileMode][R32G32B32A32_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
-    table[TTileMode][R32G32B32A32_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
-    table[TTileMode][R32G32B32_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
-    table[TTileMode][R32G32B32_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
-    table[TTileMode][R32G32B32_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
-    table[TTileMode][R32G32B32_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
-    table[TTileMode][R32G32B32_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
-    table[TTileMode][R16G16B16A16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
-    table[TTileMode][R16G16B16A16_SNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
-    table[TTileMode][R16G16B16A16_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
-    table[TTileMode][R16G16B16A16_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
-    table[TTileMode][R16G16B16A16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
-    table[TTileMode][R32G32_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
-    table[TTileMode][R32G32_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
-    table[TTileMode][R32G32_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
-    table[TTileMode][R32_FLOAT_X8X24_TYPELESS]      = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
-    table[TTileMode][X32_TYPELESS_G8X24_UINT]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
-    table[TTileMode][R16G16B16X16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
-    table[TTileMode][R16G16B16X16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
-    table[TTileMode][R16G16B16A16_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
-    table[TTileMode][R16G16B16A16_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
-    table[TTileMode][R32G32_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
-    table[TTileMode][R32G32_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
-    table[TTileMode][B8G8R8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
-    table[TTileMode][B8G8R8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
-    table[TTileMode][R10G10B10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric; // StoreGeneric entries always go through StoreRasterTile::Store
-    table[TTileMode][R10G10B10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
-    table[TTileMode][R10G10B10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
-    table[TTileMode][R8G8B8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
-    table[TTileMode][R8G8B8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
-    table[TTileMode][R8G8B8A8_SNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
-    table[TTileMode][R8G8B8A8_SINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
-    table[TTileMode][R8G8B8A8_UINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
-    table[TTileMode][R16G16_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
-    table[TTileMode][R16G16_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
-    table[TTileMode][R16G16_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
-    table[TTileMode][R16G16_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
-    table[TTileMode][R16G16_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
-    table[TTileMode][B10G10R10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
-    table[TTileMode][B10G10R10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
-    table[TTileMode][R11G11B10_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
-    table[TTileMode][R10G10B10_FLOAT_A2_UNORM]      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
-    table[TTileMode][R32_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
-    table[TTileMode][R32_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
-    table[TTileMode][R32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
-    table[TTileMode][R24_UNORM_X8_TYPELESS]         = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
-    table[TTileMode][X24_TYPELESS_G8_UINT]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
-    table[TTileMode][A32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
-    table[TTileMode][B8G8R8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
-    table[TTileMode][B8G8R8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
-    table[TTileMode][R8G8B8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
-    table[TTileMode][R8G8B8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
-}
-
-template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
-void InitStoreTilesTableColor_Half2(
-    PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
-{
-    table[TTileMode][R9G9B9E5_SHAREDEXP]            = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
-    table[TTileMode][B10G10R10X2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
-    table[TTileMode][R10G10B10X2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
-    table[TTileMode][R8G8B8A8_SSCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
-    table[TTileMode][R8G8B8A8_USCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
-    table[TTileMode][R16G16_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
-    table[TTileMode][R16G16_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
-    table[TTileMode][R32_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
-    table[TTileMode][R32_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
-    table[TTileMode][B5G6R5_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
-    table[TTileMode][B5G6R5_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
-    table[TTileMode][B5G5R5A1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
-    table[TTileMode][B5G5R5A1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
-    table[TTileMode][B4G4R4A4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
-    table[TTileMode][B4G4R4A4_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
-    table[TTileMode][R8G8_UNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
-    table[TTileMode][R8G8_SNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
-    table[TTileMode][R8G8_SINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
-    table[TTileMode][R8G8_UINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
-    table[TTileMode][R16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
-    table[TTileMode][R16_SNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
-    table[TTileMode][R16_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
-    table[TTileMode][R16_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
-    table[TTileMode][R16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
-    table[TTileMode][A16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
-    table[TTileMode][A16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
-    table[TTileMode][B5G5R5X1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
-    table[TTileMode][B5G5R5X1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
-    table[TTileMode][R8G8_SSCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
-    table[TTileMode][R8G8_USCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
-    table[TTileMode][R16_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
-    table[TTileMode][R16_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
-    table[TTileMode][A1B5G5R5_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
-    table[TTileMode][A4B4G4R4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
-    table[TTileMode][R8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
-    table[TTileMode][R8_SNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
-    table[TTileMode][R8_SINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
-    table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
-    table[TTileMode][A8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
-    table[TTileMode][R8_SSCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
-    table[TTileMode][R8_USCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
-    table[TTileMode][R8G8B8_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
-    table[TTileMode][R8G8B8_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
-    table[TTileMode][R8G8B8_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
-    table[TTileMode][R8G8B8_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
-    table[TTileMode][R16G16B16_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
-    table[TTileMode][R16G16B16_UNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
-    table[TTileMode][R16G16B16_SNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
-    table[TTileMode][R16G16B16_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
-    table[TTileMode][R16G16B16_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
-    table[TTileMode][R8G8B8_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
-    table[TTileMode][R16G16B16_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
-    table[TTileMode][R16G16B16_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
-    table[TTileMode][R10G10B10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
-    table[TTileMode][R10G10B10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
-    table[TTileMode][R10G10B10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
-    table[TTileMode][R10G10B10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
-    table[TTileMode][B10G10R10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
-    table[TTileMode][B10G10R10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
-    table[TTileMode][B10G10R10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
-    table[TTileMode][B10G10R10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
-    table[TTileMode][B10G10R10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
-    table[TTileMode][R8G8B8_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
-    table[TTileMode][R8G8B8_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// InitStoreTilesTableDepth - Helper template for setting up the depth store-tiles table.
-template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
-void InitStoreTilesTableDepth(
-    PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
-{
-   table[TTileMode][R32_FLOAT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
-   table[TTileMode][R32_FLOAT_X8X24_TYPELESS]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
-   table[TTileMode][R24_UNORM_X8_TYPELESS]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
-   table[TTileMode][R16_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
-}
-
-template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
-void InitStoreTilesTableStencil(
-    PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
-{
-    table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
-}
-
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Deswizzles and stores a full hottile to a render surface
-/// @param hWorkerPrivateData - Handle to worker private data
-/// @param srcFormat - Format for hot tile.
-/// @param renderTargetIndex - Index to destination render target
-/// @param x, y - Coordinates to raster tile.
-/// @param pSrcHotTile - Pointer to Hot Tile
-void SwrStoreHotTileToSurface(
-        HANDLE hWorkerPrivateData,
-        SWR_SURFACE_STATE *pDstSurface,
-	 BucketManager* pBucketMgr,
-        SWR_FORMAT srcFormat,
-        SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
-        uint8_t *pSrcHotTile);
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear.cpp
deleted file mode 100644
index c72063f6f1d..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file StoreTile_Linear.cpp
-*
-* @brief Functionality for Store.
-*
-******************************************************************************/
-#include "StoreTile.h"
-
-void InitStoreTilesTable_Linear_1()
-{
-    InitStoreTilesTableColor_Half1<SWR_TILE_NONE>(sStoreTilesTableColor);
-    InitStoreTilesTableDepth<SWR_TILE_NONE>(sStoreTilesTableDepth);
-    InitStoreTilesTableStencil<SWR_TILE_NONE>(sStoreTilesTableStencil);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear2.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear2.cpp
deleted file mode 100644
index 035e685e261..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear2.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file StoreTile_Linear2.cpp
-*
-* @brief Functionality for Store.
-*
-******************************************************************************/
-#include "StoreTile.h"
-
-void InitStoreTilesTable_Linear_2()
-{
-    InitStoreTilesTableColor_Half2<SWR_TILE_NONE>(sStoreTilesTableColor);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileW.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileW.cpp
deleted file mode 100644
index ee4d99d1da0..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileW.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file StoreTile_TileW.cpp
-*
-* @brief Functionality for Store.
-*
-******************************************************************************/
-#include "StoreTile.h"
-
-void InitStoreTilesTable_TileW()
-{
-    InitStoreTilesTableStencil<SWR_TILE_MODE_WMAJOR>(sStoreTilesTableStencil);
-    // special color hot tile -> 8-bit WMAJOR
-    sStoreTilesTableColor[SWR_TILE_MODE_WMAJOR][R8_UINT] = StoreMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX.cpp
deleted file mode 100644
index 7f49a432e92..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file StoreTile_TileX.cpp
-*
-* @brief Functionality for Store.
-*
-******************************************************************************/
-#include "StoreTile.h"
-
-void InitStoreTilesTable_TileX_1()
-{
-    InitStoreTilesTableColor_Half1<SWR_TILE_MODE_XMAJOR>(sStoreTilesTableColor);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX2.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX2.cpp
deleted file mode 100644
index 7e36ebececb..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX2.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file StoreTile_TileX2.cpp
-*
-* @brief Functionality for Store.
-*
-******************************************************************************/
-#include "StoreTile.h"
-
-void InitStoreTilesTable_TileX_2()
-{
-    InitStoreTilesTableColor_Half2<SWR_TILE_MODE_XMAJOR>(sStoreTilesTableColor);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY.cpp
deleted file mode 100644
index dade03f2523..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file StoreTile_TileY.cpp
-*
-* @brief Functionality for Store.
-*
-******************************************************************************/
-#include "StoreTile.h"
-
-void InitStoreTilesTable_TileY_1()
-{
-    InitStoreTilesTableColor_Half1<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableColor);
-    InitStoreTilesTableDepth<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableDepth);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY2.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY2.cpp
deleted file mode 100644
index b3ac76759fd..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY2.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file StoreTile_TileY2.cpp
-*
-* @brief Functionality for Store.
-*
-******************************************************************************/
-#include "StoreTile.h"
-
-void InitStoreTilesTable_TileY_2()
-{
-    InitStoreTilesTableColor_Half2<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableColor);
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/SurfaceState.h b/src/gallium/drivers/swr/rasterizer/memory/SurfaceState.h
deleted file mode 100644
index 6b1b78eee46..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/SurfaceState.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2019 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file SurfaceState.h
-* 
-* @brief Common definitions for surface state
-* 
-******************************************************************************/
-#pragma once
-
-#include "core/state.h"
-
-//////////////////////////////////////////////////////////////////////////
-/// SWR_SURFACE_STATE
-//////////////////////////////////////////////////////////////////////////
-struct SWR_SURFACE_STATE
-{
-    gfxptr_t         xpBaseAddress;
-    SWR_SURFACE_TYPE type;   // @llvm_enum
-    SWR_FORMAT       format; // @llvm_enum
-    uint32_t         width;
-    uint32_t         height;
-    uint32_t         depth;
-    uint32_t         numSamples;
-    uint32_t         samplePattern;
-    uint32_t         pitch;
-    uint32_t         qpitch;
-    uint32_t minLod; // for sampled surfaces, the most detailed LOD that can be accessed by sampler
-    uint32_t maxLod; // for sampled surfaces, the max LOD that can be accessed
-    float    resourceMinLod; // for sampled surfaces, the most detailed fractional mip that can be
-    // accessed by sampler
-    uint32_t lod;            // for render targets, the lod being rendered to
-    uint32_t arrayIndex; // for render targets, the array index being rendered to for arrayed surfaces
-    SWR_TILE_MODE tileMode; // @llvm_enum
-    uint32_t      halign;
-    uint32_t      valign;
-    uint32_t      xOffset;
-    uint32_t      yOffset;
-
-    uint32_t lodOffsets[2][15]; // lod offsets for sampled surfaces
-
-    gfxptr_t     xpAuxBaseAddress; // Used for compression, append/consume counter, etc.
-    SWR_AUX_MODE auxMode;          // @llvm_enum
-
-
-    bool bInterleavedSamples; // are MSAA samples stored interleaved or planar
-};
-\ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
deleted file mode 100644
index 90143718eb8..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
+++ /dev/null
@@ -1,697 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file TilingFunctions.h
-* 
-* @brief Tiling functions.
-* 
-******************************************************************************/
-#pragma once
-
-#include "core/state.h"
-#include "core/format_traits.h"
-#include "memory/tilingtraits.h"
-#include "memory/SurfaceState.h"
-
-#include <algorithm>
-
-#define MAX_NUM_LOD 15
-
-#define GFX_ALIGN(x, a) (((x) + ((a) - 1)) - (((x) + ((a) - 1)) & ((a) - 1))) // Alt implementation with bitwise not (~) has issue with uint32 align used with 64-bit value, since ~'ed value will remain 32-bit.
-
-//////////////////////////////////////////////////////////////////////////
-/// SimdTile SSE(2x2), AVX(4x2), or AVX-512(4x4?)
-//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT HotTileFormat, SWR_FORMAT SrcOrDstFormat>
-struct SimdTile
-{
-    // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa )
-    float color[FormatTraits<HotTileFormat>::numComps][KNOB_SIMD_WIDTH];
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Retrieve color from simd.
-    /// @param index - linear index to color within simd.
-    /// @param outputColor - output color
-    INLINE void GetSwizzledColor(
-        uint32_t index,
-        float outputColor[4])
-    {
-        // SOA pattern for 2x2 is a subset of 4x2.
-        //   0 1 4 5
-        //   2 3 6 7
-        // The offset converts pattern to linear
-#if (SIMD_TILE_X_DIM == 4)
-        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
-#elif (SIMD_TILE_X_DIM == 2)
-        static const uint32_t offset[] = { 0, 1, 2, 3 };
-#endif
-
-        for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i)
-        {
-            outputColor[i] = this->color[FormatTraits<SrcOrDstFormat>::swizzle(i)][offset[index]];
-        }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Retrieve color from simd.
-    /// @param index - linear index to color within simd.
-    /// @param outputColor - output color
-    INLINE void SetSwizzledColor(
-        uint32_t index,
-        const float src[4])
-    {
-        // SOA pattern for 2x2 is a subset of 4x2.
-        //   0 1 4 5
-        //   2 3 6 7
-        // The offset converts pattern to linear
-#if (SIMD_TILE_X_DIM == 4)
-        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
-#elif (SIMD_TILE_X_DIM == 2)
-        static const uint32_t offset[] = { 0, 1, 2, 3 };
-#endif
-
-        // Only loop over the components needed for destination.
-        for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i)
-        {
-            this->color[i][offset[index]] = src[i];
-        }
-    }
-};
-
-template<>
-struct SimdTile <R8_UINT,R8_UINT>
-{
-    // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa )
-    uint8_t color[FormatTraits<R8_UINT>::numComps][KNOB_SIMD_WIDTH];
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Retrieve color from simd.
-    /// @param index - linear index to color within simd.
-    /// @param outputColor - output color
-    INLINE void GetSwizzledColor(
-        uint32_t index,
-        float outputColor[4])
-    {
-        // SOA pattern for 2x2 is a subset of 4x2.
-        //   0 1 4 5
-        //   2 3 6 7
-        // The offset converts pattern to linear
-#if (SIMD_TILE_X_DIM == 4)
-        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
-#elif (SIMD_TILE_X_DIM == 2)
-        static const uint32_t offset[] = { 0, 1, 2, 3 };
-#endif
-
-        for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
-        {
-            uint32_t src = this->color[FormatTraits<R8_UINT>::swizzle(i)][offset[index]];
-            outputColor[i] = *(float*)&src;
-        }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Retrieve color from simd.
-    /// @param index - linear index to color within simd.
-    /// @param outputColor - output color
-    INLINE void SetSwizzledColor(
-        uint32_t index,
-        const float src[4])
-    {
-        // SOA pattern for 2x2 is a subset of 4x2.
-        //   0 1 4 5
-        //   2 3 6 7
-        // The offset converts pattern to linear
-#if (SIMD_TILE_X_DIM == 4)
-        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
-#elif (SIMD_TILE_X_DIM == 2)
-        static const uint32_t offset[] = { 0, 1, 2, 3 };
-#endif
-
-        // Only loop over the components needed for destination.
-        for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
-        {
-            this->color[i][offset[index]] = *(uint8_t*)&src[i];
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// SimdTile 8x2 for AVX-512
-//////////////////////////////////////////////////////////////////////////
-
-template<SWR_FORMAT HotTileFormat, SWR_FORMAT SrcOrDstFormat>
-struct SimdTile_16
-{
-    // SimdTile is SOA (e.g. rrrrrrrrrrrrrrrr gggggggggggggggg bbbbbbbbbbbbbbbb aaaaaaaaaaaaaaaa )
-    float color[FormatTraits<HotTileFormat>::numComps][KNOB_SIMD16_WIDTH];
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Retrieve color from simd.
-    /// @param index - linear index to color within simd.
-    /// @param outputColor - output color
-    INLINE void GetSwizzledColor(
-        uint32_t index,
-        float outputColor[4])
-    {
-        // SOA pattern for 8x2..
-        //   0 1 4 5 8 9 C D
-        //   2 3 6 7 A B E F
-        // The offset converts pattern to linear
-        static const uint32_t offset[KNOB_SIMD16_WIDTH] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
-
-        for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i)
-        {
-            outputColor[i] = this->color[FormatTraits<SrcOrDstFormat>::swizzle(i)][offset[index]];
-        }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Retrieve color from simd.
-    /// @param index - linear index to color within simd.
-    /// @param outputColor - output color
-    INLINE void SetSwizzledColor(
-        uint32_t index,
-        const float src[4])
-    {
-        // SOA pattern for 8x2..
-        //   0 1 4 5 8 9 C D
-        //   2 3 6 7 A B E F
-        // The offset converts pattern to linear
-        static const uint32_t offset[KNOB_SIMD16_WIDTH] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
-
-        for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i)
-        {
-            this->color[i][offset[index]] = src[i];
-        }
-    }
-};
-
-template<>
-struct SimdTile_16 <R8_UINT, R8_UINT>
-{
-    // SimdTile is SOA (e.g. rrrrrrrrrrrrrrrr gggggggggggggggg bbbbbbbbbbbbbbbb aaaaaaaaaaaaaaaa )
-    uint8_t color[FormatTraits<R8_UINT>::numComps][KNOB_SIMD16_WIDTH];
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Retrieve color from simd.
-    /// @param index - linear index to color within simd.
-    /// @param outputColor - output color
-    INLINE void GetSwizzledColor(
-        uint32_t index,
-        float outputColor[4])
-    {
-        // SOA pattern for 8x2..
-        //   0 1 4 5 8 9 C D
-        //   2 3 6 7 A B E F
-        // The offset converts pattern to linear
-        static const uint32_t offset[KNOB_SIMD16_WIDTH] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
-
-        for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
-        {
-            uint32_t src = this->color[FormatTraits<R8_UINT>::swizzle(i)][offset[index]];
-            outputColor[i] = *(float*)&src;
-        }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Retrieve color from simd.
-    /// @param index - linear index to color within simd.
-    /// @param outputColor - output color
-    INLINE void SetSwizzledColor(
-        uint32_t index,
-        const float src[4])
-    {
-        // SOA pattern for 8x2..
-        //   0 1 4 5 8 9 C D
-        //   2 3 6 7 A B E F
-        // The offset converts pattern to linear
-        static const uint32_t offset[KNOB_SIMD16_WIDTH] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
-
-        for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
-        {
-            this->color[i][offset[index]] = *(uint8_t*)&src[i];
-        }
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Computes lod offset for 1D surface at specified lod.
-/// @param baseWidth - width of basemip (mip 0).
-/// @param hAlign - horizontal alignment per miip, in texels
-/// @param lod - lod index
-/// @param offset - output offset.
-INLINE void ComputeLODOffset1D(
-    const SWR_FORMAT_INFO& info,
-    uint32_t baseWidth,
-    uint32_t hAlign,
-    uint32_t lod,
-    uint32_t &offset)
-{
-    if (lod == 0)
-    {
-        offset = 0;
-    }
-    else
-    {
-        uint32_t curWidth = baseWidth;
-        // @note hAlign is already in blocks for compressed formats so upconvert
-        //       so that we have the desired alignment post-divide.
-        if (info.isBC)
-        {
-            hAlign *= info.bcWidth;
-        }
-
-        offset = GFX_ALIGN(curWidth, hAlign);
-        for (uint32_t l = 1; l < lod; ++l)
-        {
-            curWidth = std::max<uint32_t>(curWidth >> 1, 1U);
-            offset += GFX_ALIGN(curWidth, hAlign);
-        }
-
-        if (info.isSubsampled || info.isBC)
-        {
-            offset /= info.bcWidth;
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Computes x lod offset for 2D surface at specified lod.
-/// @param baseWidth - width of basemip (mip 0).
-/// @param hAlign - horizontal alignment per mip, in texels
-/// @param lod - lod index
-/// @param offset - output offset.
-INLINE void ComputeLODOffsetX(
-    const SWR_FORMAT_INFO& info,
-    uint32_t baseWidth,
-    uint32_t hAlign,
-    uint32_t lod,
-    uint32_t &offset)
-{
-    if (lod < 2)
-    {
-        offset = 0;
-    }
-    else
-    {
-        uint32_t curWidth = baseWidth;
-        // @note hAlign is already in blocks for compressed formats so upconvert
-        //       so that we have the desired alignment post-divide.
-        if (info.isBC)
-        {
-            hAlign *= info.bcWidth;
-        }
-
-        curWidth = std::max<uint32_t>(curWidth >> 1, 1U);
-        curWidth = GFX_ALIGN(curWidth, hAlign);
-
-        if (info.isSubsampled || info.isBC)
-        {
-            curWidth /= info.bcWidth;
-        }
-
-        offset = curWidth;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Computes y lod offset for 2D surface at specified lod.
-/// @param baseWidth - width of basemip (mip 0).
-/// @param vAlign - vertical alignment per mip, in rows
-/// @param lod - lod index
-/// @param offset - output offset.
-INLINE void ComputeLODOffsetY(
-    const SWR_FORMAT_INFO& info,
-    uint32_t baseHeight,
-    uint32_t vAlign,
-    uint32_t lod,
-    uint32_t &offset)
-{
-    if (lod == 0)
-    {
-        offset = 0;
-    }
-    else
-    {
-        offset = 0;
-        uint32_t mipHeight = baseHeight;
-
-        // @note vAlign is already in blocks for compressed formats so upconvert
-        //       so that we have the desired alignment post-divide.
-        if (info.isBC)
-        {
-            vAlign *= info.bcHeight;
-        }
-
-        for (uint32_t l = 1; l <= lod; ++l)
-        {
-            uint32_t alignedMipHeight = GFX_ALIGN(mipHeight, vAlign);
-            offset += ((l != 2) ? alignedMipHeight : 0);
-            mipHeight = std::max<uint32_t>(mipHeight >> 1, 1U);
-        }
-
-        if (info.isBC)
-        {
-            offset /= info.bcHeight;
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Computes 1D surface offset
-/// @param x - offset from start of array slice at given lod.
-/// @param array - array slice index
-/// @param lod - lod index
-/// @param pState - surface state
-/// @param xOffsetBytes - output offset in bytes.
-template<bool UseCachedOffsets>
-INLINE void ComputeSurfaceOffset1D(
-    uint32_t x,
-    uint32_t array,
-    uint32_t lod,
-    const SWR_SURFACE_STATE *pState,
-    uint32_t &xOffsetBytes)
-{
-    const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
-    uint32_t lodOffset;
-
-    if (UseCachedOffsets)
-    {
-        lodOffset = pState->lodOffsets[0][lod];
-    }
-    else
-    {
-        ComputeLODOffset1D(info, pState->width, pState->halign, lod, lodOffset);
-    }
-
-    xOffsetBytes = (array * pState->qpitch + lodOffset + x) * info.Bpp;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Adjusts the array slice for legacy TileY MSAA
-/// @param pState - surface state
-/// @param array - array slice index
-/// @param sampleNum - requested sample
-INLINE void AdjustCoordsForMSAA(const SWR_SURFACE_STATE *pState, uint32_t& x, uint32_t& y, uint32_t& arrayIndex, uint32_t sampleNum)
-{
-    /// @todo: might want to templatize adjusting for sample slices when we support tileYS/tileYF.
-    if((pState->tileMode == SWR_TILE_MODE_YMAJOR ||
-        pState->tileMode == SWR_TILE_MODE_WMAJOR) && 
-       pState->bInterleavedSamples)
-    {
-        uint32_t newX, newY, newSampleX, newSampleY;
-        switch(pState->numSamples)
-        {
-        case 1:
-            newX = x;
-            newY = y;
-            newSampleX = newSampleY = 0;
-            break;
-        case 2:
-        {
-            assert(pState->type == SURFACE_2D);
-            static const uint32_t xMask = 0xFFFFFFFD;
-            static const uint32_t sampleMaskX = 0x1;
-            newX = pdep_u32(x, xMask);
-            newY = y;
-            newSampleX = pext_u32(sampleNum, sampleMaskX);
-            newSampleY = 0;
-        }
-            break;
-        case 4:
-        {
-            assert(pState->type == SURFACE_2D);
-            static const uint32_t mask = 0xFFFFFFFD;
-            static const uint32_t sampleMaskX = 0x1;
-            static const uint32_t sampleMaskY = 0x2;
-            newX = pdep_u32(x, mask);
-            newY = pdep_u32(y, mask);
-            newSampleX = pext_u32(sampleNum, sampleMaskX);
-            newSampleY = pext_u32(sampleNum, sampleMaskY);
-        }
-            break;
-        case 8:
-        {
-            assert(pState->type == SURFACE_2D);
-            static const uint32_t xMask = 0xFFFFFFF9;
-            static const uint32_t yMask = 0xFFFFFFFD;
-            static const uint32_t sampleMaskX = 0x5;
-            static const uint32_t sampleMaskY = 0x2;
-            newX = pdep_u32(x, xMask);
-            newY = pdep_u32(y, yMask);
-            newSampleX = pext_u32(sampleNum, sampleMaskX);
-            newSampleY = pext_u32(sampleNum, sampleMaskY);
-        }
-            break;
-        case 16:
-        {
-            assert(pState->type == SURFACE_2D);
-            static const uint32_t mask = 0xFFFFFFF9;
-            static const uint32_t sampleMaskX = 0x5;
-            static const uint32_t sampleMaskY = 0xA;
-            newX = pdep_u32(x, mask);
-            newY = pdep_u32(y, mask);
-            newSampleX = pext_u32(sampleNum, sampleMaskX);
-            newSampleY = pext_u32(sampleNum, sampleMaskY);
-        }
-            break;
-        default:
-            assert(0 && "Unsupported sample count");
-            newX = newY = 0;
-            newSampleX = newSampleY = 0;
-            break;
-        }
-        x = newX | (newSampleX << 1);
-        y = newY | (newSampleY << 1);
-    }
-    else if(pState->tileMode == SWR_TILE_MODE_YMAJOR ||
-            pState->tileMode == SWR_TILE_NONE)
-    {
-        uint32_t sampleShift;
-        switch(pState->numSamples)
-        {
-        case 1:
-            assert(sampleNum == 0);
-            sampleShift = 0;
-            break;
-        case 2:
-            assert(pState->type == SURFACE_2D);
-            sampleShift = 1;
-            break;
-        case 4:
-            assert(pState->type == SURFACE_2D);
-            sampleShift = 2;
-            break;
-        case 8:
-            assert(pState->type == SURFACE_2D);
-            sampleShift = 3;
-            break;
-        case 16:
-            assert(pState->type == SURFACE_2D);
-            sampleShift = 4;
-            break;
-        default:
-            assert(0 && "Unsupported sample count");
-            sampleShift = 0;
-            break;
-        }
-        arrayIndex = (arrayIndex << sampleShift) | sampleNum;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Computes 2D surface offset
-/// @param x - horizontal offset from start of array slice and lod.
-/// @param y - vertical offset from start of array slice and lod.
-/// @param array - array slice index
-/// @param lod - lod index
-/// @param pState - surface state
-/// @param xOffsetBytes - output x offset in bytes.
-/// @param yOffsetRows - output y offset in bytes.
-template<bool UseCachedOffsets>
-INLINE void ComputeSurfaceOffset2D(uint32_t x, uint32_t y, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows)
-{
-    const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
-    uint32_t lodOffsetX, lodOffsetY;
-
-    if (UseCachedOffsets)
-    {
-        lodOffsetX = pState->lodOffsets[0][lod];
-        lodOffsetY = pState->lodOffsets[1][lod];
-    }
-    else
-    {
-        ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX);
-        ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY);
-    }
-
-    AdjustCoordsForMSAA(pState, x, y, array, sampleNum);
-    xOffsetBytes = (x + lodOffsetX + pState->xOffset) * info.Bpp;
-    yOffsetRows = (array * pState->qpitch) + lodOffsetY + y + pState->yOffset;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Computes 3D surface offset
-/// @param x - horizontal offset from start of array slice and lod.
-/// @param y - vertical offset from start of array slice and lod.
-/// @param z - depth offset from start of array slice and lod.
-/// @param lod - lod index
-/// @param pState - surface state
-/// @param xOffsetBytes - output x offset in bytes.
-/// @param yOffsetRows - output y offset in rows.
-/// @param zOffsetSlices - output y offset in slices.
-template<bool UseCachedOffsets>
-INLINE void ComputeSurfaceOffset3D(uint32_t x, uint32_t y, uint32_t z, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows, uint32_t &zOffsetSlices)
-{
-    const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
-    uint32_t lodOffsetX, lodOffsetY;
-
-    if (UseCachedOffsets)
-    {
-        lodOffsetX = pState->lodOffsets[0][lod];
-        lodOffsetY = pState->lodOffsets[1][lod];
-    }
-    else
-    {
-        ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX);
-        ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY);
-    }
-
-    xOffsetBytes = (x + lodOffsetX) * info.Bpp;
-    yOffsetRows = lodOffsetY + y;
-    zOffsetSlices = z;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
-///        and returns final surface address
-/// @param xOffsetBytes - x offset from base of surface in bytes
-/// @param yOffsetRows - y offset from base of surface in rows
-/// @param pState - pointer to the surface state
-template<typename TTraits>
-INLINE uint32_t ComputeTileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState)
-{
-    return ComputeOffset2D<TTraits>(pState->pitch, xOffsetBytes, yOffsetRows);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
-///        and returns final surface address
-/// @param xOffsetBytes - x offset from base of surface in bytes
-/// @param yOffsetRows - y offset from base of surface in rows
-/// @param pState - pointer to the surface state
-template<typename TTraits>
-INLINE uint32_t ComputeTileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState)
-{
-    return ComputeOffset3D<TTraits>(pState->qpitch, pState->pitch, xOffsetBytes, yOffsetRows, zOffsetSlices);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
-///        and returns final surface address
-/// @param xOffsetBytes - x offset from base of surface in bytes
-/// @param yOffsetRows - y offset from base of surface in rows
-/// @param pState - pointer to the surface state
-INLINE
-uint32_t TileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState)
-{
-    switch (pState->tileMode)
-    {
-    case SWR_TILE_NONE: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_NONE, 32> >(xOffsetBytes, yOffsetRows, pState);
-    case SWR_TILE_SWRZ: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_SWRZ, 32> >(xOffsetBytes, yOffsetRows, pState);
-    case SWR_TILE_MODE_XMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_XMAJOR, 8> >(xOffsetBytes, yOffsetRows, pState);
-    case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(xOffsetBytes, yOffsetRows, pState);
-    case SWR_TILE_MODE_WMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_WMAJOR, 8> >(xOffsetBytes, yOffsetRows, pState);
-    default: SWR_INVALID("Unsupported tiling mode");
-    }
-    return 0;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode
-///        and returns final surface address
-/// @param xOffsetBytes - x offset from base of surface in bytes
-/// @param yOffsetRows - y offset from base of surface in rows
-/// @param zOffsetSlices - z offset from base of surface in slices
-/// @param pState - pointer to the surface state
-INLINE
-uint32_t TileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState)
-{
-    switch (pState->tileMode)
-    {
-    case SWR_TILE_NONE: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_NONE, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
-    case SWR_TILE_SWRZ: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_SWRZ, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
-    case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
-    default: SWR_INVALID("Unsupported tiling mode");
-    }
-    return 0;
-}
-
-template<bool UseCachedOffsets>
-INLINE
-uint32_t ComputeSurfaceOffset(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState)
-{
-    uint32_t offsetX = 0, offsetY = 0, offsetZ = 0;
-    switch (pState->type)
-    {
-    case SURFACE_BUFFER:
-    case SURFACE_STRUCTURED_BUFFER:
-        offsetX = x * pState->pitch;
-        return offsetX;
-        break;
-    case SURFACE_1D:
-        ComputeSurfaceOffset1D<UseCachedOffsets>(x, array, lod, pState, offsetX);
-        return TileSwizzle2D(offsetX, 0, pState);
-        break;
-    case SURFACE_2D:
-        ComputeSurfaceOffset2D<UseCachedOffsets>(x, y, array, sampleNum, lod, pState, offsetX, offsetY);
-        return TileSwizzle2D(offsetX, offsetY, pState);
-    case SURFACE_3D:
-        ComputeSurfaceOffset3D<UseCachedOffsets>(x, y, z, lod, pState, offsetX, offsetY, offsetZ);
-        return TileSwizzle3D(offsetX, offsetY, offsetZ, pState);
-        break;
-    case SURFACE_CUBE:
-        ComputeSurfaceOffset2D<UseCachedOffsets>(x, y, array, sampleNum, lod, pState, offsetX, offsetY);
-        return TileSwizzle2D(offsetX, offsetY, pState);
-        break;
-    default: SWR_INVALID("Unsupported format");
-    }
-
-    return 0;
-}
-
-typedef void*(*PFN_COMPUTESURFADDR)(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, const SWR_SURFACE_STATE*);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Computes surface address at the given location and lod
-/// @param x - x location in pixels
-/// @param y - y location in rows
-/// @param z - z location for 3D surfaces
-/// @param array - array slice for 1D and 2D surfaces
-/// @param lod - level of detail
-/// @param pState - pointer to the surface state
-template<bool UseCachedOffsets, bool IsRead>
-INLINE
-void* ComputeSurfaceAddress(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState)
-{
-    return (void*)(pState->xpBaseAddress + ComputeSurfaceOffset<UseCachedOffsets>(x, y, z, array, sampleNum, lod, pState));
-}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
deleted file mode 100644
index c2a87d85dd1..00000000000
--- a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
+++ /dev/null
@@ -1,207 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file tilingtraits.h
-* 
-* @brief Tiling traits.
-* 
-******************************************************************************/
-#pragma once
-
-#include "core/state.h"
-#include "common/intrin.h"
-
-template<SWR_TILE_MODE mode, int>
-struct TilingTraits
-{
-    static const SWR_TILE_MODE TileMode{ mode };
-    static UINT GetCu() { SWR_NOT_IMPL; return 0; }
-    static UINT GetCv() { SWR_NOT_IMPL; return 0; }
-    static UINT GetCr() { SWR_NOT_IMPL; return 0; }
-    static UINT GetTileIDShift() { SWR_NOT_IMPL; return 0; }
-
-    /// @todo correct pdep shifts for all rastertile dims.  Unused for now
-    static UINT GetPdepX() { SWR_NOT_IMPL; return 0x37; }
-    static UINT GetPdepY() { SWR_NOT_IMPL; return 0xC8; }
-};
-
-template<int X> struct TilingTraits <SWR_TILE_NONE, X>
-{
-    static const SWR_TILE_MODE TileMode{ SWR_TILE_NONE };
-    static UINT GetCu() { return 0; }
-    static UINT GetCv() { return 0; }
-    static UINT GetCr() { return 0; }
-    static UINT GetTileIDShift() { return 0; }
-    static UINT GetPdepX() { return 0x00; }
-    static UINT GetPdepY() { return 0x00; }
-};
-
-template<> struct TilingTraits <SWR_TILE_SWRZ, 8>
-{
-    static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ };
-    static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT; }
-    static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; }
-    static UINT GetCr() { return 0; }
-    static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT; }
-
-    /// @todo correct pdep shifts for all rastertile dims.  Unused for now
-    static UINT GetPdepX() { SWR_NOT_IMPL; return 0x00; }
-    static UINT GetPdepY() { SWR_NOT_IMPL; return 0x00; }
-};
-
-template<> struct TilingTraits <SWR_TILE_SWRZ, 32>
-{
-    static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ };
-    static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 2; }
-    static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; }
-    static UINT GetCr() { return 0; }
-    static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 2; }
-
-    static UINT GetPdepX() { return 0x37; }
-    static UINT GetPdepY() { return 0xC8; }
-};
-
-template<> struct TilingTraits <SWR_TILE_SWRZ, 128>
-{
-    static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ };
-    static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 4; }
-    static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; }
-    static UINT GetCr() { return 0; }
-    static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 4; }
-
-    /// @todo correct pdep shifts for all rastertile dims.  Unused for now
-    static UINT GetPdepX() { SWR_NOT_IMPL; return 0x37; }
-    static UINT GetPdepY() { SWR_NOT_IMPL; return 0xC8; }
-};
-
-// y-major tiling layout unaffected by element size
-template<int X> struct TilingTraits <SWR_TILE_MODE_YMAJOR, X>
-{
-    static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_YMAJOR };
-    static UINT GetCu() { return 7; }
-    static UINT GetCv() { return 5; }
-    static UINT GetCr() { return 0; }
-    static UINT GetTileIDShift() { return 12; }
-
-    static UINT GetPdepX() { return 0xe0f; }
-    static UINT GetPdepY() { return 0x1f0; }
-};
-
-// x-major tiling layout unaffected by element size
-template<int X> struct TilingTraits <SWR_TILE_MODE_XMAJOR, X>
-{
-    static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_XMAJOR };
-    static UINT GetCu() { return 9; }
-    static UINT GetCv() { return 3; }
-    static UINT GetCr() { return 0; }
-    static UINT GetTileIDShift() { return 12; }
-
-    static UINT GetPdepX() { return 0x1ff; }
-    static UINT GetPdepY() { return 0xe00; }
-};
-
-template<int X> struct TilingTraits <SWR_TILE_MODE_WMAJOR, X>
-{
-    static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_WMAJOR };
-    static UINT GetCu() { return 6; }
-    static UINT GetCv() { return 6; }
-    static UINT GetCr() { return 0; }
-    static UINT GetTileIDShift() { return 12; }
-
-    static UINT GetPdepX() { return 0xe15; }
-    static UINT GetPdepY() { return 0x1ea; }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Computes the tileID for 2D tiled surfaces
-/// @param pitch - surface pitch in bytes
-/// @param tileX - x offset in tiles
-/// @param tileY - y offset in tiles
-template<typename TTraits>
-INLINE UINT ComputeTileOffset2D(UINT pitch, UINT tileX, UINT tileY)
-{
-    UINT tileID = tileY * (pitch >> TTraits::GetCu()) + tileX;
-    return tileID << TTraits::GetTileIDShift();
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Computes the tileID for 3D tiled surfaces
-/// @param qpitch - surface qpitch in rows
-/// @param pitch - surface pitch in bytes
-/// @param tileX - x offset in tiles
-/// @param tileY - y offset in tiles
-/// @param tileZ - y offset in tiles
-template<typename TTraits>
-INLINE UINT ComputeTileOffset3D(UINT qpitch, UINT pitch, UINT tileX, UINT tileY, UINT tileZ)
-{
-    UINT tileID = (tileZ * (qpitch >> TTraits::GetCv()) + tileY) * (pitch >> TTraits::GetCu()) + tileX;
-    return tileID << TTraits::GetTileIDShift();
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Computes the byte offset for 2D tiled surfaces
-/// @param pitch - surface pitch in bytes
-/// @param x - x offset in bytes
-/// @param y - y offset in rows
-template<typename TTraits>
-INLINE UINT ComputeOffset2D(UINT pitch, UINT x, UINT y)
-{
-    UINT tileID = ComputeTileOffset2D<TTraits>(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv());
-    UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX());
-    UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY());
-    return (tileID | xSwizzle | ySwizzle);
-}
-
-#if KNOB_ARCH <= KNOB_ARCH_AVX
-//////////////////////////////////////////////////////////////////////////
-/// @brief Computes the byte offset for 2D tiled surfaces. Specialization
-///        for tile-y surfaces that uses bit twiddling instead of pdep emulation.
-/// @param pitch - surface pitch in bytes
-/// @param x - x offset in bytes
-/// @param y - y offset in rows
-template<>
-INLINE UINT ComputeOffset2D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(UINT pitch, UINT x, UINT y)
-{
-    typedef TilingTraits<SWR_TILE_MODE_YMAJOR, 32> TTraits;
-
-    UINT tileID = ComputeTileOffset2D<TTraits>(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv());
-    UINT xSwizzle = ((x << 5) & 0xe00) | (x & 0xf);
-    UINT ySwizzle = (y << 4) & 0x1f0;
-    return (tileID | xSwizzle | ySwizzle);
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Computes the byte offset for 3D tiled surfaces
-/// @param qpitch - depth pitch in rows
-/// @param pitch - surface pitch in bytes
-/// @param x - x offset in bytes
-/// @param y - y offset in rows
-/// @param z - y offset in slices
-template<typename TTraits>
-INLINE UINT ComputeOffset3D(UINT qpitch, UINT pitch, UINT x, UINT y, UINT z)
-{
-    UINT tileID = ComputeTileOffset3D<TTraits>(qpitch, pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv(), z >> TTraits::GetCr());
-    UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX());
-    UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY());
-    return (tileID | xSwizzle | ySwizzle);
-}
diff --git a/src/gallium/drivers/swr/swr_clear.cpp b/src/gallium/drivers/swr/swr_clear.cpp
deleted file mode 100644
index d579cbdde9f..00000000000
--- a/src/gallium/drivers/swr/swr_clear.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#include "swr_context.h"
-#include "swr_query.h"
-
-/*
- * Clear hook installed as pipe_context::clear by swr_clear_init().
- *
- * Clears the color/depth/stencil attachments selected by `buffers` on the
- * currently bound framebuffer, one layer per core clear call.  The
- * scissor_state argument is not consulted: the full surface is cleared.
- */
-static void
-swr_clear(struct pipe_context *pipe,
-          unsigned buffers,
-          const struct pipe_scissor_state *scissor_state,
-          const union pipe_color_union *color,
-          double depth,
-          unsigned stencil)
-{
-   if (!swr_check_render_cond(pipe))
-      return;
-
-   swr_update_derived(pipe);
-
-   struct swr_context *ctx = swr_context(pipe);
-   struct pipe_framebuffer_state *fb = &ctx->framebuffer;
-
-   UINT attachments = 0;    /* SWR_ATTACHMENT_*_BIT mask still to clear */
-   unsigned num_layers = 0; /* largest layer count among selected attachments */
-
-   /* Color attachments: one PIPE_CLEAR_COLORn bit per bound color buffer. */
-   if ((buffers & PIPE_CLEAR_COLOR) && fb->nr_cbufs) {
-      for (unsigned rt = 0; rt < fb->nr_cbufs; ++rt) {
-         const struct pipe_surface *cb = fb->cbufs[rt];
-         if (!cb || !(buffers & (PIPE_CLEAR_COLOR0 << rt)))
-            continue;
-
-         attachments |= (SWR_ATTACHMENT_COLOR0_BIT << rt);
-         num_layers = std::max(num_layers, cb->u.tex.last_layer -
-                                           cb->u.tex.first_layer + 1u);
-      }
-   }
-
-   /* Depth and stencil share the zsbuf surface and therefore its layers. */
-   if (fb->zsbuf && (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
-      if (buffers & PIPE_CLEAR_DEPTH)
-         attachments |= SWR_ATTACHMENT_DEPTH_BIT;
-      if (buffers & PIPE_CLEAR_STENCIL)
-         attachments |= SWR_ATTACHMENT_STENCIL_BIT;
-
-      num_layers = std::max(num_layers, fb->zsbuf->u.tex.last_layer -
-                                        fb->zsbuf->u.tex.first_layer + 1u);
-   }
-
-#if 0 // XXX HACK, override clear color alpha. On ubuntu, clears are
-      // transparent.
-   ((union pipe_color_union *)color)->f[3] = 1.0; /* cast off your const'd-ness */
-#endif
-
-   /*
-    * Always clear full surface. When GL_SCISSOR_TEST is enabled
-    * glClear is handled by state tracker and there is no need to do this here
-    */
-   SWR_RECT clear_rect = {0, 0, (int32_t)fb->width, (int32_t)fb->height};
-
-   for (unsigned layer = 0; layer < num_layers; ++layer) {
-      swr_update_draw_context(ctx);
-      ctx->api.pfnSwrClearRenderTarget(ctx->swrContext, attachments, layer,
-                                       color->f, depth, stencil,
-                                       clear_rect);
-
-      /* Before the next iteration, drop every attachment whose layer range
-       * does not extend past the layer that was just cleared. */
-      const struct pipe_surface *zs = fb->zsbuf;
-      if (zs && (zs->u.tex.last_layer <= zs->u.tex.first_layer + layer))
-         attachments &= ~(SWR_ATTACHMENT_DEPTH_BIT | SWR_ATTACHMENT_STENCIL_BIT);
-
-      for (unsigned rt = 0; rt < fb->nr_cbufs; ++rt) {
-         const struct pipe_surface *cb = fb->cbufs[rt];
-         if (cb && (cb->u.tex.last_layer <= cb->u.tex.first_layer + layer))
-            attachments &= ~(SWR_ATTACHMENT_COLOR0_BIT << rt);
-      }
-   }
-}
-
-/* Installs swr_clear as the context's clear hook. */
-void
-swr_clear_init(struct pipe_context *pipe)
-{
-   pipe->clear = swr_clear;
-}
diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp
deleted file mode 100644
index 08637dba1d5..00000000000
--- a/src/gallium/drivers/swr/swr_context.cpp
+++ /dev/null
@@ -1,595 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#include "swr_context.h"
-#include "swr_memory.h"
-#include "swr_screen.h"
-#include "swr_resource.h"
-#include "swr_scratch.h"
-#include "swr_query.h"
-#include "swr_fence.h"
-
-#include "util/u_memory.h"
-#include "util/u_inlines.h"
-#include "util/format/u_format.h"
-#include "util/u_atomic.h"
-#include "util/u_upload_mgr.h"
-#include "util/u_transfer.h"
-#include "util/u_surface.h"
-
-#include "api.h"
-#include "backend.h"
-#include "knobs.h"
-
-/*
- * Allocates a pipe_surface that views resource `pt` as described by
- * `surf_tmpl`.  The new surface starts with a reference count of 1 and
- * takes its own reference on `pt`.  Returns NULL if allocation fails.
- */
-static struct pipe_surface *
-swr_create_surface(struct pipe_context *pipe,
-                   struct pipe_resource *pt,
-                   const struct pipe_surface *surf_tmpl)
-{
-   struct pipe_surface *surf = CALLOC_STRUCT(pipe_surface);
-   if (!surf)
-      return NULL;
-
-   pipe_reference_init(&surf->reference, 1);
-   pipe_resource_reference(&surf->texture, pt);
-   surf->context = pipe;
-   surf->format = surf_tmpl->format;
-
-   if (pt->target == PIPE_BUFFER) {
-      /* Buffer view: the width is the number of elements in the range,
-       * which should give the correct renderbuffer width. */
-      surf->width = surf_tmpl->u.buf.last_element
-         - surf_tmpl->u.buf.first_element + 1;
-      surf->height = pt->height0;
-      surf->u.buf.first_element = surf_tmpl->u.buf.first_element;
-      surf->u.buf.last_element = surf_tmpl->u.buf.last_element;
-      assert(surf->u.buf.first_element <= surf->u.buf.last_element);
-      /* NOTE(review): width == last - first + 1, so this can only hold
-       * when first_element is 0 -- confirm the intended bound. */
-      assert(surf->u.buf.last_element < surf->width);
-   } else {
-      /* Texture view: dimensions are those of the selected mip level. */
-      assert(surf_tmpl->u.tex.level <= pt->last_level);
-      surf->width = u_minify(pt->width0, surf_tmpl->u.tex.level);
-      surf->height = u_minify(pt->height0, surf_tmpl->u.tex.level);
-      surf->u.tex.level = surf_tmpl->u.tex.level;
-      surf->u.tex.first_layer = surf_tmpl->u.tex.first_layer;
-      surf->u.tex.last_layer = surf_tmpl->u.tex.last_layer;
-   }
-
-   return surf;
-}
-
-/*
- * Destroys a surface created by swr_create_surface(): stores any dirty
- * tiles of the backing resource, drops the surface's reference on that
- * resource and frees the surface itself.
- */
-static void
-swr_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surf)
-{
-   assert(surf->texture);
-
-   /* If the resource has been drawn to, store tiles. */
-   swr_store_dirty_resource(pipe, surf->texture, SWR_TILE_RESOLVED);
-
-   /* Release the reference taken at creation time. */
-   pipe_resource_reference(&surf->texture, NULL);
-   FREE(surf);
-}
-
-
-/*
- * Maps a region of `resource` for CPU access (installed as both the
- * buffer_map and texture_map hooks).
- *
- * pipe      - context performing the map
- * resource  - resource to map; must not be NULL
- * level     - mip level, at most resource->last_level
- * usage     - PIPE_MAP_* flags
- * box       - region to map
- * transfer  - receives the newly allocated pipe_transfer on success
- *
- * Returns a pointer to the first byte of the mapped region, or NULL when
- * PIPE_MAP_DONTBLOCK is set and the flush fence is still pending, or when
- * the transfer object cannot be allocated.
- */
-static void *
-swr_transfer_map(struct pipe_context *pipe,
-                 struct pipe_resource *resource,
-                 unsigned level,
-                 unsigned usage,
-                 const struct pipe_box *box,
-                 struct pipe_transfer **transfer)
-{
-   /* Validate the arguments before anything dereferences them. */
-   assert(resource);
-   assert(level <= resource->last_level);
-
-   struct swr_screen *screen = swr_screen(pipe->screen);
-   struct swr_resource *spr = swr_resource(resource);
-   struct pipe_transfer *pt;
-   enum pipe_format format = resource->format;
-
-   /* If mapping an attached rendertarget, store tiles to surface and set
-    * postStoreTileState to SWR_TILE_INVALID so tiles get reloaded on next use
-    * and nothing needs to be done at unmap. */
-   swr_store_dirty_resource(pipe, resource, SWR_TILE_INVALID);
-
-   if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) {
-      /* If resource is in use, finish fence before mapping.
-       * Unless requested not to block, then if not done return NULL map */
-      if (usage & PIPE_MAP_DONTBLOCK) {
-         if (swr_is_fence_pending(screen->flush_fence))
-            return NULL;
-      } else {
-         if (spr->status) {
-            /* But, if there's no fence pending, submit one.
-             * XXX: Remove once draw timestamps are finished. */
-            if (!swr_is_fence_pending(screen->flush_fence))
-               swr_fence_submit(swr_context(pipe), screen->flush_fence);
-
-            swr_fence_finish(pipe->screen, NULL, screen->flush_fence, 0);
-            swr_resource_unused(resource);
-         }
-      }
-   }
-
-   pt = CALLOC_STRUCT(pipe_transfer);
-   if (!pt)
-      return NULL;
-   pipe_resource_reference(&pt->resource, resource);
-   pt->usage = (pipe_map_flags)usage;
-   pt->level = level;
-   pt->box = *box;
-   pt->stride = spr->swr.pitch;
-   pt->layer_stride = spr->swr.qpitch * spr->swr.pitch;
-
-   /* if we're mapping the depth/stencil, copy in stencil for the section
-    * being read in
-    */
-   if ((usage & PIPE_MAP_READ) && spr->has_depth && spr->has_stencil) {
-      /* The format does not change per row, so pick the packed pixel size
-       * and the byte that carries stencil once.  A zero pixel size means
-       * the format is not one of the two handled here. */
-      int zs_pixel_bytes = 0;
-      int stencil_byte = 0;
-      if (spr->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
-         zs_pixel_bytes = 4;
-         stencil_byte = 3;
-      } else if (spr->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
-         zs_pixel_bytes = 8;
-         stencil_byte = 4;
-      }
-
-      if (zs_pixel_bytes) {
-         uint8_t *zs_bytes = (uint8_t*)(spr->swr.xpBaseAddress);
-         uint8_t *s_bytes = (uint8_t*)(spr->secondary.xpBaseAddress);
-
-         for (int z = box->z; z < box->z + box->depth; z++) {
-            size_t zbase = (z * spr->swr.qpitch + box->y) * spr->swr.pitch +
-               spr->mip_offsets[level];
-            size_t sbase =
-               (z * spr->secondary.qpitch + box->y) * spr->secondary.pitch +
-               spr->secondary_mip_offsets[level];
-
-            for (int y = box->y; y < box->y + box->height; y++) {
-               for (int x = box->x; x < box->x + box->width; x++)
-                  zs_bytes[zbase + zs_pixel_bytes * x + stencil_byte] =
-                     s_bytes[sbase + x];
-
-               zbase += spr->swr.pitch;
-               sbase += spr->secondary.pitch;
-            }
-         }
-      }
-   }
-
-   unsigned offset = box->z * pt->layer_stride +
-      util_format_get_nblocksy(format, box->y) * pt->stride +
-      util_format_get_stride(format, box->x);
-
-   *transfer = pt;
-
-   return (void*)(spr->swr.xpBaseAddress + offset + spr->mip_offsets[level]);
-}
-
-/*
- * transfer_flush_region hook.  For resources that have both depth and
- * stencil, copies the stencil byte of every pixel in `flush_box` (given
- * relative to the transfer's box) from the primary surface into the
- * secondary surface.  Other resources need no work.
- */
-static void
-swr_transfer_flush_region(struct pipe_context *pipe,
-                          struct pipe_transfer *transfer,
-                          const struct pipe_box *flush_box)
-{
-   assert(transfer->resource);
-   assert(transfer->usage & PIPE_MAP_WRITE);
-
-   struct swr_resource *spr = swr_resource(transfer->resource);
-   if (!(spr->has_depth && spr->has_stencil))
-      return;
-
-   /* Packed pixel size and the byte within it that carries stencil; both
-    * stay zero for formats this function does not handle. */
-   int zs_pixel_bytes = 0;
-   int stencil_byte = 0;
-   if (spr->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
-      zs_pixel_bytes = 4;
-      stencil_byte = 3;
-   } else if (spr->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
-      zs_pixel_bytes = 8;
-      stencil_byte = 4;
-   }
-   if (!zs_pixel_bytes)
-      return;
-
-   /* Translate the flush box into resource coordinates. */
-   struct pipe_box region = *flush_box;
-   region.x += transfer->box.x;
-   region.y += transfer->box.y;
-   region.z += transfer->box.z;
-
-   uint8_t *zs_bytes = (uint8_t*)(spr->swr.xpBaseAddress);
-   uint8_t *s_bytes = (uint8_t*)(spr->secondary.xpBaseAddress);
-
-   for (int z = region.z; z < region.z + region.depth; z++) {
-      size_t zrow = (z * spr->swr.qpitch + region.y) * spr->swr.pitch +
-         spr->mip_offsets[transfer->level];
-      size_t srow =
-         (z * spr->secondary.qpitch + region.y) * spr->secondary.pitch +
-         spr->secondary_mip_offsets[transfer->level];
-
-      for (int y = region.y; y < region.y + region.height; y++) {
-         for (int x = region.x; x < region.x + region.width; x++)
-            s_bytes[srow + x] =
-               zs_bytes[zrow + zs_pixel_bytes * x + stencil_byte];
-
-         zrow += spr->swr.pitch;
-         srow += spr->secondary.pitch;
-      }
-   }
-}
-
-/*
- * Unmap hook (used for both buffers and textures).  For a written
- * depth+stencil mapping that was not flagged PIPE_MAP_FLUSH_EXPLICIT, the
- * whole mapped box is flushed first; then the transfer's resource
- * reference is dropped and the transfer is freed.
- */
-static void
-swr_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *transfer)
-{
-   assert(transfer->resource);
-
-   struct swr_resource *spr = swr_resource(transfer->resource);
-
-   const bool written = transfer->usage & PIPE_MAP_WRITE;
-   const bool explicit_flush = transfer->usage & PIPE_MAP_FLUSH_EXPLICIT;
-
-   /* if we're mapping the depth/stencil, copy in stencil for the section
-    * being written out
-    */
-   if (written && !explicit_flush && spr->has_depth && spr->has_stencil) {
-      struct pipe_box whole;
-      u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height,
-               transfer->box.depth, &whole);
-      swr_transfer_flush_region(pipe, transfer, &whole);
-   }
-
-   pipe_resource_reference(&transfer->resource, NULL);
-   FREE(transfer);
-}
-
-
-/*
- * resource_copy_region hook.  Stores dirty tiles of both resources, waits
- * for the screen's flush fence, then copies through
- * util_resource_copy_region() when source and destination are both
- * buffers or both non-buffers.  A buffer/non-buffer mix is not handled and
- * only produces a debug message.
- */
-static void
-swr_resource_copy(struct pipe_context *pipe,
-                  struct pipe_resource *dst,
-                  unsigned dst_level,
-                  unsigned dstx,
-                  unsigned dsty,
-                  unsigned dstz,
-                  struct pipe_resource *src,
-                  unsigned src_level,
-                  const struct pipe_box *src_box)
-{
-   struct swr_screen *screen = swr_screen(pipe->screen);
-
-   /* If either the src or dst is a renderTarget, store tiles before copy */
-   swr_store_dirty_resource(pipe, src, SWR_TILE_RESOLVED);
-   swr_store_dirty_resource(pipe, dst, SWR_TILE_RESOLVED);
-
-   swr_fence_finish(pipe->screen, NULL, screen->flush_fence, 0);
-   swr_resource_unused(src);
-   swr_resource_unused(dst);
-
-   const bool dst_is_buffer = (dst->target == PIPE_BUFFER);
-   const bool src_is_buffer = (src->target == PIPE_BUFFER);
-
-   if (dst_is_buffer != src_is_buffer) {
-      debug_printf("unhandled swr_resource_copy\n");
-      return;
-   }
-
-   util_resource_copy_region(
-      pipe, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box);
-}
-
-
-/*
- * blit hook.  Handles, in order:
- *   - the render condition, when the blit asks for it;
- *   - multisample color resolve, by storing the source's tiles and
- *     switching the blit source to its resolve target;
- *   - blits that can be done as a plain region copy;
- *   - everything else through util_blitter, after saving the context state
- *     the blitter overwrites.  Stencil is dropped from the mask first.
- */
-static void
-swr_blit(struct pipe_context *pipe, const struct pipe_blit_info *blit_info)
-{
-   struct swr_context *ctx = swr_context(pipe);
-   /* Work on a private copy: the caller's blit_info is const. */
-   struct pipe_blit_info info = *blit_info;
-
-   if (info.render_condition_enable && !swr_check_render_cond(pipe))
-      return;
-
-   const bool color_resolve =
-      info.src.resource->nr_samples > 1 && info.dst.resource->nr_samples <= 1
-      && !util_format_is_depth_or_stencil(info.src.resource->format)
-      && !util_format_is_pure_integer(info.src.resource->format);
-
-   if (color_resolve) {
-      debug_printf("swr_blit: color resolve : %d -> %d\n",
-            info.src.resource->nr_samples, info.dst.resource->nr_samples);
-
-      /* Resolve is done as part of the surface store. */
-      swr_store_dirty_resource(pipe, info.src.resource, SWR_TILE_RESOLVED);
-
-      /* The resolve target becomes the new source for the blit. */
-      info.src.resource = swr_resource(info.src.resource)->resolve_target;
-   }
-
-   if (util_try_blit_via_copy_region(pipe, &info, ctx->render_cond_query != NULL))
-      return; /* done */
-
-   if (info.mask & PIPE_MASK_S) {
-      debug_printf("swr: cannot blit stencil, skipping\n");
-      info.mask &= ~PIPE_MASK_S;
-   }
-
-   if (!util_blitter_is_blit_supported(ctx->blitter, &info)) {
-      debug_printf("swr: blit unsupported %s -> %s\n",
-                   util_format_short_name(info.src.resource->format),
-                   util_format_short_name(info.dst.resource->format));
-      return;
-   }
-
-   /* Statistics are switched off around the blitter's draw and switched
-    * back on afterwards, but only while queries are active. */
-   const bool pause_stats = ctx->active_queries;
-   if (pause_stats) {
-      ctx->api.pfnSwrEnableStatsFE(ctx->swrContext, FALSE);
-      ctx->api.pfnSwrEnableStatsBE(ctx->swrContext, FALSE);
-   }
-
-   struct blitter_context *blitter = ctx->blitter;
-
-   /* Vertex fetch and shader stages */
-   util_blitter_save_vertex_buffer_slot(blitter, ctx->vertex_buffer);
-   util_blitter_save_vertex_elements(blitter, (void *)ctx->velems);
-   util_blitter_save_vertex_shader(blitter, (void *)ctx->vs);
-   util_blitter_save_geometry_shader(blitter, (void*)ctx->gs);
-   util_blitter_save_tessctrl_shader(blitter, (void*)ctx->tcs);
-   util_blitter_save_tesseval_shader(blitter, (void*)ctx->tes);
-   util_blitter_save_so_targets(
-      blitter,
-      ctx->num_so_targets,
-      (struct pipe_stream_output_target **)ctx->so_targets);
-
-   /* Rasterizer and fragment state */
-   util_blitter_save_rasterizer(blitter, (void *)ctx->rasterizer);
-   util_blitter_save_viewport(blitter, &ctx->viewports[0]);
-   util_blitter_save_scissor(blitter, &ctx->scissors[0]);
-   util_blitter_save_fragment_shader(blitter, ctx->fs);
-   util_blitter_save_blend(blitter, (void *)ctx->blend);
-   util_blitter_save_depth_stencil_alpha(blitter,
-                                         (void *)ctx->depth_stencil);
-   util_blitter_save_stencil_ref(blitter, &ctx->stencil_ref);
-   util_blitter_save_sample_mask(blitter, ctx->sample_mask, 0);
-   util_blitter_save_framebuffer(blitter, &ctx->framebuffer);
-
-   /* Fragment samplers and views */
-   util_blitter_save_fragment_sampler_states(
-      blitter,
-      ctx->num_samplers[PIPE_SHADER_FRAGMENT],
-      (void **)ctx->samplers[PIPE_SHADER_FRAGMENT]);
-   util_blitter_save_fragment_sampler_views(
-      blitter,
-      ctx->num_sampler_views[PIPE_SHADER_FRAGMENT],
-      ctx->sampler_views[PIPE_SHADER_FRAGMENT]);
-
-   /* Conditional rendering */
-   util_blitter_save_render_condition(blitter,
-                                      ctx->render_cond_query,
-                                      ctx->render_cond_cond,
-                                      ctx->render_cond_mode);
-
-   util_blitter_blit(blitter, &info);
-
-   if (pause_stats) {
-      ctx->api.pfnSwrEnableStatsFE(ctx->swrContext, TRUE);
-      ctx->api.pfnSwrEnableStatsBE(ctx->swrContext, TRUE);
-   }
-}
-
-
-/*
- * pipe_context::destroy hook.  Releases everything the context owns and
- * frees the context itself.
- *
- * swr_create_context() also calls this from its failure path, so the
- * context may be only partly initialized: every step tolerates members
- * that are still zero from the initial memset, including a missing core
- * context and an unset pipe->screen.
- */
-static void
-swr_destroy(struct pipe_context *pipe)
-{
-   struct swr_context *ctx = swr_context(pipe);
-   /* pipe->screen may not have been assigned yet on the failure path. */
-   struct swr_screen *screen = pipe->screen ? swr_screen(pipe->screen) : NULL;
-
-   if (ctx->blitter)
-      util_blitter_destroy(ctx->blitter);
-
-   for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
-      if (ctx->framebuffer.cbufs[i]) {
-         struct swr_resource *res = swr_resource(ctx->framebuffer.cbufs[i]->texture);
-         /* NULL curr_pipe, so we don't have a reference to a deleted pipe */
-         res->curr_pipe = NULL;
-         pipe_surface_reference(&ctx->framebuffer.cbufs[i], NULL);
-      }
-   }
-
-   if (ctx->framebuffer.zsbuf) {
-      struct swr_resource *res = swr_resource(ctx->framebuffer.zsbuf->texture);
-      /* NULL curr_pipe, so we don't have a reference to a deleted pipe */
-      res->curr_pipe = NULL;
-      pipe_surface_reference(&ctx->framebuffer.zsbuf, NULL);
-   }
-
-   /* Drop the sampler view references of every shader stage, not only the
-    * fragment and vertex ones.  Unreferencing an empty slot is a no-op. */
-   for (unsigned sh = 0; sh < ARRAY_SIZE(ctx->sampler_views); sh++) {
-      for (unsigned i = 0; i < ARRAY_SIZE(ctx->sampler_views[0]); i++) {
-         pipe_sampler_view_reference(&ctx->sampler_views[sh][i], NULL);
-      }
-   }
-
-   if (ctx->pipe.stream_uploader)
-      u_upload_destroy(ctx->pipe.stream_uploader);
-
-   if (ctx->swrContext) {
-      /* Idle core after destroying buffer resources, but before deleting
-       * context.  Destroying resources has potentially called StoreTiles.*/
-      ctx->api.pfnSwrWaitForIdle(ctx->swrContext);
-      ctx->api.pfnSwrDestroyContext(ctx->swrContext);
-   }
-
-   delete ctx->blendJIT;
-
-   swr_destroy_scratch_buffers(ctx);
-
-   /* Only update screen->pipe if current context is being destroyed */
-   if (screen && screen->pipe == pipe)
-      screen->pipe = NULL;
-
-   AlignedFree(ctx);
-}
-
-
-/*
- * render_condition hook: records the conditional-rendering query together
- * with its condition value and mode on the context.
- */
-static void
-swr_render_condition(struct pipe_context *pipe,
-                     struct pipe_query *query,
-                     bool condition,
-                     enum pipe_render_cond_flag mode)
-{
-   struct swr_context *swr = swr_context(pipe);
-
-   swr->render_cond_query = query;
-   swr->render_cond_cond = condition;
-   swr->render_cond_mode = mode;
-}
-
-
-/* flush_resource hook: intentionally does nothing. */
-static void
-swr_flush_resource(struct pipe_context *ctx, struct pipe_resource *resource)
-{
-   // NOOP
-}
-
-/*
- * pfnUpdateStats callback registered with the core.  Adds the depth-pass,
- * pixel-shader and compute-shader counters from pStats to the query result
- * referenced by the draw context.  Does nothing for a NULL private context.
- */
-static void
-swr_UpdateStats(HANDLE hPrivateContext, const SWR_STATS *pStats)
-{
-   swr_draw_context *drawCtx = (swr_draw_context*)hPrivateContext;
-   if (!drawCtx)
-      return;
-
-   SWR_STATS *totals = &drawCtx->pStats->core;
-
-   totals->DepthPassCount += pStats->DepthPassCount;
-   totals->PsInvocations += pStats->PsInvocations;
-   totals->CsInvocations += pStats->CsInvocations;
-}
-
-/*
- * pfnUpdateStatsFE callback registered with the core.  Atomically adds the
- * front-end counters from pStats to the query result referenced by the draw
- * context.  Does nothing for a NULL private context.
- */
-static void
-swr_UpdateStatsFE(HANDLE hPrivateContext, const SWR_STATS_FE *pStats)
-{
-   swr_draw_context *drawCtx = (swr_draw_context*)hPrivateContext;
-   if (!drawCtx)
-      return;
-
-   SWR_STATS_FE *totals = &drawCtx->pStats->coreFE;
-
-   /* Input assembler */
-   p_atomic_add(&totals->IaVertices, pStats->IaVertices);
-   p_atomic_add(&totals->IaPrimitives, pStats->IaPrimitives);
-
-   /* Shader stage invocations */
-   p_atomic_add(&totals->VsInvocations, pStats->VsInvocations);
-   p_atomic_add(&totals->HsInvocations, pStats->HsInvocations);
-   p_atomic_add(&totals->DsInvocations, pStats->DsInvocations);
-   p_atomic_add(&totals->GsInvocations, pStats->GsInvocations);
-
-   /* Remaining C* and geometry-shader primitive counters */
-   p_atomic_add(&totals->CInvocations, pStats->CInvocations);
-   p_atomic_add(&totals->CPrimitives, pStats->CPrimitives);
-   p_atomic_add(&totals->GsPrimitives, pStats->GsPrimitives);
-
-   /* Stream-output counters, one pair per entry */
-   for (unsigned so = 0; so < 4; so++) {
-      p_atomic_add(&totals->SoPrimStorageNeeded[so],
-            pStats->SoPrimStorageNeeded[so]);
-      p_atomic_add(&totals->SoNumPrimsWritten[so],
-            pStats->SoNumPrimsWritten[so]);
-   }
-}
-
-/*
- * pfnUpdateStreamOut callback registered with the core.  Adds numPrims to
- * the counter the draw context's soPrims points at; a NULL private context
- * or a NULL soPrims pointer is ignored.
- */
-static void
-swr_UpdateStreamOut(HANDLE hPrivateContext, uint64_t numPrims)
-{
-   swr_draw_context *drawCtx = (swr_draw_context*)hPrivateContext;
-
-   if (drawCtx && drawCtx->soPrims)
-      *drawCtx->soPrims += numPrims;
-}
-
-/*
- * Creates a SWR rendering context for `p_screen`.
- *
- * p_screen - screen the context belongs to
- * priv     - opaque pointer stored in pipe_context::priv
- * flags    - not used by this function
- *
- * Returns the pipe_context embedded in the new swr_context, or NULL if the
- * context memory, the core context, the stream uploader or the blitter
- * cannot be created.
- */
-struct pipe_context *
-swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags)
-{
-   struct swr_screen *screen = swr_screen(p_screen);
-
-   struct swr_context *ctx = (struct swr_context *)
-      AlignedMalloc(sizeof(struct swr_context), KNOB_SIMD_BYTES);
-   if (!ctx)
-      return NULL;
-   memset((void*)ctx, 0, sizeof(struct swr_context));
-
-   /* Assigned before anything can fail, so teardown through swr_destroy()
-    * always finds a valid screen pointer. */
-   ctx->pipe.screen = p_screen;
-
-   screen->pfnSwrGetInterface(ctx->api);
-   screen->pfnSwrGetTileInterface(ctx->tileApi);
-   ctx->swrDC.pAPI = &ctx->api;
-   ctx->swrDC.pTileAPI = &ctx->tileApi;
-
-   ctx->blendJIT =
-      new std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC>;
-
-   ctx->max_draws_in_flight = KNOB_MAX_DRAWS_IN_FLIGHT;
-
-   SWR_CREATECONTEXT_INFO createInfo {0};
-
-   createInfo.privateStateSize = sizeof(swr_draw_context);
-   createInfo.pfnLoadTile = swr_LoadHotTile;
-   createInfo.pfnStoreTile = swr_StoreHotTile;
-   createInfo.pfnUpdateStats = swr_UpdateStats;
-   createInfo.pfnUpdateStatsFE = swr_UpdateStatsFE;
-   createInfo.pfnUpdateStreamOut = swr_UpdateStreamOut;
-   createInfo.pfnMakeGfxPtr = swr_MakeGfxPtr;
-
-   SWR_THREADING_INFO threadingInfo {0};
-
-   threadingInfo.MAX_WORKER_THREADS        = KNOB_MAX_WORKER_THREADS;
-   threadingInfo.MAX_NUMA_NODES            = KNOB_MAX_NUMA_NODES;
-   threadingInfo.MAX_CORES_PER_NUMA_NODE   = KNOB_MAX_CORES_PER_NUMA_NODE;
-   threadingInfo.MAX_THREADS_PER_CORE      = KNOB_MAX_THREADS_PER_CORE;
-   threadingInfo.SINGLE_THREADED           = KNOB_SINGLE_THREADED;
-
-   // Use non-standard settings for KNL, unless the user overrode the knob
-   // through the environment.
-   if (screen->is_knl)
-   {
-      if (nullptr == getenv("KNOB_MAX_THREADS_PER_CORE"))
-         threadingInfo.MAX_THREADS_PER_CORE  = 2;
-
-      if (nullptr == getenv("KNOB_MAX_DRAWS_IN_FLIGHT"))
-      {
-         ctx->max_draws_in_flight = 2048;
-         createInfo.MAX_DRAWS_IN_FLIGHT = ctx->max_draws_in_flight;
-      }
-   }
-
-   createInfo.pThreadInfo = &threadingInfo;
-
-   ctx->swrContext = ctx->api.pfnSwrCreateContext(&createInfo);
-
-   ctx->api.pfnSwrInit();
-
-   if (ctx->swrContext == NULL) {
-      /* Tear down by hand rather than through swr_destroy(): only the
-       * blend-JIT map has been created at this point and there is no core
-       * context to idle or destroy. */
-      delete ctx->blendJIT;
-      AlignedFree(ctx);
-      return NULL;
-   }
-
-   ctx->pipe.destroy = swr_destroy;
-   ctx->pipe.priv = priv;
-   ctx->pipe.create_surface = swr_create_surface;
-   ctx->pipe.surface_destroy = swr_surface_destroy;
-   ctx->pipe.buffer_map = swr_transfer_map;
-   ctx->pipe.buffer_unmap = swr_transfer_unmap;
-   ctx->pipe.texture_map = swr_transfer_map;
-   ctx->pipe.texture_unmap = swr_transfer_unmap;
-   ctx->pipe.transfer_flush_region = swr_transfer_flush_region;
-
-   ctx->pipe.buffer_subdata = u_default_buffer_subdata;
-   ctx->pipe.texture_subdata = u_default_texture_subdata;
-
-   ctx->pipe.clear_texture = util_clear_texture;
-   ctx->pipe.resource_copy_region = swr_resource_copy;
-   ctx->pipe.flush_resource = swr_flush_resource;
-   ctx->pipe.render_condition = swr_render_condition;
-
-   swr_state_init(&ctx->pipe);
-   swr_clear_init(&ctx->pipe);
-   swr_draw_init(&ctx->pipe);
-   swr_query_init(&ctx->pipe);
-
-   /* One uploader serves both the stream and the constant upload paths. */
-   ctx->pipe.stream_uploader = u_upload_create_default(&ctx->pipe);
-   if (!ctx->pipe.stream_uploader)
-      goto fail;
-   ctx->pipe.const_uploader = ctx->pipe.stream_uploader;
-
-   ctx->pipe.blit = swr_blit;
-   ctx->blitter = util_blitter_create(&ctx->pipe);
-   if (!ctx->blitter)
-      goto fail;
-
-   swr_init_scratch_buffers(ctx);
-
-   return &ctx->pipe;
-
-fail:
-   /* The core context exists here, so the regular destroy path applies. */
-   swr_destroy(&ctx->pipe);
-   return NULL;
-}
diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h
deleted file mode 100644
index 11578764c23..00000000000
--- a/src/gallium/drivers/swr/swr_context.h
+++ /dev/null
@@ -1,236 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#ifndef SWR_CONTEXT_H
-#define SWR_CONTEXT_H
-
-#include "common/os.h"
-
-#include "pipe/p_context.h"
-#include "pipe/p_state.h"
-#include "util/u_blitter.h"
-#include "rasterizer/memory/SurfaceState.h"
-#include "rasterizer/memory/InitMemory.h"
-#include "jit_api.h"
-#include "swr_state.h"
-#include <unordered_map>
-
-/* State flags, one bit each.  swr_context::dirty is a mask of these
- * SWR_NEW_x values. */
-#define SWR_NEW_BLEND (1 << 0)
-#define SWR_NEW_RASTERIZER (1 << 1)
-#define SWR_NEW_DEPTH_STENCIL_ALPHA (1 << 2)
-#define SWR_NEW_SAMPLER (1 << 3)
-#define SWR_NEW_SAMPLER_VIEW (1 << 4)
-#define SWR_NEW_VS (1 << 5)
-#define SWR_NEW_FS (1 << 6)
-#define SWR_NEW_GS (1 << 7)
-#define SWR_NEW_VSCONSTANTS (1 << 8)
-#define SWR_NEW_FSCONSTANTS (1 << 9)
-#define SWR_NEW_GSCONSTANTS (1 << 10)
-#define SWR_NEW_VERTEX (1 << 11)
-#define SWR_NEW_STIPPLE (1 << 12)
-#define SWR_NEW_SCISSOR (1 << 13)
-#define SWR_NEW_VIEWPORT (1 << 14)
-#define SWR_NEW_FRAMEBUFFER (1 << 15)
-#define SWR_NEW_CLIP (1 << 16)
-#define SWR_NEW_SO (1 << 17)
-#define SWR_BLOCK_CLIENT_DRAW ( 1 << 18) // Indicates client draw will block
-#define SWR_NEW_TCS (1 << 19)
-#define SWR_NEW_TES (1 << 20)
-#define SWR_NEW_TS (1 << 21)
-#define SWR_NEW_TCSCONSTANTS (1 << 22)
-#define SWR_NEW_TESCONSTANTS (1 << 23)
-
-namespace std
-{
-/* Hash for BLEND_COMPILE_STATE so it can key an unordered_map: a CRC32
- * over the raw bytes of the state object. */
-template <> struct hash<BLEND_COMPILE_STATE> {
-   std::size_t operator()(const BLEND_COMPILE_STATE &state) const
-   {
-      return util_hash_crc32(&state, sizeof(state));
-   }
-};
-} // namespace std
-
-/* Per-texture parameters; swr_draw_context holds one array of these per
- * shader stage.
- * NOTE(review): presumably read by JIT-compiled shader code, so the field
- * order is likely part of a layout contract -- confirm before reordering. */
-struct swr_jit_texture {
-   uint32_t width; // same as number of elements
-   uint32_t height;
-   uint32_t depth; // doubles as array size
-   uint32_t first_level;
-   uint32_t last_level;
-   const uint8_t *base_ptr;
-   uint32_t num_samples;
-   uint32_t sample_stride;
-   // Per-mip-level arrays, one entry for each of PIPE_MAX_TEXTURE_LEVELS
-   uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS];
-   uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS];
-   uint32_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS];
-};
-
-/* Per-sampler parameters; swr_draw_context holds one array of these per
- * shader stage.
- * NOTE(review): presumably read by JIT-compiled shader code -- confirm
- * before changing the layout. */
-struct swr_jit_sampler {
-   float min_lod;
-   float max_lod;
-   float lod_bias;
-   float border_color[4]; // four components; NOTE(review): likely RGBA -- confirm
-};
-
-/* Driver-private per-draw state.  swr_create_context() passes this
- * struct's size to the core as privateStateSize, and
- * swr_update_draw_context() memcpy's the context's copy (swr_context::swrDC)
- * into the core's private context state.
- * NOTE(review): the @llvm_struct tags on some members suggest the layout is
- * consumed by a type generator -- confirm before reordering fields. */
-struct swr_draw_context {
-   // Constant buffer pointers and element counts, one pair of arrays per
-   // shader stage (VS, FS, GS, TCS, TES).
-   const float *constantVS[PIPE_MAX_CONSTANT_BUFFERS];
-   uint32_t num_constantsVS[PIPE_MAX_CONSTANT_BUFFERS];
-   const float *constantFS[PIPE_MAX_CONSTANT_BUFFERS];
-   uint32_t num_constantsFS[PIPE_MAX_CONSTANT_BUFFERS];
-   const float *constantGS[PIPE_MAX_CONSTANT_BUFFERS];
-   uint32_t num_constantsGS[PIPE_MAX_CONSTANT_BUFFERS];
-   const float *constantTCS[PIPE_MAX_CONSTANT_BUFFERS];
-   uint32_t num_constantsTCS[PIPE_MAX_CONSTANT_BUFFERS];
-   const float *constantTES[PIPE_MAX_CONSTANT_BUFFERS];
-   uint32_t num_constantsTES[PIPE_MAX_CONSTANT_BUFFERS];
-
-   // Texture and sampler descriptors, again one pair of arrays per stage.
-   swr_jit_texture texturesVS[PIPE_MAX_SHADER_SAMPLER_VIEWS];
-   swr_jit_sampler samplersVS[PIPE_MAX_SAMPLERS];
-   swr_jit_texture texturesFS[PIPE_MAX_SHADER_SAMPLER_VIEWS];
-   swr_jit_sampler samplersFS[PIPE_MAX_SAMPLERS];
-   swr_jit_texture texturesGS[PIPE_MAX_SHADER_SAMPLER_VIEWS];
-   swr_jit_sampler samplersGS[PIPE_MAX_SAMPLERS];
-   swr_jit_texture texturesTCS[PIPE_MAX_SHADER_SAMPLER_VIEWS];
-   swr_jit_sampler samplersTCS[PIPE_MAX_SAMPLERS];
-   swr_jit_texture texturesTES[PIPE_MAX_SHADER_SAMPLER_VIEWS];
-   swr_jit_sampler samplersTES[PIPE_MAX_SAMPLERS];
-
-   // Four floats per user clip plane
-   float userClipPlanes[PIPE_MAX_CLIP_PLANES][4];
-
-   // 32 words of polygon stipple pattern
-   uint32_t polyStipple[32];
-
-   SWR_SURFACE_STATE renderTargets[SWR_NUM_ATTACHMENTS];
-   // Query result that the stats callbacks accumulate into
-   struct swr_query_result *pStats; // @llvm_struct
-   SWR_INTERFACE *pAPI; // @llvm_struct - Needed for the swr_memory callbacks
-   SWR_TILE_INTERFACE *pTileAPI; // @llvm_struct - Needed for the swr_memory callbacks
-
-   uint64_t* soPrims; //number of primitives written to StreamOut buffer
-};
-
-/* gen_llvm_types FINI */
-
-/* The SWR driver's context.  The pipe_context is the first member, and
- * swr_context() converts a pipe_context pointer back to this struct with a
- * plain cast, so `pipe` must stay first. */
-struct swr_context {
-   struct pipe_context pipe; /**< base class */
-
-   /* Handle of the core context returned by pfnSwrCreateContext */
-   HANDLE swrContext;
-
-   SWR_TS_STATE tsState;
-
-   /** Constant state objects */
-   struct swr_blend_state *blend;
-   struct pipe_sampler_state *samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS];
-   struct pipe_depth_stencil_alpha_state *depth_stencil;
-   struct pipe_rasterizer_state *rasterizer;
-
-   /* Currently bound shaders and vertex element layout */
-   struct swr_vertex_shader *vs;
-   struct swr_fragment_shader *fs;
-   struct swr_geometry_shader *gs;
-   struct swr_tess_control_shader *tcs;
-   struct swr_tess_evaluation_shader *tes;
-   struct swr_vertex_element_state *velems;
-
-   /** Other rendering state */
-   struct pipe_blend_color blend_color;
-   struct pipe_stencil_ref stencil_ref;
-   struct pipe_clip_state clip;
-   struct pipe_constant_buffer
-      constants[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
-   struct pipe_framebuffer_state framebuffer;
-   struct swr_poly_stipple poly_stipple;
-   struct pipe_scissor_state scissors[KNOB_NUM_VIEWPORTS_SCISSORS];
-   SWR_RECT swr_scissors[KNOB_NUM_VIEWPORTS_SCISSORS];
-   struct pipe_sampler_view *
-      sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS];
-
-   struct pipe_viewport_state viewports[KNOB_NUM_VIEWPORTS_SCISSORS];
-   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
-
-   /* util_blitter instance created in swr_create_context; used by swr_blit */
-   struct blitter_context *blitter;
-
-   /** Conditional query object and mode */
-   struct pipe_query *render_cond_query;
-   enum pipe_render_cond_flag render_cond_mode;
-   bool render_cond_cond;
-   /* Non-zero while queries are active; swr_blit pauses stats when it is set */
-   unsigned active_queries;
-
-   unsigned num_vertex_buffers;
-   unsigned num_samplers[PIPE_SHADER_TYPES];
-   unsigned num_sampler_views[PIPE_SHADER_TYPES];
-
-   unsigned sample_mask;
-
-   // streamout
-   pipe_stream_output_target *so_targets[MAX_SO_STREAMS];
-   uint32_t num_so_targets;
-   uint64_t so_primCounter; // number of primitives written to StreamOut buffer
-
-   /* Temp storage for user_buffer constants */
-   struct swr_scratch_buffers *scratch;
-
-   // blend jit functions; allocated with new in swr_create_context and
-   // deleted in swr_destroy
-   std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC> *blendJIT;
-
-   /* Derived SWR API DrawState */
-   struct swr_derived_state derived;
-
-   /* SWR private state - draw context */
-   struct swr_draw_context swrDC;
-
-   unsigned dirty; /**< Mask of SWR_NEW_x flags */
-
-   /* Core entry-point tables, filled in by the screen's pfnSwrGetInterface
-    * and pfnSwrGetTileInterface */
-   SWR_INTERFACE api;
-   SWR_TILE_INTERFACE tileApi;
-
-   /* KNOB_MAX_DRAWS_IN_FLIGHT by default; may be raised to 2048 on KNL */
-   uint32_t max_draws_in_flight;
-   uint8_t patch_vertices;
-};
-
-/* Converts a pipe_context pointer to the swr_context that embeds it as its
- * first member. */
-static INLINE struct swr_context *
-swr_context(struct pipe_context *pipe)
-{
-   return reinterpret_cast<struct swr_context *>(pipe);
-}
-
-/*
- * Copies the context's draw state (ctx->swrDC) into the core's private
- * context state.  When `pqr` is non-NULL it first becomes the stats target
- * recorded in ctx->swrDC, so the copy carries it too.
- */
-static INLINE void
-swr_update_draw_context(struct swr_context *ctx,
-      struct swr_query_result *pqr = nullptr)
-{
-   swr_draw_context *coreState =
-      (swr_draw_context *)ctx->api.pfnSwrGetPrivateContextState(ctx->swrContext);
-
-   if (pqr != nullptr)
-      ctx->swrDC.pStats = pqr;
-
-   memcpy(coreState, &ctx->swrDC, sizeof(ctx->swrDC));
-}
-
-/* Creates a SWR context for the given screen; returns the embedded
- * pipe_context, or NULL on failure. */
-struct pipe_context *swr_create_context(struct pipe_screen *, void *priv, unsigned flags);
-
-/* Called by swr_create_context().
- * NOTE(review): presumably installs the state-related pipe_context hooks --
- * defined elsewhere, confirm. */
-void swr_state_init(struct pipe_context *pipe);
-
-/* Installs the clear hook (pipe->clear). */
-void swr_clear_init(struct pipe_context *pipe);
-
-/* Called by swr_create_context().
- * NOTE(review): presumably installs the draw hooks -- defined elsewhere,
- * confirm. */
-void swr_draw_init(struct pipe_context *pipe);
-
-/* NOTE(review): defined elsewhere; the name suggests it waits for
- * outstanding work on the context -- confirm. */
-void swr_finish(struct pipe_context *pipe);
-
-/* NOTE(review): defined elsewhere; the name suggests it resolves a
- * multisampled src_resource into dst_resource -- confirm. */
-void swr_do_msaa_resolve(struct pipe_resource *src_resource,
-                         struct pipe_resource *dst_resource);
-#endif
diff --git a/src/gallium/drivers/swr/swr_draw.cpp b/src/gallium/drivers/swr/swr_draw.cpp
deleted file mode 100644
index 4b42a8e0390..00000000000
--- a/src/gallium/drivers/swr/swr_draw.cpp
+++ /dev/null
@@ -1,399 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#include "swr_screen.h"
-#include "swr_context.h"
-#include "swr_resource.h"
-#include "swr_fence.h"
-#include "swr_query.h"
-#include "jit_api.h"
-
-#include "util/u_draw.h"
-#include "util/u_prim.h"
-
-#include <algorithm>
-#include <iostream>
-/*
- * Draw vertex arrays, with optional indexing, optional instancing.
- */
-static void
-swr_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
-             unsigned drawid_offset,
-             const struct pipe_draw_indirect_info *indirect,
-             const struct pipe_draw_start_count_bias *draws,
-             unsigned num_draws)
-{
-   if (num_draws > 1) {
-      struct pipe_draw_info tmp_info = *info;
-      unsigned drawid = drawid_offset;
-
-      for (unsigned i = 0; i < num_draws; i++) {
-         swr_draw_vbo(pipe, &tmp_info, drawid, indirect, &draws[i], 1);
-         if (tmp_info.increment_draw_id)
-            drawid++;
-      }
-      return;
-   }
-
-   if (!indirect && (!draws[0].count || !info->instance_count))
-      return;
-
-   struct swr_context *ctx = swr_context(pipe);
-
-   if (!indirect &&
-       !info->primitive_restart &&
-       !u_trim_pipe_prim(info->mode, (unsigned*)&draws[0].count))
-      return;
-
-   if (!swr_check_render_cond(pipe))
-      return;
-
-   if (indirect && indirect->buffer) {
-      util_draw_indirect(pipe, info, indirect);
-      return;
-   }
-
-   /* If indexed draw, force vertex validation since index buffer comes
-    * from draw info. */
-   if (info->index_size)
-      ctx->dirty |= SWR_NEW_VERTEX;
-
-   /* Update derived state, pass draw info to update function. */
-   swr_update_derived(pipe, info, draws);
-
-   swr_update_draw_context(ctx);
-
-   struct pipe_draw_info resolved_info;
-   struct pipe_draw_start_count_bias resolved_draw;
-   /* DrawTransformFeedback */
-   if (indirect && indirect->count_from_stream_output) {
-      // trick copied from softpipe to modify const struct *info
-      memcpy(&resolved_info, (void*)info, sizeof(struct pipe_draw_info));
-      resolved_draw.start = draws[0].start;
-      resolved_draw.count = ctx->so_primCounter * ctx->patch_vertices;
-      resolved_info.max_index = resolved_draw.count - 1;
-      info = &resolved_info;
-      indirect = NULL;
-      draws = &resolved_draw;
-   }
-
-   if (ctx->vs->pipe.stream_output.num_outputs) {
-      if (!ctx->vs->soFunc[info->mode]) {
-         STREAMOUT_COMPILE_STATE state = {0};
-         struct pipe_stream_output_info *so = &ctx->vs->pipe.stream_output;
-
-         state.numVertsPerPrim = u_vertices_per_prim(info->mode);
-
-         uint32_t offsets[MAX_SO_STREAMS] = {0};
-         uint32_t num = 0;
-
-         for (uint32_t i = 0; i < so->num_outputs; i++) {
-            assert(so->output[i].stream == 0); // @todo
-            uint32_t output_buffer = so->output[i].output_buffer;
-            if (so->output[i].dst_offset != offsets[output_buffer]) {
-               // hole - need to fill
-               state.stream.decl[num].bufferIndex = output_buffer;
-               state.stream.decl[num].hole = true;
-               state.stream.decl[num].componentMask =
-                  (1 << (so->output[i].dst_offset - offsets[output_buffer]))
-                  - 1;
-               num++;
-               offsets[output_buffer] = so->output[i].dst_offset;
-            }
-
-            unsigned attrib_slot = so->output[i].register_index;
-            attrib_slot = swr_so_adjust_attrib(attrib_slot, ctx->vs);
-
-            state.stream.decl[num].bufferIndex = output_buffer;
-            state.stream.decl[num].attribSlot = attrib_slot;
-            state.stream.decl[num].componentMask =
-               ((1 << so->output[i].num_components) - 1)
-               << so->output[i].start_component;
-            state.stream.decl[num].hole = false;
-            num++;
-
-            offsets[output_buffer] += so->output[i].num_components;
-         }
-
-         state.stream.numDecls = num;
-
-         HANDLE hJitMgr = swr_screen(pipe->screen)->hJitMgr;
-         ctx->vs->soFunc[info->mode] = JitCompileStreamout(hJitMgr, state);
-         debug_printf("so shader    %p\n", ctx->vs->soFunc[info->mode]);
-         assert(ctx->vs->soFunc[info->mode] && "Error: SoShader = NULL");
-      }
-
-      ctx->api.pfnSwrSetSoFunc(ctx->swrContext, ctx->vs->soFunc[info->mode], 0);
-   }
-
-   struct swr_vertex_element_state *velems = ctx->velems;
-   if (info->primitive_restart)
-      velems->fsState.cutIndex = info->restart_index;
-   else
-      velems->fsState.cutIndex = 0;
-   velems->fsState.bEnableCutIndex = info->primitive_restart;
-   velems->fsState.bPartialVertexBuffer = (info->index_bounds_valid && info->min_index > 0);
-
-   swr_jit_fetch_key key;
-   swr_generate_fetch_key(key, velems);
-   auto search = velems->map.find(key);
-   if (search != velems->map.end()) {
-      velems->fsFunc = search->second;
-   } else {
-      HANDLE hJitMgr = swr_screen(ctx->pipe.screen)->hJitMgr;
-      velems->fsFunc = JitCompileFetch(hJitMgr, velems->fsState);
-
-      debug_printf("fetch shader %p\n", velems->fsFunc);
-      assert(velems->fsFunc && "Error: FetchShader = NULL");
-
-      velems->map.insert(std::make_pair(key, velems->fsFunc));
-   }
-
-   ctx->api.pfnSwrSetFetchFunc(ctx->swrContext, velems->fsFunc);
-
-   /* Set up frontend state
-    * XXX setup provokingVertex & topologyProvokingVertex */
-   SWR_FRONTEND_STATE feState = {0};
-
-   // feState.vsVertexSize seeds the PA size that is used as an interface
-   // between all the shader stages, so it has to be large enough to
-   // incorporate all interfaces between stages
-
-   // max of frontend shaders num_outputs
-   feState.vsVertexSize = ctx->vs->info.base.num_outputs;
-   if (ctx->gs) {
-      feState.vsVertexSize = std::max(feState.vsVertexSize, (uint32_t)ctx->gs->info.base.num_outputs);
-   }
-   if (ctx->tcs) {
-      feState.vsVertexSize = std::max(feState.vsVertexSize, (uint32_t)ctx->tcs->info.base.num_outputs);
-   }
-   if (ctx->tes) {
-      feState.vsVertexSize = std::max(feState.vsVertexSize, (uint32_t)ctx->tes->info.base.num_outputs);
-   }
-
-
-   if (ctx->vs->info.base.num_outputs) {
-      // gs does not adjust for position in SGV slot at input from vs
-      if (!ctx->gs && !ctx->tcs && !ctx->tes)
-         feState.vsVertexSize--;
-   }
-
-   // other (non-SGV) slots start at VERTEX_ATTRIB_START_SLOT
-   feState.vsVertexSize += VERTEX_ATTRIB_START_SLOT;
-
-   // The PA in the clipper does not handle BE vertex sizes
-   // different from FE. Increase vertexsize only for the cases that needed it
-
-   // primid needs a slot
-   if (ctx->fs->info.base.uses_primid)
-      feState.vsVertexSize++;
-   // sprite coord enable
-   if (ctx->rasterizer->sprite_coord_enable)
-      feState.vsVertexSize++;
-
-   if (ctx->rasterizer->flatshade_first) {
-      feState.provokingVertex = {1, 0, 0};
-   } else {
-      feState.provokingVertex = {2, 1, 2};
-   }
-
-   enum pipe_prim_type topology;
-   if (ctx->gs)
-      topology = (pipe_prim_type)ctx->gs->info.base.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM];
-   else
-      topology = info->mode;
-
-   switch (topology) {
-   case PIPE_PRIM_TRIANGLE_FAN:
-      feState.topologyProvokingVertex = feState.provokingVertex.triFan;
-      break;
-   case PIPE_PRIM_TRIANGLE_STRIP:
-   case PIPE_PRIM_TRIANGLES:
-      feState.topologyProvokingVertex = feState.provokingVertex.triStripList;
-      break;
-   case PIPE_PRIM_QUAD_STRIP:
-   case PIPE_PRIM_QUADS:
-      if (ctx->rasterizer->flatshade_first)
-         feState.topologyProvokingVertex = 0;
-      else
-         feState.topologyProvokingVertex = 3;
-      break;
-   case PIPE_PRIM_LINES:
-   case PIPE_PRIM_LINE_LOOP:
-   case PIPE_PRIM_LINE_STRIP:
-      feState.topologyProvokingVertex = feState.provokingVertex.lineStripList;
-      break;
-   default:
-      feState.topologyProvokingVertex = 0;
-   }
-
-   feState.bEnableCutIndex = info->primitive_restart;
-   ctx->api.pfnSwrSetFrontendState(ctx->swrContext, &feState);
-
-   if (info->index_size)
-      ctx->api.pfnSwrDrawIndexedInstanced(ctx->swrContext,
-                                          swr_convert_prim_topology(info->mode, ctx->patch_vertices),
-                                          draws[0].count,
-                                          info->instance_count,
-                                          draws[0].start,
-                                          draws->index_bias,
-                                          info->start_instance);
-   else
-      ctx->api.pfnSwrDrawInstanced(ctx->swrContext,
-                                   swr_convert_prim_topology(info->mode, ctx->patch_vertices),
-                                   draws[0].count,
-                                   info->instance_count,
-                                   draws[0].start,
-                                   info->start_instance);
-
-   /* On client-buffer draw, we used client buffer directly, without
-    * copy.  Block until draw is finished.
-    * VMD is an example application that benefits from this. */
-   if (ctx->dirty & SWR_BLOCK_CLIENT_DRAW) {
-      struct swr_screen *screen = swr_screen(pipe->screen);
-      swr_fence_submit(ctx, screen->flush_fence);
-      swr_fence_finish(pipe->screen, NULL, screen->flush_fence, 0);
-   }
-}
-
-
-static void
-swr_flush(struct pipe_context *pipe,
-          struct pipe_fence_handle **fence,
-          unsigned flags)
-{
-   struct swr_context *ctx = swr_context(pipe);
-   struct swr_screen *screen = swr_screen(pipe->screen);
-
-   for (int i=0; i < ctx->framebuffer.nr_cbufs; i++) {
-      struct pipe_surface *cb = ctx->framebuffer.cbufs[i];
-      if (cb) {
-         swr_store_dirty_resource(pipe, cb->texture, SWR_TILE_RESOLVED);
-      }
-   }
-   if (ctx->framebuffer.zsbuf) {
-      swr_store_dirty_resource(pipe, ctx->framebuffer.zsbuf->texture,
-                               SWR_TILE_RESOLVED);
-   }
-
-   if (fence)
-      swr_fence_reference(pipe->screen, fence, screen->flush_fence);
-}
-
-void
-swr_finish(struct pipe_context *pipe)
-{
-   struct pipe_fence_handle *fence = nullptr;
-
-   swr_flush(pipe, &fence, 0);
-   swr_fence_finish(pipe->screen, NULL, fence, 0);
-   swr_fence_reference(pipe->screen, &fence, NULL);
-}
-
-/*
- * Invalidate tiles so they can be reloaded back when needed
- */
-void
-swr_invalidate_render_target(struct pipe_context *pipe,
-                             uint32_t attachment,
-                             uint16_t width, uint16_t height)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   /* grab the rect from the passed in arguments */
-   swr_update_draw_context(ctx);
-   SWR_RECT full_rect =
-      {0, 0, (int32_t)width, (int32_t)height};
-   ctx->api.pfnSwrInvalidateTiles(ctx->swrContext,
-                                  1 << attachment,
-                                  full_rect);
-}
-
-
-/*
- * Store SWR HotTiles back to renderTarget surface.
- */
-void
-swr_store_render_target(struct pipe_context *pipe,
-                        uint32_t attachment,
-                        enum SWR_TILE_STATE post_tile_state)
-{
-   struct swr_context *ctx = swr_context(pipe);
-   struct swr_draw_context *pDC = &ctx->swrDC;
-   struct SWR_SURFACE_STATE *renderTarget = &pDC->renderTargets[attachment];
-
-   /* Only proceed if there's a valid surface to store to */
-   if (renderTarget->xpBaseAddress) {
-      swr_update_draw_context(ctx);
-      SWR_RECT full_rect =
-         {0, 0,
-          (int32_t)u_minify(renderTarget->width, renderTarget->lod),
-          (int32_t)u_minify(renderTarget->height, renderTarget->lod)};
-      ctx->api.pfnSwrStoreTiles(ctx->swrContext,
-                                1 << attachment,
-                                post_tile_state,
-                                full_rect);
-   }
-}
-
-void
-swr_store_dirty_resource(struct pipe_context *pipe,
-                         struct pipe_resource *resource,
-                         enum SWR_TILE_STATE post_tile_state)
-{
-   /* Only store resource if it has been written to */
-   if (swr_resource(resource)->status & SWR_RESOURCE_WRITE) {
-      struct swr_context *ctx = swr_context(pipe);
-      struct swr_screen *screen = swr_screen(pipe->screen);
-      struct swr_resource *spr = swr_resource(resource);
-
-      swr_draw_context *pDC = &ctx->swrDC;
-      SWR_SURFACE_STATE *renderTargets = pDC->renderTargets;
-      for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; i++)
-         if (renderTargets[i].xpBaseAddress == spr->swr.xpBaseAddress ||
-             (spr->secondary.xpBaseAddress &&
-              renderTargets[i].xpBaseAddress == spr->secondary.xpBaseAddress)) {
-            swr_store_render_target(pipe, i, post_tile_state);
-
-            /* Mesa thinks depth/stencil are fused, so we'll never get an
-             * explicit resource for stencil.  So, if checking depth, then
-             * also check for stencil. */
-            if (spr->has_stencil && (i == SWR_ATTACHMENT_DEPTH)) {
-               swr_store_render_target(
-                  pipe, SWR_ATTACHMENT_STENCIL, post_tile_state);
-            }
-
-            /* This fence signals StoreTiles completion */
-            swr_fence_submit(ctx, screen->flush_fence);
-
-            break;
-         }
-   }
-}
-
-void
-swr_draw_init(struct pipe_context *pipe)
-{
-   pipe->draw_vbo = swr_draw_vbo;
-   pipe->flush = swr_flush;
-}
diff --git a/src/gallium/drivers/swr/swr_fence.cpp b/src/gallium/drivers/swr/swr_fence.cpp
deleted file mode 100644
index 4e2b2af874c..00000000000
--- a/src/gallium/drivers/swr/swr_fence.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#include "pipe/p_screen.h"
-#include "util/u_memory.h"
-#include "util/os_time.h"
-
-#include "swr_context.h"
-#include "swr_screen.h"
-#include "swr_fence.h"
-
-#ifdef __APPLE__
-#include <sched.h>
-#endif
-
-#if defined(PIPE_CC_MSVC) // portable thread yield
-   #define sched_yield SwitchToThread
-#endif
-
-/*
- * Fence callback, called by back-end thread on completion of all rendering up
- * to SwrSync call.
- */
-static void
-swr_fence_cb(uint64_t userData, uint64_t userData2, uint64_t userData3)
-{
-   struct swr_fence *fence = (struct swr_fence *)userData;
-
-   /* Complete all work attached to the fence */
-   swr_fence_do_work(fence);
-
-   /* Correct value is in SwrSync data, and not the fence write field. */
-   /* Contexts may not finish in order, but fence value always increases */
-   if (fence->read < userData2)
-      fence->read = userData2;
-}
-
-/*
- * Submit an existing fence.
- */
-void
-swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fh)
-{
-   struct swr_fence *fence = swr_fence(fh);
-
-   fence->write++;
-   fence->pending = TRUE;
-   ctx->api.pfnSwrSync(ctx->swrContext, swr_fence_cb, (uint64_t)fence, fence->write, 0);
-}
-
-/*
- * Create a new fence object.
- */
-struct pipe_fence_handle *
-swr_fence_create()
-{
-   static int fence_id = 0;
-   struct swr_fence *fence = CALLOC_STRUCT(swr_fence);
-   if (!fence)
-      return NULL;
-
-   pipe_reference_init(&fence->reference, 1);
-   fence->id = fence_id++;
-   fence->work.tail = &fence->work.head;
-
-   return (struct pipe_fence_handle *)fence;
-}
-
-/** Destroy a fence.  Called when refcount hits zero. */
-static void
-swr_fence_destroy(struct swr_fence *fence)
-{
-   /* Complete any work left if fence was not submitted */
-   swr_fence_do_work(fence);
-   FREE(fence);
-}
-
-/**
- * Set ptr = fence, with reference counting
- */
-void
-swr_fence_reference(struct pipe_screen *screen,
-                    struct pipe_fence_handle **ptr,
-                    struct pipe_fence_handle *f)
-{
-   struct swr_fence *fence = swr_fence(f);
-   struct swr_fence *old;
-
-   if (likely(ptr)) {
-      old = swr_fence(*ptr);
-      *ptr = f;
-   } else {
-      old = NULL;
-   }
-
-   if (pipe_reference(&old->reference, &fence->reference)) {
-      swr_fence_finish(screen, NULL, (struct pipe_fence_handle *) old, 0);
-      swr_fence_destroy(old);
-   }
-}
-
-
-/*
- * Wait for the fence to finish.
- */
-bool
-swr_fence_finish(struct pipe_screen *screen,
-                 struct pipe_context *ctx,
-                 struct pipe_fence_handle *fence_handle,
-                 uint64_t timeout)
-{
-   while (!swr_is_fence_done(fence_handle))
-      sched_yield();
-
-   swr_fence(fence_handle)->pending = FALSE;
-
-   return TRUE;
-}
-
-
-uint64_t
-swr_get_timestamp(struct pipe_screen *screen)
-{
-   return os_time_get_nano();
-}
-
-
-void
-swr_fence_init(struct pipe_screen *p_screen)
-{
-   p_screen->fence_reference = swr_fence_reference;
-   p_screen->fence_finish = swr_fence_finish;
-   p_screen->get_timestamp = swr_get_timestamp;
-
-   /* Create persistant StoreTiles "flush" fence, used to signal completion
-    * of flushing tile state back to resource texture, via StoreTiles. */
-   struct swr_screen *screen = swr_screen(p_screen);
-   screen->flush_fence = swr_fence_create();
-}
diff --git a/src/gallium/drivers/swr/swr_fence.h b/src/gallium/drivers/swr/swr_fence.h
deleted file mode 100644
index 2f7cd1cf9a6..00000000000
--- a/src/gallium/drivers/swr/swr_fence.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ***************************************************************************/
-
-#ifndef SWR_FENCE_H
-#define SWR_FENCE_H
-
-#include "pipe/p_state.h"
-#include "util/u_inlines.h"
-
-#include "swr_fence_work.h"
-
-struct pipe_screen;
-
-struct swr_fence {
-   struct pipe_reference reference;
-
-   uint64_t read;
-   uint64_t write;
-
-   unsigned pending;
-
-   unsigned id; /* Just for reference */
-   
-   struct {
-      uint32_t count;
-      struct swr_fence_work head;
-      struct swr_fence_work *tail;
-   } work;
-};
-
-
-static inline struct swr_fence *
-swr_fence(struct pipe_fence_handle *fence)
-{
-   return (struct swr_fence *)fence;
-}
-
-
-static INLINE bool
-swr_is_fence_done(struct pipe_fence_handle *fence_handle)
-{
-   struct swr_fence *fence = swr_fence(fence_handle);
-   return (fence->read == fence->write);
-}
-
-static INLINE bool
-swr_is_fence_pending(struct pipe_fence_handle *fence_handle)
-{
-   return swr_fence(fence_handle)->pending;
-}
-
-
-void swr_fence_init(struct pipe_screen *screen);
-
-struct pipe_fence_handle *swr_fence_create();
-
-void swr_fence_reference(struct pipe_screen *screen,
-                         struct pipe_fence_handle **ptr,
-                         struct pipe_fence_handle *f);
-
-bool swr_fence_finish(struct pipe_screen *screen,
-                      struct pipe_context *ctx,
-                      struct pipe_fence_handle *fence_handle,
-                      uint64_t timeout);
-
-void
-swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fence);
-
-uint64_t swr_get_timestamp(struct pipe_screen *screen);
-
-#endif
diff --git a/src/gallium/drivers/swr/swr_fence_work.cpp b/src/gallium/drivers/swr/swr_fence_work.cpp
deleted file mode 100644
index 6df55666a36..00000000000
--- a/src/gallium/drivers/swr/swr_fence_work.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#include "swr_context.h"
-#include "swr_fence.h"
-
-#include "util/u_inlines.h"
-#include "util/u_memory.h"
-
-/*
- * Called by swr_fence_cb to complete the work queue
- */
-void
-swr_fence_do_work(struct swr_fence *fence)
-{
-   struct swr_fence_work *work, *tmp;
-
-   if (fence->work.head.next) {
-      work = fence->work.head.next;
-      /* Immediately clear the head so any new work gets added to a new work
-       * queue */
-      p_atomic_set(&fence->work.head.next, 0);
-      p_atomic_set(&fence->work.tail, &fence->work.head);
-      p_atomic_set(&fence->work.count, 0);
-
-      do {
-         tmp = work->next;
-         work->callback(work);
-         FREE(work);
-         work = tmp;
-      } while(work);
-   }
-}
-
-
-/*
- * Called by one of the specialized work routines below
- */
-static inline void
-swr_add_fence_work(struct pipe_fence_handle *fh,
-                   struct swr_fence_work *work)
-{
-   /* If no fence, just do the work now */
-   if (!fh) {
-      work->callback(work);
-      FREE(work);
-      return;
-   }
-
-   struct swr_fence *fence  = swr_fence(fh);
-   p_atomic_set(&fence->work.tail->next, work);
-   p_atomic_set(&fence->work.tail, work);
-   p_atomic_inc(&fence->work.count);
-}
-
-
-/*
- * Generic free/free_aligned, and delete vs/fs
- */
-template<bool aligned_free>
-static void
-swr_free_cb(struct swr_fence_work *work)
-{
-   if (aligned_free)
-      AlignedFree(work->free.data);
-   else
-      FREE(work->free.data);
-}
-
-static void
-swr_delete_vs_cb(struct swr_fence_work *work)
-{
-   delete work->free.swr_vs;
-}
-
-static void
-swr_delete_fs_cb(struct swr_fence_work *work)
-{
-   delete work->free.swr_fs;
-}
-
-static void
-swr_delete_gs_cb(struct swr_fence_work *work)
-{
-   delete work->free.swr_gs;
-}
-
-static void
-swr_delete_tcs_cb(struct swr_fence_work *work)
-{
-   delete work->free.swr_tcs;
-}
-
-static void
-swr_delete_tes_cb(struct swr_fence_work *work)
-{
-   delete work->free.swr_tes;
-}
-
-
-bool
-swr_fence_work_free(struct pipe_fence_handle *fence, void *data,
-                    bool aligned_free)
-{
-   struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work);
-   if (!work)
-      return false;
-   if (aligned_free)
-      work->callback = swr_free_cb<true>;
-   else
-      work->callback = swr_free_cb<false>;
-   work->free.data = data;
-
-   swr_add_fence_work(fence, work);
-
-   return true;
-}
-
-bool
-swr_fence_work_delete_vs(struct pipe_fence_handle *fence,
-                         struct swr_vertex_shader *swr_vs)
-{
-   struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work);
-   if (!work)
-      return false;
-   work->callback = swr_delete_vs_cb;
-   work->free.swr_vs = swr_vs;
-
-   swr_add_fence_work(fence, work);
-
-   return true;
-}
-
-bool
-swr_fence_work_delete_fs(struct pipe_fence_handle *fence,
-                         struct swr_fragment_shader *swr_fs)
-{
-   struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work);
-   if (!work)
-      return false;
-   work->callback = swr_delete_fs_cb;
-   work->free.swr_fs = swr_fs;
-
-   swr_add_fence_work(fence, work);
-
-   return true;
-}
-
-bool
-swr_fence_work_delete_gs(struct pipe_fence_handle *fence,
-                         struct swr_geometry_shader *swr_gs)
-{
-   struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work);
-   if (!work)
-      return false;
-   work->callback = swr_delete_gs_cb;
-   work->free.swr_gs = swr_gs;
-
-   swr_add_fence_work(fence, work);
-
-   return true;
-}
-
-bool
-swr_fence_work_delete_tcs(struct pipe_fence_handle *fence,
-                          struct swr_tess_control_shader *swr_tcs)
-{
-   struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work);
-   if (!work)
-      return false;
-   work->callback = swr_delete_tcs_cb;
-   work->free.swr_tcs = swr_tcs;
-
-   swr_add_fence_work(fence, work);
-
-   return true;
-}
-
-
-bool
-swr_fence_work_delete_tes(struct pipe_fence_handle *fence,
-                          struct swr_tess_evaluation_shader *swr_tes)
-{
-   struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work);
-   if (!work)
-      return false;
-   work->callback = swr_delete_tes_cb;
-   work->free.swr_tes = swr_tes;
-
-   swr_add_fence_work(fence, work);
-
-   return true;
-}
-\ No newline at end of file
diff --git a/src/gallium/drivers/swr/swr_fence_work.h b/src/gallium/drivers/swr/swr_fence_work.h
deleted file mode 100644
index ab411599ca5..00000000000
--- a/src/gallium/drivers/swr/swr_fence_work.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ***************************************************************************/
-
-#ifndef SWR_FENCE_WORK_H
-#define SWR_FENCE_WORK_H
-
-typedef void(*SWR_WORK_CALLBACK_FUNC)(struct swr_fence_work *work);
-
-struct swr_fence_work {
-   SWR_WORK_CALLBACK_FUNC callback;
-
-   union {
-      void *data;
-      struct swr_vertex_shader *swr_vs;
-      struct swr_fragment_shader *swr_fs;
-      struct swr_geometry_shader *swr_gs;
-      struct swr_tess_control_shader *swr_tcs;
-      struct swr_tess_evaluation_shader *swr_tes;
-   } free;
-
-   struct swr_fence_work *next;
-};
-
-void swr_fence_do_work(struct swr_fence *fence);
-
-bool swr_fence_work_free(struct pipe_fence_handle *fence, void *data,
-                         bool aligned_free = false);
-bool swr_fence_work_delete_vs(struct pipe_fence_handle *fence,
-                              struct swr_vertex_shader *swr_vs);
-bool swr_fence_work_delete_fs(struct pipe_fence_handle *fence,
-                              struct swr_fragment_shader *swr_vs);
-bool swr_fence_work_delete_gs(struct pipe_fence_handle *fence,
-                              struct swr_geometry_shader *swr_gs);
-bool swr_fence_work_delete_tcs(struct pipe_fence_handle *fence,
-                               struct swr_tess_control_shader *swr_tcs);
-bool swr_fence_work_delete_tes(struct pipe_fence_handle *fence,
-                               struct swr_tess_evaluation_shader *swr_tes);
-#endif
diff --git a/src/gallium/drivers/swr/swr_loader.cpp b/src/gallium/drivers/swr/swr_loader.cpp
deleted file mode 100644
index 1fb14e636d7..00000000000
--- a/src/gallium/drivers/swr/swr_loader.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#include "memory/InitMemory.h"
-#include "util/u_cpu_detect.h"
-#include "util/u_dl.h"
-#include "swr_public.h"
-#include "swr_screen.h"
-
-#include <stdio.h>
-
-/*
- * Bind the SWR backend entry points for one CPU architecture to `screen`.
- *
- * With HAVE_SWR_BUILTIN the backend is linked in directly and `arch` is not
- * used.  Otherwise the shared library named
- * "<UTIL_DL_PREFIX>swr<arch><UTIL_DL_EXT>" is opened and the symbols
- * SwrGetInterface, SwrGetTileIterface and InitTilesTable are looked up in it.
- *
- * Returns true when the entry points were stored in `screen` and the tile
- * table initializer has been called.  Returns false when the library name
- * does not fit the local buffer, the library cannot be opened, or any of the
- * three symbols is missing (the library is closed again in that case).
- */
-static bool
-swr_initialize_screen_interface(struct swr_screen *screen, const char arch[])
-{
-#ifdef HAVE_SWR_BUILTIN
-   screen->pLibrary = NULL;
-   screen->pfnSwrGetInterface = SwrGetInterface;
-   screen->pfnSwrGetTileInterface = SwrGetTileIterface;
-   InitTilesTable();
-   swr_print_info("(using: builtin).\n");
-#else
-   char filename[256] = { 0 };
-
-   /* Bounded formatting: never write past `filename`, and refuse to load a
-    * silently truncated library name. */
-   int len = snprintf(filename, sizeof(filename), "%sswr%s%s",
-                      UTIL_DL_PREFIX, arch, UTIL_DL_EXT);
-   if (len < 0 || (size_t)len >= sizeof(filename)) {
-      fprintf(stderr, "(skipping: library name too long).\n");
-      return false;
-   }
-
-   screen->pLibrary = util_dl_open(filename);
-   if (!screen->pLibrary) {
-      fprintf(stderr, "(skipping: %s).\n", util_dl_error());
-      return false;
-   }
-
-   util_dl_proc pApiProc = util_dl_get_proc_address(screen->pLibrary,
-      "SwrGetInterface");
-   util_dl_proc pTileApiProc = util_dl_get_proc_address(screen->pLibrary,
-      "SwrGetTileIterface");
-   util_dl_proc pInitFunc = util_dl_get_proc_address(screen->pLibrary,
-      "InitTilesTable");
-   if (!pApiProc || !pInitFunc || !pTileApiProc) {
-      /* A partially usable library is useless: unload it again. */
-      fprintf(stderr, "(skipping: %s).\n", util_dl_error());
-      util_dl_close(screen->pLibrary);
-      screen->pLibrary = NULL;
-      return false;
-   }
-
-   screen->pfnSwrGetInterface = (PFNSwrGetInterface)pApiProc;
-   screen->pfnSwrGetTileInterface = (PFNSwrGetTileInterface)pTileApiProc;
-
-   SWR_ASSERT(screen->pfnSwrGetInterface != nullptr);
-   SWR_ASSERT(screen->pfnSwrGetTileInterface != nullptr);
-   SWR_ASSERT(pInitFunc != nullptr);
-
-   pInitFunc();
-
-   swr_print_info("(using: %s).\n", filename);
-#endif
-
-   return true;
-}
-
-
-/*
- * Driver entry point: create a screen and bind it to the best SWR backend
- * the CPU supports.
- *
- * Backends are tried in the order KNL, SKX, AVX2, AVX.  A backend is only
- * attempted when the CPU reports the matching capability bits and the
- * corresponding HAVE_SWR_* macro was defined at build time.  The first
- * backend that initializes wins.
- *
- * Returns the new pipe_screen, or NULL when the internal screen could not be
- * created or no backend could be initialized (the screen is destroyed in the
- * latter case).
- */
-struct pipe_screen *
-swr_create_screen(struct sw_winsys *winsys)
-{
-   struct pipe_screen *p_screen = swr_create_screen_internal(winsys);
-   if (!p_screen) {
-      return NULL;
-   }
-
-   struct swr_screen *screen = swr_screen(p_screen);
-   screen->is_knl = false;
-
-   util_cpu_detect();
-
-   /* KNL: requires both AVX512F and AVX512ER. */
-   if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512er) {
-      swr_print_info("SWR detected KNL instruction support ");
-#ifndef HAVE_SWR_KNL
-      swr_print_info("(skipping: not built).\n");
-#else
-      if (swr_initialize_screen_interface(screen, "KNL")) {
-         screen->is_knl = true;
-         return p_screen;
-      }
-#endif
-   }
-
-   /* SKX: requires both AVX512F and AVX512BW. */
-   if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512bw) {
-      swr_print_info("SWR detected SKX instruction support ");
-#ifndef HAVE_SWR_SKX
-      swr_print_info("(skipping not built).\n");
-#else
-      if (swr_initialize_screen_interface(screen, "SKX"))
-         return p_screen;
-#endif
-   }
-
-   if (util_get_cpu_caps()->has_avx2) {
-      swr_print_info("SWR detected AVX2 instruction support ");
-#ifndef HAVE_SWR_AVX2
-      swr_print_info("(skipping not built).\n");
-#else
-      if (swr_initialize_screen_interface(screen, "AVX2"))
-         return p_screen;
-#endif
-   }
-
-   if (util_get_cpu_caps()->has_avx) {
-      swr_print_info("SWR detected AVX instruction support ");
-#ifndef HAVE_SWR_AVX
-      swr_print_info("(skipping not built).\n");
-#else
-      if (swr_initialize_screen_interface(screen, "AVX"))
-         return p_screen;
-#endif
-   }
-
-   /* Nothing usable: report it and tear the half-built screen down. */
-   fprintf(stderr, "SWR could not initialize a supported CPU architecture.\n");
-   swr_destroy_screen_internal(&screen);
-
-   return NULL;
-}
-
-
-#ifdef _WIN32
-// swap function called from libl_gdi.c
-
-/*
- * Forward a swap request to the screen's flush_frontbuffer hook, passing
- * `hDC` through as the winsys-private argument.
- */
-void
-swr_gdi_swap(struct pipe_screen *screen,
-             struct pipe_context *ctx,
-             struct pipe_resource *res,
-             void *hDC)
-{
-   screen->flush_frontbuffer(screen, ctx, res, 0, 0, hDC, NULL);
-}
-
-#endif /* _WIN32 */
diff --git a/src/gallium/drivers/swr/swr_memory.h b/src/gallium/drivers/swr/swr_memory.h
deleted file mode 100644
index bf6eaa34758..00000000000
--- a/src/gallium/drivers/swr/swr_memory.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#pragma once
-#include "rasterizer/core/context.h"
-/*
- * Load-hot-tile hook: looks up the render target surface selected by
- * renderTargetIndex in the driver's private draw state and forwards the
- * request to the tile API's pfnSwrLoadHotTile.
- */
-INLINE void
-swr_LoadHotTile(HANDLE hDC,
-                HANDLE hWorkerPrivateData,
-                SWR_FORMAT dstFormat,
-                SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-                UINT x, UINT y,
-                uint32_t renderTargetArrayIndex, uint8_t* pDstHotTile)
-{
-   DRAW_CONTEXT *drawCtx = (DRAW_CONTEXT*)hDC;
-   swr_draw_context *swrDrawCtx = (swr_draw_context*)GetPrivateState(drawCtx);
-   SWR_SURFACE_STATE *surface = &swrDrawCtx->renderTargets[renderTargetIndex];
-
-   swrDrawCtx->pTileAPI->pfnSwrLoadHotTile(hWorkerPrivateData,
-                                           surface,
-                                           drawCtx->pContext->pBucketMgr,
-                                           dstFormat,
-                                           renderTargetIndex,
-                                           x, y,
-                                           renderTargetArrayIndex,
-                                           pDstHotTile);
-}
-
-/*
- * Store-hot-tile hook: looks up the render target surface selected by
- * renderTargetIndex in the driver's private draw state and forwards the
- * request to the tile API's pfnSwrStoreHotTileToSurface.
- */
-INLINE void
-swr_StoreHotTile(HANDLE hDC,
-                 HANDLE hWorkerPrivateData,
-                 SWR_FORMAT srcFormat,
-                 SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-                 UINT x, UINT y,
-                 uint32_t renderTargetArrayIndex, uint8_t* pSrcHotTile)
-{
-   DRAW_CONTEXT *drawCtx = (DRAW_CONTEXT*)hDC;
-   swr_draw_context *swrDrawCtx = (swr_draw_context*)GetPrivateState(drawCtx);
-   SWR_SURFACE_STATE *surface = &swrDrawCtx->renderTargets[renderTargetIndex];
-
-   swrDrawCtx->pTileAPI->pfnSwrStoreHotTileToSurface(hWorkerPrivateData,
-                                                     surface,
-                                                     drawCtx->pContext->pBucketMgr,
-                                                     srcFormat,
-                                                     renderTargetIndex,
-                                                     x, y,
-                                                     renderTargetArrayIndex,
-                                                     pSrcHotTile);
-}
-
-/*
- * Satisfies an internal interface that is otherwise unused: the system
- * address is simply converted to a gfxptr_t.  hPrivateContext is ignored.
- */
-INLINE gfxptr_t
-swr_MakeGfxPtr(HANDLE hPrivateContext, void* sysAddr)
-{
-    gfxptr_t gfxAddr = (gfxptr_t)sysAddr;
-    return gfxAddr;
-}
diff --git a/src/gallium/drivers/swr/swr_public.h b/src/gallium/drivers/swr/swr_public.h
deleted file mode 100644
index 2a7d2984cb3..00000000000
--- a/src/gallium/drivers/swr/swr_public.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#ifndef SWR_PUBLIC_H
-#define SWR_PUBLIC_H
-
-/* Opaque types used by the prototypes below.  pipe_resource is declared here
- * too so the swr_gdi_swap prototype does not introduce a prototype-scoped
- * struct when this header is included from C before the pipe headers. */
-struct pipe_screen;
-struct pipe_context;
-struct pipe_resource;
-struct sw_displaytarget;
-struct sw_winsys;
-struct swr_screen;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// driver entry point
-struct pipe_screen *swr_create_screen(struct sw_winsys *winsys);
-
-// arch-specific dll entry point
-struct pipe_screen *swr_create_screen_internal(struct sw_winsys *winsys);
-
-// cleanup for failed screen creation
-void swr_destroy_screen_internal(struct swr_screen **screen);
-
-#ifdef _WIN32
-void swr_gdi_swap(struct pipe_screen *screen,
-                  struct pipe_context *ctx,
-                  struct pipe_resource *res,
-                  void *hDC);
-#endif /* _WIN32 */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/src/gallium/drivers/swr/swr_query.cpp b/src/gallium/drivers/swr/swr_query.cpp
deleted file mode 100644
index 005b64fb090..00000000000
--- a/src/gallium/drivers/swr/swr_query.cpp
+++ /dev/null
@@ -1,272 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "util/os_time.h"
-#include "swr_context.h"
-#include "swr_fence.h"
-#include "swr_query.h"
-#include "swr_screen.h"
-#include "swr_state.h"
-#include "common/os.h"
-
-/* Reinterpret a generic pipe_query handle as the driver's swr_query. */
-static struct swr_query *
-swr_query(struct pipe_query *p)
-{
-   struct swr_query *query = (struct swr_query *)p;
-   return query;
-}
-
-/*
- * Allocate a zeroed, 64-byte aligned query object of the given type and
- * stream index.  Returns NULL when the allocation fails.
- */
-static struct pipe_query *
-swr_create_query(struct pipe_context *pipe, unsigned type, unsigned index)
-{
-   assert(type < PIPE_QUERY_TYPES);
-   assert(index < MAX_SO_STREAMS);
-
-   struct swr_query *query =
-      (struct swr_query *)AlignedMalloc(sizeof(struct swr_query), 64);
-   if (!query)
-      return NULL;
-
-   memset(query, 0, sizeof(*query));
-   query->type = type;
-   query->index = index;
-
-   return (struct pipe_query *)query;
-}
-
-
-/*
- * Release a query object.  If a fence is still attached, it is finished
- * first when pending and then unreferenced before the memory is freed.
- */
-static void
-swr_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
-{
-   struct swr_query *query = swr_query(q);
-
-   if (query->fence) {
-      struct pipe_screen *screen = pipe->screen;
-
-      if (swr_is_fence_pending(query->fence)) {
-         swr_fence_finish(screen, NULL, query->fence, 0);
-      }
-      swr_fence_reference(screen, &query->fence, NULL);
-   }
-
-   AlignedFree(query);
-}
-
-
-/*
- * Fetch the result of a query into `result`.
- *
- * When the query still holds a fence: with wait == false and the fence not
- * done, returns false without touching `result`; otherwise the fence is
- * finished and released before the counters are read.
- *
- * Which member of `result` is written depends on the query type (see the
- * switch below).  Returns true whenever a result was produced.
- */
-static bool
-swr_get_query_result(struct pipe_context *pipe,
-                     struct pipe_query *q,
-                     bool wait,
-                     union pipe_query_result *result)
-{
-   struct swr_query *pq = swr_query(q);
-   unsigned index = pq->index;
-
-   if (pq->fence) {
-      if (!wait && !swr_is_fence_done(pq->fence))
-         return false;
-
-      swr_fence_finish(pipe->screen, NULL, pq->fence, 0);
-      swr_fence_reference(pipe->screen, &pq->fence, NULL);
-   }
-
-   /* All values are reset to 0 at swr_begin_query, except starting timestamp.
-    * Counters become simply end values.  */
-   switch (pq->type) {
-   /* Booleans */
-   case PIPE_QUERY_OCCLUSION_PREDICATE:
-   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-      result->b = pq->result.core.DepthPassCount != 0;
-      break;
-   case PIPE_QUERY_GPU_FINISHED:
-      result->b = true;
-      break;
-   /* Counters */
-   case PIPE_QUERY_OCCLUSION_COUNTER:
-      result->u64 = pq->result.core.DepthPassCount;
-      break;
-   case PIPE_QUERY_TIMESTAMP:
-   case PIPE_QUERY_TIME_ELAPSED:
-      /* swr_begin_query only records a start time for TIME_ELAPSED; for
-       * TIMESTAMP the start stays at its zeroed value. */
-      result->u64 = pq->result.timestamp_end - pq->result.timestamp_start;
-      break;
-   case PIPE_QUERY_PRIMITIVES_GENERATED:
-      result->u64 = pq->result.coreFE.IaPrimitives;
-      break;
-   case PIPE_QUERY_PRIMITIVES_EMITTED:
-      result->u64 = pq->result.coreFE.SoNumPrimsWritten[index];
-      break;
-   /* Structures */
-   case PIPE_QUERY_SO_STATISTICS: {
-      struct pipe_query_data_so_statistics *so_stats = &result->so_statistics;
-      so_stats->num_primitives_written =
-         pq->result.coreFE.SoNumPrimsWritten[index];
-      so_stats->primitives_storage_needed =
-         pq->result.coreFE.SoPrimStorageNeeded[index];
-   } break;
-   case PIPE_QUERY_TIMESTAMP_DISJOINT:
-      /* os_get_time_nano returns nanoseconds */
-      result->timestamp_disjoint.frequency = UINT64_C(1000000000);
-      result->timestamp_disjoint.disjoint = FALSE;
-      break;
-   case PIPE_QUERY_PIPELINE_STATISTICS: {
-      struct pipe_query_data_pipeline_statistics *p_stats =
-         &result->pipeline_statistics;
-      p_stats->ia_vertices = pq->result.coreFE.IaVertices;
-      p_stats->ia_primitives = pq->result.coreFE.IaPrimitives;
-      p_stats->vs_invocations = pq->result.coreFE.VsInvocations;
-      p_stats->gs_invocations = pq->result.coreFE.GsInvocations;
-      p_stats->gs_primitives = pq->result.coreFE.GsPrimitives;
-      /* NOTE(review): c_invocations is filled from CPrimitives, the same
-       * source as c_primitives -- confirm whether a separate invocation
-       * counter was intended. */
-      p_stats->c_invocations = pq->result.coreFE.CPrimitives;
-      p_stats->c_primitives = pq->result.coreFE.CPrimitives;
-      p_stats->ps_invocations = pq->result.core.PsInvocations;
-      p_stats->hs_invocations = pq->result.coreFE.HsInvocations;
-      p_stats->ds_invocations = pq->result.coreFE.DsInvocations;
-      p_stats->cs_invocations = pq->result.core.CsInvocations;
-    } break;
-   case PIPE_QUERY_SO_OVERFLOW_PREDICATE: {
-      uint64_t num_primitives_written =
-         pq->result.coreFE.SoNumPrimsWritten[index];
-      uint64_t primitives_storage_needed =
-         pq->result.coreFE.SoPrimStorageNeeded[index];
-      /* NOTE(review): overflow is reported when written > needed; confirm
-       * the direction of this comparison against the core's counters. */
-      result->b = num_primitives_written > primitives_storage_needed;
-   }
-      break;
-   default:
-      assert(0 && "Unsupported query");
-      break;
-   }
-
-   return true;
-}
-
-/*
- * Start a query.  The stored result is zeroed first.  GPU_FINISHED and
- * TIMESTAMP need no further setup, TIME_ELAPSED records a start timestamp,
- * and every other type points the draw context at this query's result
- * storage and turns on core statistics if no other query is active.
- * Always returns true.
- */
-static bool
-swr_begin_query(struct pipe_context *pipe, struct pipe_query *q)
-{
-   struct swr_context *ctx = swr_context(pipe);
-   struct swr_query *pq = swr_query(q);
-
-   /* Initialize Results */
-   memset(&pq->result, 0, sizeof(pq->result));
-   switch (pq->type) {
-   case PIPE_QUERY_GPU_FINISHED:
-   case PIPE_QUERY_TIMESTAMP:
-      /* nothing to do, but don't want the default */
-      break;
-   case PIPE_QUERY_TIME_ELAPSED:
-      pq->result.timestamp_start = swr_get_timestamp(pipe->screen);
-      break;
-   default:
-      /* Core counters required.  Update draw context with location to
-       * store results. */
-      swr_update_draw_context(ctx, &pq->result);
-
-      /* Only change stat collection if there are no active queries */
-      if (ctx->active_queries == 0) {
-         ctx->api.pfnSwrEnableStatsFE(ctx->swrContext, TRUE);
-         ctx->api.pfnSwrEnableStatsBE(ctx->swrContext, TRUE);
-      }
-      ctx->active_queries++;
-      break;
-   }
-
-
-   return true;
-}
-
-/*
- * Finish a query.  GPU_FINISHED needs nothing, the timestamp types record an
- * end timestamp, and every other type submits a fence (taking a reference on
- * the screen's flush fence when the query has none yet) and turns core
- * statistics off once the last active query ends.  Always returns true.
- */
-static bool
-swr_end_query(struct pipe_context *pipe, struct pipe_query *q)
-{
-   struct swr_context *ctx = swr_context(pipe);
-   struct swr_query *pq = swr_query(q);
-
-   switch (pq->type) {
-   case PIPE_QUERY_GPU_FINISHED:
-      /* nothing to do, but don't want the default */
-      break;
-   case PIPE_QUERY_TIMESTAMP:
-   case PIPE_QUERY_TIME_ELAPSED:
-      pq->result.timestamp_end = swr_get_timestamp(pipe->screen);
-      break;
-   default:
-      /* Stats are updated asynchronously, a fence is used to signal
-       * completion. */
-      if (!pq->fence) {
-         struct swr_screen *screen = swr_screen(pipe->screen);
-         swr_fence_reference(pipe->screen, &pq->fence, screen->flush_fence);
-      }
-      swr_fence_submit(ctx, pq->fence);
-
-      /* Only change stat collection if there are no active queries */
-      /* NOTE(review): the counter is decremented unconditionally, so this
-       * relies on every end being paired with a begin -- confirm callers. */
-      ctx->active_queries--;
-      if (ctx->active_queries == 0) {
-         ctx->api.pfnSwrEnableStatsFE(ctx->swrContext, FALSE);
-         ctx->api.pfnSwrEnableStatsBE(ctx->swrContext, FALSE);
-      }
-
-      break;
-   }
-
-   return true;
-}
-
-
-/*
- * Evaluate the context's conditional-rendering predicate.
- *
- * Returns true when drawing should proceed: no predicate query is set, the
- * query result is not available yet, or the result (treated as zero /
- * non-zero) matches ctx->render_cond_cond.
- */
-bool
-swr_check_render_cond(struct pipe_context *pipe)
-{
-   struct swr_context *ctx = swr_context(pipe);
-   union pipe_query_result result;
-   bool b, wait;
-
-   if (!ctx->render_cond_query)
-      return true; /* no query predicate, draw normally */
-
-   wait = (ctx->render_cond_mode == PIPE_RENDER_COND_WAIT
-           || ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT);
-
-   /* Use a full-size, zeroed result.  Boolean queries only write one byte,
-    * so the remaining bytes of the 64-bit value must be known to be zero.
-    * Structure-valued queries write more than 8 bytes and need the whole
-    * union as backing storage. */
-   memset(&result, 0, sizeof(result));
-
-   b = pipe->get_query_result(
-      pipe, ctx->render_cond_query, wait, &result);
-   if (b)
-      return ((!result.u64) == ctx->render_cond_cond);
-   else
-      return true;
-}
-
-
-/* Intentionally empty: this hook is installed by swr_query_init but both
- * arguments are ignored. */
-static void
-swr_set_active_query_state(struct pipe_context *pipe, bool enable)
-{
-}
-
-/*
- * Install the query entry points on `pipe` and reset the context's count of
- * active queries.
- */
-void
-swr_query_init(struct pipe_context *pipe)
-{
-   swr_context(pipe)->active_queries = 0;
-
-   /* Object lifetime */
-   pipe->create_query = swr_create_query;
-   pipe->destroy_query = swr_destroy_query;
-
-   /* Query execution and readback */
-   pipe->begin_query = swr_begin_query;
-   pipe->end_query = swr_end_query;
-   pipe->get_query_result = swr_get_query_result;
-   pipe->set_active_query_state = swr_set_active_query_state;
-}
diff --git a/src/gallium/drivers/swr/swr_query.h b/src/gallium/drivers/swr/swr_query.h
deleted file mode 100644
index d838dc859e2..00000000000
--- a/src/gallium/drivers/swr/swr_query.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#ifndef SWR_QUERY_H
-#define SWR_QUERY_H
-
-
-#include <limits.h>
-
-/* Raw values collected for one query. */
-struct swr_query_result {
-   SWR_STATS core;            /* core statistics block */
-   SWR_STATS_FE coreFE;       /* core front-end statistics block */
-   uint64_t timestamp_start;  /* timestamp taken when the query began */
-   uint64_t timestamp_end;    /* timestamp taken when the query ended */
-};
-
-/* Driver-side query object.
- * NOTE(review): OSALIGNLINE is defined elsewhere; presumably it requests
- * cache-line alignment for the type -- confirm. */
-OSALIGNLINE(struct) swr_query {
-   unsigned type; /* PIPE_QUERY_* */
-   unsigned index; /* index given at creation time */
-
-   struct swr_query_result result; /* collected values */
-   struct pipe_fence_handle *fence; /* fence attached to the query, or NULL */
-};
-
-extern void swr_query_init(struct pipe_context *pipe);
-
-extern bool swr_check_render_cond(struct pipe_context *pipe);
-#endif
diff --git a/src/gallium/drivers/swr/swr_resource.h b/src/gallium/drivers/swr/swr_resource.h
deleted file mode 100644
index 2228dff7488..00000000000
--- a/src/gallium/drivers/swr/swr_resource.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#ifndef SWR_RESOURCE_H
-#define SWR_RESOURCE_H
-
-#include "memory/SurfaceState.h"
-#include "pipe/p_state.h"
-#include "api.h"
-
-struct sw_displaytarget;
-
-/* In-use state of a resource.  READ and WRITE are distinct bits, so they can
- * be OR-ed together; UNUSED is the absence of both. */
-enum swr_resource_status {
-   SWR_RESOURCE_UNUSED = 0x0,
-   SWR_RESOURCE_READ = 0x1,
-   SWR_RESOURCE_WRITE = 0x2,
-};
-
-/* Driver-side resource.  `base` is the first member, so a pipe_resource
- * pointer can be cast to a swr_resource pointer (see swr_resource()). */
-struct swr_resource {
-   struct pipe_resource base;
-
-   bool has_depth;
-   bool has_stencil;
-
-   SWR_SURFACE_STATE swr;
-   SWR_SURFACE_STATE secondary; /* for faking depth/stencil merged formats */
-
-   struct sw_displaytarget *display_target;
-
-   /* If resource is multisample, then this points to an alternate resource
-    * containing the resolved multisample surface, otherwise null */
-   struct pipe_resource *resolve_target;
-
-   /* Per-mip-level offsets for `swr` and `secondary` respectively */
-   size_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS];
-   size_t secondary_mip_offsets[PIPE_MAX_TEXTURE_LEVELS];
-
-   /* Read/write usage flags, see enum swr_resource_status */
-   enum swr_resource_status status;
-
-   /* last pipe that used (validated) this resource */
-   struct pipe_context *curr_pipe;
-};
-
-
-/* Reinterpret a generic pipe_resource as the driver's swr_resource. */
-static INLINE struct swr_resource *
-swr_resource(struct pipe_resource *resource)
-{
-   struct swr_resource *res = (struct swr_resource *)resource;
-   return res;
-}
-
-/*
- * Tell whether a resource is a texture.  PIPE_BUFFER yields false, the known
- * texture targets yield true, and any other target asserts and yields false.
- */
-static INLINE bool
-swr_resource_is_texture(const struct pipe_resource *resource)
-{
-   switch (resource->target) {
-   case PIPE_TEXTURE_1D:
-   case PIPE_TEXTURE_1D_ARRAY:
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_2D_ARRAY:
-   case PIPE_TEXTURE_RECT:
-   case PIPE_TEXTURE_3D:
-   case PIPE_TEXTURE_CUBE:
-   case PIPE_TEXTURE_CUBE_ARRAY:
-      return true;
-   case PIPE_BUFFER:
-      return false;
-   default:
-      break;
-   }
-
-   /* Unknown target */
-   assert(0);
-   return false;
-}
-
-
-/*
- * Return the base address of a non-texture resource as a byte pointer.
- * Asserts that the resource is not a texture.
- */
-static INLINE uint8_t *
-swr_resource_data(struct pipe_resource *resource)
-{
-   assert(!swr_resource_is_texture(resource));
-
-   return (uint8_t*)(swr_resource(resource)->swr.xpBaseAddress);
-}
-
-
-void swr_invalidate_render_target(struct pipe_context *pipe,
-                                  uint32_t attachment,
-                                  uint16_t width, uint16_t height);
-
-void swr_store_render_target(struct pipe_context *pipe,
-                             uint32_t attachment,
-                             enum SWR_TILE_STATE post_tile_state);
-
-void swr_store_dirty_resource(struct pipe_context *pipe,
-                              struct pipe_resource *resource,
-                              enum SWR_TILE_STATE post_tile_state);
-
-void swr_update_resource_status(struct pipe_context *,
-                                const struct pipe_draw_info *);
-
-/*
- * Functions to indicate a resource's in-use status.
- */
-
-/*
- * OR the flag `b` into `a` and return a reference to `a`.  The combination is
- * computed on values rather than by reinterpreting the enum object as an
- * int, so it does not depend on the enum's underlying type.
- */
-static INLINE enum
-swr_resource_status & operator|=(enum swr_resource_status & a,
-                                 enum swr_resource_status  b) {
-   a = (enum swr_resource_status)((int)a | (int)b);
-   return a;
-}
-
-/* Add the READ flag to the resource's status. */
-static INLINE void
-swr_resource_read(struct pipe_resource *resource)
-{
-   struct swr_resource *res = swr_resource(resource);
-   res->status |= SWR_RESOURCE_READ;
-}
-
-/* Add the WRITE flag to the resource's status. */
-static INLINE void
-swr_resource_write(struct pipe_resource *resource)
-{
-   struct swr_resource *res = swr_resource(resource);
-   res->status |= SWR_RESOURCE_WRITE;
-}
-
-/* Reset the resource's status to UNUSED, dropping any READ/WRITE flags. */
-static INLINE void
-swr_resource_unused(struct pipe_resource *resource)
-{
-   struct swr_resource *res = swr_resource(resource);
-   res->status = SWR_RESOURCE_UNUSED;
-}
-
-#endif
diff --git a/src/gallium/drivers/swr/swr_scratch.cpp b/src/gallium/drivers/swr/swr_scratch.cpp
deleted file mode 100644
index 66f18365cc7..00000000000
--- a/src/gallium/drivers/swr/swr_scratch.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#include "util/u_memory.h"
-#include "swr_context.h"
-#include "swr_screen.h"
-#include "swr_scratch.h"
-#include "swr_fence.h"
-#include "swr_fence_work.h"
-#include "api.h"
-
-/*
- * Reserve `size` bytes in the scratch ring `space` and, when `user_buffer`
- * is non-NULL, copy `size` bytes from it into the reserved region.
- *
- * The ring is grown whenever size * ctx->max_draws_in_flight exceeds its
- * current size.  A previously allocated buffer is handed to the screen's
- * flush fence for deferred release, and a new one is allocated.  When the
- * reservation would reach the end of the ring, the head wraps to the start.
- *
- * Returns a pointer to the reserved region, or NULL if the ring had to grow
- * and the allocation failed (the ring is left empty in that case).
- */
-void *
-swr_copy_to_scratch_space(struct swr_context *ctx,
-                          struct swr_scratch_space *space,
-                          const void *user_buffer,
-                          unsigned int size)
-{
-   void *ptr;
-   assert(space);
-   assert(size);
-
-   /* Allocate enough so that MAX_DRAWS_IN_FLIGHT sets fit. */
-   uint32_t max_size_in_flight = size * ctx->max_draws_in_flight;
-
-   /* Need to grow space */
-   if (max_size_in_flight > space->current_size) {
-      if (space->base) {
-         /* defer delete, use aligned-free, fence finish enforces the defer
-          * delete will be on the *next* fence */
-         struct swr_screen *screen = swr_screen(ctx->pipe.screen);
-         swr_fence_finish(ctx->pipe.screen, NULL, screen->flush_fence, 0);
-         swr_fence_work_free(screen->flush_fence, space->base, true);
-         space->base = NULL;
-      }
-
-      space->base = (uint8_t *)AlignedMalloc(max_size_in_flight,
-                                             sizeof(void *));
-      space->head = (void *)space->base;
-
-      if (!space->base) {
-         /* Out of memory: leave the ring empty instead of recording a size
-          * that has no backing storage, and never memcpy into NULL. */
-         space->current_size = 0;
-         return NULL;
-      }
-
-      /* Only record the new size once the storage actually exists. */
-      space->current_size = max_size_in_flight;
-   }
-
-   /* Wrap */
-   if (((uint8_t *)space->head + size)
-       >= ((uint8_t *)space->base + space->current_size)) {
-      space->head = space->base;
-   }
-
-   ptr = space->head;
-   space->head = (uint8_t *)space->head + size;
-
-   /* Copy user_buffer to scratch */
-   if (user_buffer)
-      memcpy(ptr, user_buffer, size);
-
-   return ptr;
-}
-
-
-/* Allocate the context's zero-initialized scratch buffer bookkeeping.
- * ctx->scratch receives whatever CALLOC_STRUCT returns. */
-void
-swr_init_scratch_buffers(struct swr_context *ctx)
-{
-   ctx->scratch = CALLOC_STRUCT(swr_scratch_buffers);
-}
-
-/*
- * Free every scratch ring's backing buffer and then the bookkeeping struct
- * itself.  Does nothing when the context has no scratch buffers.
- */
-void
-swr_destroy_scratch_buffers(struct swr_context *ctx)
-{
-   struct swr_scratch_buffers *buffers = ctx->scratch;
-
-   if (!buffers)
-      return;
-
-   /* Same release order as the fields are declared. */
-   void *const bases[] = {
-      buffers->vs_constants.base,
-      buffers->fs_constants.base,
-      buffers->gs_constants.base,
-      buffers->tcs_constants.base,
-      buffers->tes_constants.base,
-      buffers->vertex_buffer.base,
-      buffers->index_buffer.base,
-   };
-
-   for (unsigned i = 0; i < sizeof(bases) / sizeof(bases[0]); i++)
-      AlignedFree(bases[i]);
-
-   FREE(buffers);
-}
diff --git a/src/gallium/drivers/swr/swr_scratch.h b/src/gallium/drivers/swr/swr_scratch.h
deleted file mode 100644
index 4d1c82fc6fc..00000000000
--- a/src/gallium/drivers/swr/swr_scratch.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#ifndef SWR_SCRATCH_H
-#define SWR_SCRATCH_H
-
-struct swr_scratch_space {
-   void *head;
-   unsigned int current_size;
-   /* TODO XXX: Add a fence for wrap condition. */
-
-   void *base;
-};
-
-struct swr_scratch_buffers {
-   struct swr_scratch_space vs_constants;
-   struct swr_scratch_space fs_constants;
-   struct swr_scratch_space gs_constants;
-   struct swr_scratch_space tcs_constants;
-   struct swr_scratch_space tes_constants;
-   struct swr_scratch_space vertex_buffer;
-   struct swr_scratch_space index_buffer;
-};
-
-
-/*
- * swr_copy_to_scratch_space
- * Copies size bytes of user_buffer into the scratch ring buffer.
- * Used to store temporary data such as client arrays and constants.
- *
- * Inputs:
- *   space ptr to scratch pool (vs_constants, fs_constants)
- *   user_buffer, data to copy into scratch space
- *   size to be copied
- * Returns:
- *   pointer to data copied to scratch space.
- */
-void *swr_copy_to_scratch_space(struct swr_context *ctx,
-                                struct swr_scratch_space *space,
-                                const void *user_buffer,
-                                unsigned int size);
-
-void swr_init_scratch_buffers(struct swr_context *ctx);
-void swr_destroy_scratch_buffers(struct swr_context *ctx);
-
-#endif
diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp
deleted file mode 100644
index 4c274fd86e5..00000000000
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ /dev/null
@@ -1,1155 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#include "swr_context.h"
-#include "swr_public.h"
-#include "swr_screen.h"
-#include "swr_resource.h"
-#include "swr_fence.h"
-#include "gen_knobs.h"
-
-#include "pipe/p_screen.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "util/format/u_format.h"
-#include "util/u_inlines.h"
-#include "util/u_cpu_detect.h"
-#include "util/format/u_format_s3tc.h"
-#include "util/u_string.h"
-#include "util/u_screen.h"
-
-#include "frontend/sw_winsys.h"
-
-#include "jit_api.h"
-
-#include "memory/TilingFunctions.h"
-
-#include <stdio.h>
-#include <map>
-
-/*
- * Max texture sizes
- * XXX Check max texture size values against core and sampler.
- */
-#define SWR_MAX_TEXTURE_SIZE (2 * 1024 * 1024 * 1024ULL) /* 2GB */
-/* Not all texture formats can fit into 2GB limit, but we have to
-   live with that. See lp_limits.h for more details */
-#define SWR_MAX_TEXTURE_2D_SIZE 16384
-#define SWR_MAX_TEXTURE_3D_LEVELS 12  /* 2K x 2K x 2K for now */
-#define SWR_MAX_TEXTURE_CUBE_LEVELS 14  /* 8K x 8K for now */
-#define SWR_MAX_TEXTURE_ARRAY_LAYERS 512 /* 8K x 512 / 8K x 8K x 512 */
-
-/* Default max client_copy_limit */
-#define SWR_CLIENT_COPY_LIMIT 8192
-
-/* Flag indicates creation of alternate surface, to prevent recursive loop
- * in resource creation when msaa_force_enable is set. */
-#define SWR_RESOURCE_FLAG_ALT_SURFACE (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
-
-
-static const char *
-swr_get_name(struct pipe_screen *screen)
-{
-   static char buf[100];
-   snprintf(buf, sizeof(buf), "SWR (LLVM " MESA_LLVM_VERSION_STRING ", %u bits)",
-            lp_native_vector_width);
-   return buf;
-}
-
-static const char *
-swr_get_vendor(struct pipe_screen *screen)
-{
-   return "Intel Corporation";
-}
-
-static bool
-swr_is_format_supported(struct pipe_screen *_screen,
-                        enum pipe_format format,
-                        enum pipe_texture_target target,
-                        unsigned sample_count,
-                        unsigned storage_sample_count,
-                        unsigned bind)
-{
-   struct swr_screen *screen = swr_screen(_screen);
-   struct sw_winsys *winsys = screen->winsys;
-   const struct util_format_description *format_desc;
-
-   assert(target == PIPE_BUFFER || target == PIPE_TEXTURE_1D
-          || target == PIPE_TEXTURE_1D_ARRAY
-          || target == PIPE_TEXTURE_2D
-          || target == PIPE_TEXTURE_2D_ARRAY
-          || target == PIPE_TEXTURE_RECT
-          || target == PIPE_TEXTURE_3D
-          || target == PIPE_TEXTURE_CUBE
-          || target == PIPE_TEXTURE_CUBE_ARRAY);
-
-   if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
-      return false;
-
-   format_desc = util_format_description(format);
-   if (!format_desc)
-      return false;
-
-   if ((sample_count > screen->msaa_max_count)
-      || !util_is_power_of_two_or_zero(sample_count))
-      return false;
-
-   if (bind & PIPE_BIND_DISPLAY_TARGET) {
-      if (!winsys->is_displaytarget_format_supported(winsys, bind, format))
-         return false;
-   }
-
-   if (bind & PIPE_BIND_RENDER_TARGET) {
-      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS)
-         return false;
-
-      if (mesa_to_swr_format(format) == (SWR_FORMAT)-1)
-         return false;
-
-      /*
-       * Although possible, it is unnatural to render into compressed or YUV
-       * surfaces. So disable these here to avoid going into weird paths
-       * inside gallium frontends.
-       */
-      if (format_desc->block.width != 1 || format_desc->block.height != 1)
-         return false;
-   }
-
-   if (bind & PIPE_BIND_DEPTH_STENCIL) {
-      if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
-         return false;
-
-      if (mesa_to_swr_format(format) == (SWR_FORMAT)-1)
-         return false;
-   }
-
-   if (bind & PIPE_BIND_VERTEX_BUFFER) {
-      if (mesa_to_swr_format(format) == (SWR_FORMAT)-1) {
-         return false;
-      }
-   }
-
-   if (format_desc->layout == UTIL_FORMAT_LAYOUT_ASTC ||
-       format_desc->layout == UTIL_FORMAT_LAYOUT_FXT1)
-   {
-      return false;
-   }
-
-   if (format_desc->layout == UTIL_FORMAT_LAYOUT_ETC &&
-       format != PIPE_FORMAT_ETC1_RGB8) {
-      return false;
-   }
-
-   if ((bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW)) &&
-       ((bind & PIPE_BIND_DISPLAY_TARGET) == 0)) {
-      /* Disable all 3-channel formats, where channel size != 32 bits.
-       * In some cases we run into crashes (in generate_unswizzled_blend()),
-       * for 3-channel RGB16 variants, there was an apparent LLVM bug.
-       * In any case, disabling the shallower 3-channel formats avoids a
-       * number of issues with GL_ARB_copy_image support.
-       */
-      if (format_desc->is_array &&
-          format_desc->nr_channels == 3 &&
-          format_desc->block.bits != 96) {
-         return false;
-      }
-   }
-
-   return TRUE;
-}
-
-static int
-swr_get_param(struct pipe_screen *screen, enum pipe_cap param)
-{
-   switch (param) {
-      /* limits */
-   case PIPE_CAP_MAX_RENDER_TARGETS:
-      return PIPE_MAX_COLOR_BUFS;
-   case PIPE_CAP_MAX_TEXTURE_2D_SIZE:
-      return SWR_MAX_TEXTURE_2D_SIZE;
-   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
-      return SWR_MAX_TEXTURE_3D_LEVELS;
-   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
-      return SWR_MAX_TEXTURE_CUBE_LEVELS;
-   case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
-      return MAX_SO_STREAMS;
-   case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
-   case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
-      return MAX_ATTRIBUTES * 4;
-   case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES:
-   case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
-      return 1024;
-   case PIPE_CAP_MAX_VERTEX_STREAMS:
-      return 4;
-   case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
-      return 2048;
-   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
-      return SWR_MAX_TEXTURE_ARRAY_LAYERS;
-   case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
-   case PIPE_CAP_MIN_TEXEL_OFFSET:
-      return -8;
-   case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
-   case PIPE_CAP_MAX_TEXEL_OFFSET:
-      return 7;
-   case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
-      return 4;
-   case PIPE_CAP_GLSL_FEATURE_LEVEL:
-      return 330;
-   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
-      return 140;
-   case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
-      return 16;
-   case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
-      return 64;
-   case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
-      return 65536;
-   case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
-      return 1;
-   case PIPE_CAP_MAX_VIEWPORTS:
-      return KNOB_NUM_VIEWPORTS_SCISSORS;
-   case PIPE_CAP_ENDIANNESS:
-      return PIPE_ENDIAN_NATIVE;
-
-      /* supported features */
-   case PIPE_CAP_NPOT_TEXTURES:
-   case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
-   case PIPE_CAP_MIXED_COLOR_DEPTH_BITS:
-   case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD:
-   case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES:
-   case PIPE_CAP_VERTEX_SHADER_SATURATE:
-   case PIPE_CAP_POINT_SPRITE:
-   case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
-   case PIPE_CAP_OCCLUSION_QUERY:
-   case PIPE_CAP_QUERY_TIME_ELAPSED:
-   case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
-   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
-   case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE:
-   case PIPE_CAP_TEXTURE_SWIZZLE:
-   case PIPE_CAP_BLEND_EQUATION_SEPARATE:
-   case PIPE_CAP_INDEP_BLEND_ENABLE:
-   case PIPE_CAP_INDEP_BLEND_FUNC:
-   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
-   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
-   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
-   case PIPE_CAP_DEPTH_CLIP_DISABLE:
-   case PIPE_CAP_PRIMITIVE_RESTART:
-   case PIPE_CAP_PRIMITIVE_RESTART_FIXED_INDEX:
-   case PIPE_CAP_TGSI_INSTANCEID:
-   case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
-   case PIPE_CAP_START_INSTANCE:
-   case PIPE_CAP_SEAMLESS_CUBE_MAP:
-   case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
-   case PIPE_CAP_CONDITIONAL_RENDER:
-   case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
-   case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
-   case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
-   case PIPE_CAP_USER_VERTEX_BUFFERS:
-   case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
-   case PIPE_CAP_QUERY_TIMESTAMP:
-   case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
-   case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
-   case PIPE_CAP_DRAW_INDIRECT:
-   case PIPE_CAP_UMA:
-   case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
-   case PIPE_CAP_CLIP_HALFZ:
-   case PIPE_CAP_POLYGON_OFFSET_CLAMP:
-   case PIPE_CAP_DEPTH_BOUNDS_TEST:
-   case PIPE_CAP_CLEAR_TEXTURE:
-   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
-   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
-   case PIPE_CAP_CULL_DISTANCE:
-   case PIPE_CAP_CUBE_MAP_ARRAY:
-   case PIPE_CAP_DOUBLES:
-   case PIPE_CAP_TEXTURE_QUERY_LOD:
-   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
-   case PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE:
-   case PIPE_CAP_QUERY_SO_OVERFLOW:
-   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
-   case PIPE_CAP_IMAGE_STORE_FORMATTED:
-      return 1;
-
-   case PIPE_CAP_SHAREABLE_SHADERS:
-      return 0;
-
-   /* MSAA support
-    * If user has explicitly set max_sample_count = 1 (via SWR_MSAA_MAX_COUNT)
-    * then disable all MSAA support and go back to old (FAKE_SW_MSAA) caps. */
-   case PIPE_CAP_TEXTURE_MULTISAMPLE:
-   case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
-      return (swr_screen(screen)->msaa_max_count > 1) ? 1 : 0;
-   case PIPE_CAP_FAKE_SW_MSAA:
-      return (swr_screen(screen)->msaa_max_count > 1) ? 0 : 1;
-
-   /* fetch jit change for 2-4GB buffers requires alignment */
-   case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
-   case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
-   case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
-      return 1;
-
-      /* unsupported features */
-   case PIPE_CAP_TEXTURE_TRANSFER_MODES:
-   case PIPE_CAP_PCI_GROUP:
-   case PIPE_CAP_PCI_BUS:
-   case PIPE_CAP_PCI_DEVICE:
-   case PIPE_CAP_PCI_FUNCTION:
-   case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
-      return 0;
-   case PIPE_CAP_MAX_GS_INVOCATIONS:
-      return 32;
-   case PIPE_CAP_MAX_SHADER_BUFFER_SIZE:
-      return 1 << 27;
-   case PIPE_CAP_MAX_VARYINGS:
-      return 32;
-
-   case PIPE_CAP_VENDOR_ID:
-      return 0xFFFFFFFF;
-   case PIPE_CAP_DEVICE_ID:
-      return 0xFFFFFFFF;
-   case PIPE_CAP_ACCELERATED:
-      return 0;
-   case PIPE_CAP_VIDEO_MEMORY: {
-      /* XXX: Do we want to return the full amount of system memory ? */
-      uint64_t system_memory;
-
-      if (!os_get_total_physical_memory(&system_memory))
-         return 0;
-
-      return (int)(system_memory >> 20);
-   }
-   default:
-      return u_pipe_screen_get_param_defaults(screen, param);
-   }
-}
-
-static int
-swr_get_shader_param(struct pipe_screen *screen,
-                     enum pipe_shader_type shader,
-                     enum pipe_shader_cap param)
-{
-   if (shader != PIPE_SHADER_VERTEX &&
-       shader != PIPE_SHADER_FRAGMENT &&
-       shader != PIPE_SHADER_GEOMETRY &&
-       shader != PIPE_SHADER_TESS_CTRL &&
-       shader != PIPE_SHADER_TESS_EVAL)
-      return 0;
-
-   if (param == PIPE_SHADER_CAP_MAX_SHADER_BUFFERS ||
-       param == PIPE_SHADER_CAP_MAX_SHADER_IMAGES) {
-      return 0;
-   }
-
-   return gallivm_get_shader_param(param);
-}
-
-
-static float
-swr_get_paramf(struct pipe_screen *screen, enum pipe_capf param)
-{
-   switch (param) {
-   case PIPE_CAPF_MIN_LINE_WIDTH:
-   case PIPE_CAPF_MIN_LINE_WIDTH_AA:
-   case PIPE_CAPF_MIN_POINT_SIZE:
-   case PIPE_CAPF_MIN_POINT_SIZE_AA:
-      return 1;
-   case PIPE_CAPF_POINT_SIZE_GRANULARITY:
-   case PIPE_CAPF_LINE_WIDTH_GRANULARITY:
-      return 0.1;
-   case PIPE_CAPF_MAX_LINE_WIDTH:
-   case PIPE_CAPF_MAX_LINE_WIDTH_AA:
-   case PIPE_CAPF_MAX_POINT_SIZE:
-      return 255.0; /* arbitrary */
-   case PIPE_CAPF_MAX_POINT_SIZE_AA:
-      return 0.0;
-   case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
-      return 0.0;
-   case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
-      return 16.0; /* arbitrary */
-   case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
-   case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
-   case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
-      return 0.0f;
-   }
-   /* should only get here on unhandled cases */
-   debug_printf("Unexpected PIPE_CAPF %d query\n", param);
-   return 0.0;
-}
-
-SWR_FORMAT
-mesa_to_swr_format(enum pipe_format format)
-{
-   static const std::map<pipe_format,SWR_FORMAT> mesa2swr = {
-      /* depth / stencil */
-      {PIPE_FORMAT_Z16_UNORM,              R16_UNORM}, // z
-      {PIPE_FORMAT_Z32_FLOAT,              R32_FLOAT}, // z
-      {PIPE_FORMAT_Z24_UNORM_S8_UINT,      R24_UNORM_X8_TYPELESS}, // z
-      {PIPE_FORMAT_Z24X8_UNORM,            R24_UNORM_X8_TYPELESS}, // z
-      {PIPE_FORMAT_Z32_FLOAT_S8X24_UINT,   R32_FLOAT_X8X24_TYPELESS}, // z
-
-      /* alpha */
-      {PIPE_FORMAT_A8_UNORM,               A8_UNORM},
-      {PIPE_FORMAT_A16_UNORM,              A16_UNORM},
-      {PIPE_FORMAT_A16_FLOAT,              A16_FLOAT},
-      {PIPE_FORMAT_A32_FLOAT,              A32_FLOAT},
-
-      /* odd sizes, bgr */
-      {PIPE_FORMAT_B5G6R5_UNORM,           B5G6R5_UNORM},
-      {PIPE_FORMAT_B5G6R5_SRGB,            B5G6R5_UNORM_SRGB},
-      {PIPE_FORMAT_B5G5R5A1_UNORM,         B5G5R5A1_UNORM},
-      {PIPE_FORMAT_B5G5R5X1_UNORM,         B5G5R5X1_UNORM},
-      {PIPE_FORMAT_B4G4R4A4_UNORM,         B4G4R4A4_UNORM},
-      {PIPE_FORMAT_B8G8R8A8_UNORM,         B8G8R8A8_UNORM},
-      {PIPE_FORMAT_B8G8R8A8_SRGB,          B8G8R8A8_UNORM_SRGB},
-      {PIPE_FORMAT_B8G8R8X8_UNORM,         B8G8R8X8_UNORM},
-      {PIPE_FORMAT_B8G8R8X8_SRGB,          B8G8R8X8_UNORM_SRGB},
-
-      /* rgb10a2 */
-      {PIPE_FORMAT_R10G10B10A2_UNORM,      R10G10B10A2_UNORM},
-      {PIPE_FORMAT_R10G10B10A2_SNORM,      R10G10B10A2_SNORM},
-      {PIPE_FORMAT_R10G10B10A2_USCALED,    R10G10B10A2_USCALED},
-      {PIPE_FORMAT_R10G10B10A2_SSCALED,    R10G10B10A2_SSCALED},
-      {PIPE_FORMAT_R10G10B10A2_UINT,       R10G10B10A2_UINT},
-
-      /* rgb10x2 */
-      {PIPE_FORMAT_R10G10B10X2_USCALED,    R10G10B10X2_USCALED},
-
-      /* bgr10a2 */
-      {PIPE_FORMAT_B10G10R10A2_UNORM,      B10G10R10A2_UNORM},
-      {PIPE_FORMAT_B10G10R10A2_SNORM,      B10G10R10A2_SNORM},
-      {PIPE_FORMAT_B10G10R10A2_USCALED,    B10G10R10A2_USCALED},
-      {PIPE_FORMAT_B10G10R10A2_SSCALED,    B10G10R10A2_SSCALED},
-      {PIPE_FORMAT_B10G10R10A2_UINT,       B10G10R10A2_UINT},
-
-      /* bgr10x2 */
-      {PIPE_FORMAT_B10G10R10X2_UNORM,      B10G10R10X2_UNORM},
-
-      /* r11g11b10 */
-      {PIPE_FORMAT_R11G11B10_FLOAT,        R11G11B10_FLOAT},
-
-      /* 32 bits per component */
-      {PIPE_FORMAT_R32_FLOAT,              R32_FLOAT},
-      {PIPE_FORMAT_R32G32_FLOAT,           R32G32_FLOAT},
-      {PIPE_FORMAT_R32G32B32_FLOAT,        R32G32B32_FLOAT},
-      {PIPE_FORMAT_R32G32B32A32_FLOAT,     R32G32B32A32_FLOAT},
-      {PIPE_FORMAT_R32G32B32X32_FLOAT,     R32G32B32X32_FLOAT},
-
-      {PIPE_FORMAT_R32_USCALED,            R32_USCALED},
-      {PIPE_FORMAT_R32G32_USCALED,         R32G32_USCALED},
-      {PIPE_FORMAT_R32G32B32_USCALED,      R32G32B32_USCALED},
-      {PIPE_FORMAT_R32G32B32A32_USCALED,   R32G32B32A32_USCALED},
-
-      {PIPE_FORMAT_R32_SSCALED,            R32_SSCALED},
-      {PIPE_FORMAT_R32G32_SSCALED,         R32G32_SSCALED},
-      {PIPE_FORMAT_R32G32B32_SSCALED,      R32G32B32_SSCALED},
-      {PIPE_FORMAT_R32G32B32A32_SSCALED,   R32G32B32A32_SSCALED},
-
-      {PIPE_FORMAT_R32_UINT,               R32_UINT},
-      {PIPE_FORMAT_R32G32_UINT,            R32G32_UINT},
-      {PIPE_FORMAT_R32G32B32_UINT,         R32G32B32_UINT},
-      {PIPE_FORMAT_R32G32B32A32_UINT,      R32G32B32A32_UINT},
-
-      {PIPE_FORMAT_R32_SINT,               R32_SINT},
-      {PIPE_FORMAT_R32G32_SINT,            R32G32_SINT},
-      {PIPE_FORMAT_R32G32B32_SINT,         R32G32B32_SINT},
-      {PIPE_FORMAT_R32G32B32A32_SINT,      R32G32B32A32_SINT},
-
-      /* 16 bits per component */
-      {PIPE_FORMAT_R16_UNORM,              R16_UNORM},
-      {PIPE_FORMAT_R16G16_UNORM,           R16G16_UNORM},
-      {PIPE_FORMAT_R16G16B16_UNORM,        R16G16B16_UNORM},
-      {PIPE_FORMAT_R16G16B16A16_UNORM,     R16G16B16A16_UNORM},
-      {PIPE_FORMAT_R16G16B16X16_UNORM,     R16G16B16X16_UNORM},
-
-      {PIPE_FORMAT_R16_USCALED,            R16_USCALED},
-      {PIPE_FORMAT_R16G16_USCALED,         R16G16_USCALED},
-      {PIPE_FORMAT_R16G16B16_USCALED,      R16G16B16_USCALED},
-      {PIPE_FORMAT_R16G16B16A16_USCALED,   R16G16B16A16_USCALED},
-
-      {PIPE_FORMAT_R16_SNORM,              R16_SNORM},
-      {PIPE_FORMAT_R16G16_SNORM,           R16G16_SNORM},
-      {PIPE_FORMAT_R16G16B16_SNORM,        R16G16B16_SNORM},
-      {PIPE_FORMAT_R16G16B16A16_SNORM,     R16G16B16A16_SNORM},
-
-      {PIPE_FORMAT_R16_SSCALED,            R16_SSCALED},
-      {PIPE_FORMAT_R16G16_SSCALED,         R16G16_SSCALED},
-      {PIPE_FORMAT_R16G16B16_SSCALED,      R16G16B16_SSCALED},
-      {PIPE_FORMAT_R16G16B16A16_SSCALED,   R16G16B16A16_SSCALED},
-
-      {PIPE_FORMAT_R16_UINT,               R16_UINT},
-      {PIPE_FORMAT_R16G16_UINT,            R16G16_UINT},
-      {PIPE_FORMAT_R16G16B16_UINT,         R16G16B16_UINT},
-      {PIPE_FORMAT_R16G16B16A16_UINT,      R16G16B16A16_UINT},
-
-      {PIPE_FORMAT_R16_SINT,               R16_SINT},
-      {PIPE_FORMAT_R16G16_SINT,            R16G16_SINT},
-      {PIPE_FORMAT_R16G16B16_SINT,         R16G16B16_SINT},
-      {PIPE_FORMAT_R16G16B16A16_SINT,      R16G16B16A16_SINT},
-
-      {PIPE_FORMAT_R16_FLOAT,              R16_FLOAT},
-      {PIPE_FORMAT_R16G16_FLOAT,           R16G16_FLOAT},
-      {PIPE_FORMAT_R16G16B16_FLOAT,        R16G16B16_FLOAT},
-      {PIPE_FORMAT_R16G16B16A16_FLOAT,     R16G16B16A16_FLOAT},
-      {PIPE_FORMAT_R16G16B16X16_FLOAT,     R16G16B16X16_FLOAT},
-
-      /* 8 bits per component */
-      {PIPE_FORMAT_R8_UNORM,               R8_UNORM},
-      {PIPE_FORMAT_R8G8_UNORM,             R8G8_UNORM},
-      {PIPE_FORMAT_R8G8B8_UNORM,           R8G8B8_UNORM},
-      {PIPE_FORMAT_R8G8B8_SRGB,            R8G8B8_UNORM_SRGB},
-      {PIPE_FORMAT_R8G8B8A8_UNORM,         R8G8B8A8_UNORM},
-      {PIPE_FORMAT_R8G8B8A8_SRGB,          R8G8B8A8_UNORM_SRGB},
-      {PIPE_FORMAT_R8G8B8X8_UNORM,         R8G8B8X8_UNORM},
-      {PIPE_FORMAT_R8G8B8X8_SRGB,          R8G8B8X8_UNORM_SRGB},
-
-      {PIPE_FORMAT_R8_USCALED,             R8_USCALED},
-      {PIPE_FORMAT_R8G8_USCALED,           R8G8_USCALED},
-      {PIPE_FORMAT_R8G8B8_USCALED,         R8G8B8_USCALED},
-      {PIPE_FORMAT_R8G8B8A8_USCALED,       R8G8B8A8_USCALED},
-
-      {PIPE_FORMAT_R8_SNORM,               R8_SNORM},
-      {PIPE_FORMAT_R8G8_SNORM,             R8G8_SNORM},
-      {PIPE_FORMAT_R8G8B8_SNORM,           R8G8B8_SNORM},
-      {PIPE_FORMAT_R8G8B8A8_SNORM,         R8G8B8A8_SNORM},
-
-      {PIPE_FORMAT_R8_SSCALED,             R8_SSCALED},
-      {PIPE_FORMAT_R8G8_SSCALED,           R8G8_SSCALED},
-      {PIPE_FORMAT_R8G8B8_SSCALED,         R8G8B8_SSCALED},
-      {PIPE_FORMAT_R8G8B8A8_SSCALED,       R8G8B8A8_SSCALED},
-
-      {PIPE_FORMAT_R8_UINT,                R8_UINT},
-      {PIPE_FORMAT_R8G8_UINT,              R8G8_UINT},
-      {PIPE_FORMAT_R8G8B8_UINT,            R8G8B8_UINT},
-      {PIPE_FORMAT_R8G8B8A8_UINT,          R8G8B8A8_UINT},
-
-      {PIPE_FORMAT_R8_SINT,                R8_SINT},
-      {PIPE_FORMAT_R8G8_SINT,              R8G8_SINT},
-      {PIPE_FORMAT_R8G8B8_SINT,            R8G8B8_SINT},
-      {PIPE_FORMAT_R8G8B8A8_SINT,          R8G8B8A8_SINT},
-
-      /* These formats are valid for vertex data, but should not be used
-       * for render targets.
-       */
-
-      {PIPE_FORMAT_R32_FIXED,              R32_SFIXED},
-      {PIPE_FORMAT_R32G32_FIXED,           R32G32_SFIXED},
-      {PIPE_FORMAT_R32G32B32_FIXED,        R32G32B32_SFIXED},
-      {PIPE_FORMAT_R32G32B32A32_FIXED,     R32G32B32A32_SFIXED},
-
-      {PIPE_FORMAT_R64_FLOAT,              R64_FLOAT},
-      {PIPE_FORMAT_R64G64_FLOAT,           R64G64_FLOAT},
-      {PIPE_FORMAT_R64G64B64_FLOAT,        R64G64B64_FLOAT},
-      {PIPE_FORMAT_R64G64B64A64_FLOAT,     R64G64B64A64_FLOAT},
-
-      /* These formats have entries in SWR but don't have Load/StoreTile
-       * implementations. That means these aren't renderable, and thus having
-       * a mapping entry here is detrimental.
-       */
-      /*
-
-      {PIPE_FORMAT_L8_UNORM,               L8_UNORM},
-      {PIPE_FORMAT_I8_UNORM,               I8_UNORM},
-      {PIPE_FORMAT_L8A8_UNORM,             L8A8_UNORM},
-      {PIPE_FORMAT_L16_UNORM,              L16_UNORM},
-      {PIPE_FORMAT_UYVY,                   YCRCB_SWAPUVY},
-
-      {PIPE_FORMAT_L8_SRGB,                L8_UNORM_SRGB},
-      {PIPE_FORMAT_L8A8_SRGB,              L8A8_UNORM_SRGB},
-
-      {PIPE_FORMAT_DXT1_RGBA,              BC1_UNORM},
-      {PIPE_FORMAT_DXT3_RGBA,              BC2_UNORM},
-      {PIPE_FORMAT_DXT5_RGBA,              BC3_UNORM},
-
-      {PIPE_FORMAT_DXT1_SRGBA,             BC1_UNORM_SRGB},
-      {PIPE_FORMAT_DXT3_SRGBA,             BC2_UNORM_SRGB},
-      {PIPE_FORMAT_DXT5_SRGBA,             BC3_UNORM_SRGB},
-
-      {PIPE_FORMAT_RGTC1_UNORM,            BC4_UNORM},
-      {PIPE_FORMAT_RGTC1_SNORM,            BC4_SNORM},
-      {PIPE_FORMAT_RGTC2_UNORM,            BC5_UNORM},
-      {PIPE_FORMAT_RGTC2_SNORM,            BC5_SNORM},
-
-      {PIPE_FORMAT_L16A16_UNORM,           L16A16_UNORM},
-      {PIPE_FORMAT_I16_UNORM,              I16_UNORM},
-      {PIPE_FORMAT_L16_FLOAT,              L16_FLOAT},
-      {PIPE_FORMAT_L16A16_FLOAT,           L16A16_FLOAT},
-      {PIPE_FORMAT_I16_FLOAT,              I16_FLOAT},
-      {PIPE_FORMAT_L32_FLOAT,              L32_FLOAT},
-      {PIPE_FORMAT_L32A32_FLOAT,           L32A32_FLOAT},
-      {PIPE_FORMAT_I32_FLOAT,              I32_FLOAT},
-
-      {PIPE_FORMAT_I8_UINT,                I8_UINT},
-      {PIPE_FORMAT_L8_UINT,                L8_UINT},
-      {PIPE_FORMAT_L8A8_UINT,              L8A8_UINT},
-
-      {PIPE_FORMAT_I8_SINT,                I8_SINT},
-      {PIPE_FORMAT_L8_SINT,                L8_SINT},
-      {PIPE_FORMAT_L8A8_SINT,              L8A8_SINT},
-
-      */
-   };
-
-   auto it = mesa2swr.find(format);
-   if (it == mesa2swr.end())
-      return (SWR_FORMAT)-1;
-   else
-      return it->second;
-}
-
-static bool
-swr_displaytarget_layout(struct swr_screen *screen, struct swr_resource *res)
-{
-   struct sw_winsys *winsys = screen->winsys;
-   struct sw_displaytarget *dt;
-
-   const unsigned width = align(res->swr.width, res->swr.halign);
-   const unsigned height = align(res->swr.height, res->swr.valign);
-
-   UINT stride;
-   dt = winsys->displaytarget_create(winsys,
-                                     res->base.bind,
-                                     res->base.format,
-                                     width, height,
-                                     64, NULL,
-                                     &stride);
-
-   if (dt == NULL)
-      return false;
-
-   void *map = winsys->displaytarget_map(winsys, dt, 0);
-
-   res->display_target = dt;
-   res->swr.xpBaseAddress = (gfxptr_t)map;
-
-   /* Clear the display target surface */
-   if (map)
-      memset(map, 0, height * stride);
-
-   winsys->displaytarget_unmap(winsys, dt);
-
-   return true;
-}
-
-static bool
-swr_texture_layout(struct swr_screen *screen,
-                   struct swr_resource *res,
-                   bool allocate)
-{
-   struct pipe_resource *pt = &res->base;
-
-   pipe_format fmt = pt->format;
-   const struct util_format_description *desc = util_format_description(fmt);
-
-   res->has_depth = util_format_has_depth(desc);
-   res->has_stencil = util_format_has_stencil(desc);
-
-   if (res->has_stencil && !res->has_depth)
-      fmt = PIPE_FORMAT_R8_UINT;
-
-   /* We always use the SWR layout. For 2D and 3D textures this looks like:
-    *
-    * |<------- pitch ------->|
-    * +=======================+-------
-    * |Array 0                |   ^
-    * |                       |   |
-    * |        Level 0        |   |
-    * |                       |   |
-    * |                       | qpitch
-    * +-----------+-----------+   |
-    * |           | L2L2L2L2  |   |
-    * |  Level 1  | L3L3      |   |
-    * |           | L4        |   v
-    * +===========+===========+-------
-    * |Array 1                |
-    * |                       |
-    * |        Level 0        |
-    * |                       |
-    * |                       |
-    * +-----------+-----------+
-    * |           | L2L2L2L2  |
-    * |  Level 1  | L3L3      |
-    * |           | L4        |
-    * +===========+===========+
-    *
-    * The overall width in bytes is known as the pitch, while the overall
-    * height in rows is the qpitch. Array slices are laid out logically below
-    * one another, qpitch rows apart. For 3D surfaces, the "level" values are
-    * just invalid for the higher array numbers (since depth is also
-    * minified). 1D and 1D array surfaces are stored effectively the same way,
-    * except that pitch never plays into it. All the levels are logically
-    * adjacent to each other on the X axis. The qpitch becomes the number of
-    * elements between array slices, while the pitch is unused.
-    *
-    * Each level's sizes are subject to the valign and halign settings of the
-    * surface. For compressed formats that swr is unaware of, we will use an
-    * appropriately-sized uncompressed format, and scale the widths/heights.
-    *
-    * This surface is stored inside res->swr. For depth/stencil textures,
-    * res->secondary will have an identically-laid-out but R8_UINT-formatted
-    * stencil tree. In the Z32F_S8 case, the primary surface still has 64-bpp
-    * texels, to simplify map/unmap logic which copies the stencil values
-    * in/out.
-    */
-
-   res->swr.width = pt->width0;
-   res->swr.height = pt->height0;
-   res->swr.type = swr_convert_target_type(pt->target);
-   res->swr.tileMode = SWR_TILE_NONE;
-   res->swr.format = mesa_to_swr_format(fmt);
-   res->swr.numSamples = std::max(1u, pt->nr_samples);
-
-   if (pt->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL)) {
-      res->swr.halign = KNOB_MACROTILE_X_DIM;
-      res->swr.valign = KNOB_MACROTILE_Y_DIM;
-
-      /* If SWR_MSAA_FORCE_ENABLE is set, turn on MSAA and override requested
-       * surface sample count. */
-      if (screen->msaa_force_enable) {
-         res->swr.numSamples = screen->msaa_max_count;
-         swr_print_info("swr_texture_layout: forcing sample count: %d\n",
-                 res->swr.numSamples);
-      }
-   } else {
-      res->swr.halign = 1;
-      res->swr.valign = 1;
-   }
-
-   unsigned halign = res->swr.halign * util_format_get_blockwidth(fmt);
-   unsigned width = align(pt->width0, halign);
-   if (pt->target == PIPE_TEXTURE_1D || pt->target == PIPE_TEXTURE_1D_ARRAY) {
-      for (int level = 1; level <= pt->last_level; level++)
-         width += align(u_minify(pt->width0, level), halign);
-      res->swr.pitch = util_format_get_blocksize(fmt);
-      res->swr.qpitch = util_format_get_nblocksx(fmt, width);
-   } else {
-      // The pitch is the overall width of the texture in bytes. Most of the
-      // time this is the pitch of level 0 since all the other levels fit
-      // underneath it. However in some degenerate situations, the width of
-      // level1 + level2 may be larger. In that case, we use those
-      // widths. This can happen if, e.g. halign is 32, and the width of level
-      // 0 is 32 or less. In that case, the aligned levels 1 and 2 will also
-      // be 32 each, adding up to 64.
-      unsigned valign = res->swr.valign * util_format_get_blockheight(fmt);
-      if (pt->last_level > 1) {
-         width = std::max<uint32_t>(
-               width,
-               align(u_minify(pt->width0, 1), halign) +
-               align(u_minify(pt->width0, 2), halign));
-      }
-      res->swr.pitch = util_format_get_stride(fmt, width);
-
-      // The qpitch is controlled by either the height of the second LOD, or
-      // the combination of all the later LODs.
-      unsigned height = align(pt->height0, valign);
-      if (pt->last_level == 1) {
-         height += align(u_minify(pt->height0, 1), valign);
-      } else if (pt->last_level > 1) {
-         unsigned level1 = align(u_minify(pt->height0, 1), valign);
-         unsigned level2 = 0;
-         for (int level = 2; level <= pt->last_level; level++) {
-            level2 += align(u_minify(pt->height0, level), valign);
-         }
-         height += std::max(level1, level2);
-      }
-      res->swr.qpitch = util_format_get_nblocksy(fmt, height);
-   }
-
-   if (pt->target == PIPE_TEXTURE_3D)
-      res->swr.depth = pt->depth0;
-   else
-      res->swr.depth = pt->array_size;
-
-   // Fix up swr format if necessary so that LOD offset computation works
-   if (res->swr.format == (SWR_FORMAT)-1) {
-      switch (util_format_get_blocksize(fmt)) {
-      default:
-         unreachable("Unexpected format block size");
-      case 1: res->swr.format = R8_UINT; break;
-      case 2: res->swr.format = R16_UINT; break;
-      case 4: res->swr.format = R32_UINT; break;
-      case 8:
-         if (util_format_is_compressed(fmt))
-            res->swr.format = BC4_UNORM;
-         else
-            res->swr.format = R32G32_UINT;
-         break;
-      case 16:
-         if (util_format_is_compressed(fmt))
-            res->swr.format = BC5_UNORM;
-         else
-            res->swr.format = R32G32B32A32_UINT;
-         break;
-      }
-   }
-
-   for (int level = 0; level <= pt->last_level; level++) {
-      res->mip_offsets[level] =
-         ComputeSurfaceOffset<false>(0, 0, 0, 0, 0, level, &res->swr);
-   }
-
-   size_t total_size = (uint64_t)res->swr.depth * res->swr.qpitch *
-                                 res->swr.pitch * res->swr.numSamples;
-
-   // Let non-sampled textures (e.g. buffer objects) bypass the size limit
-   if (swr_resource_is_texture(&res->base) && total_size > SWR_MAX_TEXTURE_SIZE)
-      return false;
-
-   if (allocate) {
-      res->swr.xpBaseAddress = (gfxptr_t)AlignedMalloc(total_size, 64);
-      if (!res->swr.xpBaseAddress)
-         return false;
-
-      if (res->has_depth && res->has_stencil) {
-         res->secondary = res->swr;
-         res->secondary.format = R8_UINT;
-         res->secondary.pitch = res->swr.pitch / util_format_get_blocksize(fmt);
-
-         for (int level = 0; level <= pt->last_level; level++) {
-            res->secondary_mip_offsets[level] =
-               ComputeSurfaceOffset<false>(0, 0, 0, 0, 0, level, &res->secondary);
-         }
-
-         total_size = res->secondary.depth * res->secondary.qpitch *
-                      res->secondary.pitch * res->secondary.numSamples;
-
-         res->secondary.xpBaseAddress = (gfxptr_t) AlignedMalloc(total_size, 64);
-         if (!res->secondary.xpBaseAddress) {
-            AlignedFree((void *)res->swr.xpBaseAddress);
-            return false;
-         }
-      }
-   }
-
-   return true;
-}
-
-static bool
-swr_can_create_resource(struct pipe_screen *screen,
-                        const struct pipe_resource *templat)
-{
-   struct swr_resource res;
-   memset(&res, 0, sizeof(res));
-   res.base = *templat;
-   return swr_texture_layout(swr_screen(screen), &res, false);
-}
-
-/* Helper function that conditionally creates a single-sample resolve resource
- * and attaches it to main multisample resource. */
-static bool
-swr_create_resolve_resource(struct pipe_screen *_screen,
-                            struct swr_resource *msaa_res)
-{
-   struct swr_screen *screen = swr_screen(_screen);
-
-   /* If resource is multisample, create a single-sample resolve resource */
-   if (msaa_res->base.nr_samples > 1 || (screen->msaa_force_enable &&
-            !(msaa_res->base.flags & SWR_RESOURCE_FLAG_ALT_SURFACE))) {
-
-      /* Create a single-sample copy of the resource.  Copy the original
-       * resource parameters and set flag to prevent recursion when re-calling
-       * resource_create */
-      struct pipe_resource alt_template = msaa_res->base;
-      alt_template.nr_samples = 0;
-      alt_template.flags |= SWR_RESOURCE_FLAG_ALT_SURFACE;
-
-      /* Note: Display_target is a special single-sample resource, only the
-       * display_target has been created already. */
-      if (msaa_res->base.bind & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT
-               | PIPE_BIND_SHARED)) {
-         /* Allocate the multisample buffers. */
-         if (!swr_texture_layout(screen, msaa_res, true))
-            return false;
-
-         /* Alt resource will only be bound as PIPE_BIND_RENDER_TARGET
-          * remove the DISPLAY_TARGET, SCANOUT, and SHARED bindings */
-         alt_template.bind = PIPE_BIND_RENDER_TARGET;
-      }
-
-      /* Allocate single-sample resolve surface */
-      struct pipe_resource *alt;
-      alt = _screen->resource_create(_screen, &alt_template);
-      if (!alt)
-         return false;
-
-      /* Attach it to the multisample resource */
-      msaa_res->resolve_target = alt;
-
-      /* Hang resolve surface state off the multisample surface state to so
-       * StoreTiles knows where to resolve the surface. */
-      msaa_res->swr.xpAuxBaseAddress = (gfxptr_t)&swr_resource(alt)->swr;
-   }
-
-   return true; /* success */
-}
-
-static struct pipe_resource *
-swr_resource_create(struct pipe_screen *_screen,
-                    const struct pipe_resource *templat)
-{
-   struct swr_screen *screen = swr_screen(_screen);
-   struct swr_resource *res = CALLOC_STRUCT(swr_resource);
-   if (!res)
-      return NULL;
-
-   res->base = *templat;
-   pipe_reference_init(&res->base.reference, 1);
-   res->base.screen = &screen->base;
-
-   if (swr_resource_is_texture(&res->base)) {
-      if (res->base.bind & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT
-                            | PIPE_BIND_SHARED)) {
-         /* displayable surface
-          * first call swr_texture_layout without allocating to finish
-          * filling out the SWR_SURFACE_STATE in res */
-         swr_texture_layout(screen, res, false);
-         if (!swr_displaytarget_layout(screen, res))
-            goto fail;
-      } else {
-         /* texture map */
-         if (!swr_texture_layout(screen, res, true))
-            goto fail;
-      }
-
-      /* If resource was multisample, create resolve resource and attach
-       * it to multisample resource. */
-      if (!swr_create_resolve_resource(_screen, res))
-            goto fail;
-
-   } else {
-      /* other data (vertex buffer, const buffer, etc) */
-      assert(util_format_get_blocksize(templat->format) == 1);
-      assert(templat->height0 == 1);
-      assert(templat->depth0 == 1);
-      assert(templat->last_level == 0);
-
-      /* Easiest to just call swr_texture_layout, as it sets up
-       * SWR_SURFACE_STATE in res */
-      if (!swr_texture_layout(screen, res, true))
-         goto fail;
-   }
-
-   return &res->base;
-
-fail:
-   FREE(res);
-   return NULL;
-}
-
-static void
-swr_resource_destroy(struct pipe_screen *p_screen, struct pipe_resource *pt)
-{
-   struct swr_screen *screen = swr_screen(p_screen);
-   struct swr_resource *spr = swr_resource(pt);
-
-   if (spr->display_target) {
-      /* If resource is display target, winsys manages the buffer and will
-       * free it on displaytarget_destroy. */
-      swr_fence_finish(p_screen, NULL, screen->flush_fence, 0);
-
-      struct sw_winsys *winsys = screen->winsys;
-      winsys->displaytarget_destroy(winsys, spr->display_target);
-
-      if (spr->swr.numSamples > 1) {
-         /* Free an attached resolve resource */
-         struct swr_resource *alt = swr_resource(spr->resolve_target);
-         swr_fence_work_free(screen->flush_fence, (void*)(alt->swr.xpBaseAddress), true);
-
-         /* Free multisample buffer */
-         swr_fence_work_free(screen->flush_fence, (void*)(spr->swr.xpBaseAddress), true);
-      }
-   } else {
-      /* For regular resources, defer deletion */
-      swr_resource_unused(pt);
-
-      if (spr->swr.numSamples > 1) {
-         /* Free an attached resolve resource */
-         struct swr_resource *alt = swr_resource(spr->resolve_target);
-         swr_fence_work_free(screen->flush_fence, (void*)(alt->swr.xpBaseAddress), true);
-      }
-
-      swr_fence_work_free(screen->flush_fence, (void*)(spr->swr.xpBaseAddress), true);
-      swr_fence_work_free(screen->flush_fence,
-                          (void*)(spr->secondary.xpBaseAddress), true);
-
-      /* If work queue grows too large, submit a fence to force queue to
-       * drain.  This is mainly to decrease the amount of memory used by the
-       * piglit streaming-texture-leak test */
-      if (screen->pipe && swr_fence(screen->flush_fence)->work.count > 64)
-         swr_fence_submit(swr_context(screen->pipe), screen->flush_fence);
-   }
-
-   FREE(spr);
-}
-
-
-static void
-swr_flush_frontbuffer(struct pipe_screen *p_screen,
-                      struct pipe_context *pipe,
-                      struct pipe_resource *resource,
-                      unsigned level,
-                      unsigned layer,
-                      void *context_private,
-                      struct pipe_box *sub_box)
-{
-   struct swr_screen *screen = swr_screen(p_screen);
-   struct sw_winsys *winsys = screen->winsys;
-   struct swr_resource *spr = swr_resource(resource);
-   struct swr_context *ctx = swr_context(pipe);
-
-   if (pipe) {
-      swr_fence_finish(p_screen, NULL, screen->flush_fence, 0);
-      swr_resource_unused(resource);
-      ctx->api.pfnSwrEndFrame(ctx->swrContext);
-   }
-
-   /* Multisample resolved into resolve_target at flush with store_resource */
-   if (pipe && spr->swr.numSamples > 1) {
-      struct pipe_resource *resolve_target = spr->resolve_target;
-
-      /* Once resolved, copy into display target */
-      SWR_SURFACE_STATE *resolve = &swr_resource(resolve_target)->swr;
-
-      void *map = winsys->displaytarget_map(winsys, spr->display_target,
-                                            PIPE_MAP_WRITE);
-      memcpy(map, (void*)(resolve->xpBaseAddress), resolve->pitch * resolve->height);
-      winsys->displaytarget_unmap(winsys, spr->display_target);
-   }
-
-   debug_assert(spr->display_target);
-   if (spr->display_target)
-      winsys->displaytarget_display(
-         winsys, spr->display_target, context_private, sub_box);
-}
-
-
-void
-swr_destroy_screen_internal(struct swr_screen **screen)
-{
-   struct pipe_screen *p_screen = &(*screen)->base;
-
-   swr_fence_finish(p_screen, NULL, (*screen)->flush_fence, 0);
-   swr_fence_reference(p_screen, &(*screen)->flush_fence, NULL);
-
-   JitDestroyContext((*screen)->hJitMgr);
-
-   if ((*screen)->pLibrary)
-      util_dl_close((*screen)->pLibrary);
-
-   FREE(*screen);
-   *screen = NULL;
-}
-
-
-static void
-swr_destroy_screen(struct pipe_screen *p_screen)
-{
-   struct swr_screen *screen = swr_screen(p_screen);
-   struct sw_winsys *winsys = screen->winsys;
-
-   swr_print_info("SWR destroy screen!\n");
-
-   if (winsys->destroy)
-      winsys->destroy(winsys);
-
-   swr_destroy_screen_internal(&screen);
-}
-
-
-static void
-swr_validate_env_options(struct swr_screen *screen)
-{
-   /* The client_copy_limit sets a maximum on the amount of user-buffer memory
-    * copied to scratch space on a draw.  Past this, the draw will access
-    * user-buffer directly and then block.  This is faster than queuing many
-    * large client draws. */
-   screen->client_copy_limit = SWR_CLIENT_COPY_LIMIT;
-   int client_copy_limit =
-      debug_get_num_option("SWR_CLIENT_COPY_LIMIT", SWR_CLIENT_COPY_LIMIT);
-   if (client_copy_limit > 0)
-      screen->client_copy_limit = client_copy_limit;
-
-   /* XXX msaa under development, disable by default for now */
-   screen->msaa_max_count = 1; /* was SWR_MAX_NUM_MULTISAMPLES; */
-
-   /* validate env override values, within range and power of 2 */
-   int msaa_max_count = debug_get_num_option("SWR_MSAA_MAX_COUNT", 1);
-   if (msaa_max_count != 1) {
-      if ((msaa_max_count < 1) || (msaa_max_count > SWR_MAX_NUM_MULTISAMPLES)
-            || !util_is_power_of_two_or_zero(msaa_max_count)) {
-         fprintf(stderr, "SWR_MSAA_MAX_COUNT invalid: %d\n", msaa_max_count);
-         fprintf(stderr, "must be power of 2 between 1 and %d" \
-                         " (or 1 to disable msaa)\n",
-               SWR_MAX_NUM_MULTISAMPLES);
-         fprintf(stderr, "(msaa disabled)\n");
-         msaa_max_count = 1;
-      }
-
-      swr_print_info("SWR_MSAA_MAX_COUNT: %d\n", msaa_max_count);
-
-      screen->msaa_max_count = msaa_max_count;
-   }
-
-   screen->msaa_force_enable = debug_get_bool_option(
-         "SWR_MSAA_FORCE_ENABLE", false);
-   if (screen->msaa_force_enable)
-      swr_print_info("SWR_MSAA_FORCE_ENABLE: true\n");
-}
-
-
-struct pipe_screen *
-swr_create_screen_internal(struct sw_winsys *winsys)
-{
-   struct swr_screen *screen = CALLOC_STRUCT(swr_screen);
-
-   if (!screen)
-      return NULL;
-
-   if (!lp_build_init()) {
-      FREE(screen);
-      return NULL;
-   }
-
-   screen->winsys = winsys;
-   screen->base.get_name = swr_get_name;
-   screen->base.get_vendor = swr_get_vendor;
-   screen->base.is_format_supported = swr_is_format_supported;
-   screen->base.context_create = swr_create_context;
-   screen->base.can_create_resource = swr_can_create_resource;
-
-   screen->base.destroy = swr_destroy_screen;
-   screen->base.get_param = swr_get_param;
-   screen->base.get_shader_param = swr_get_shader_param;
-   screen->base.get_paramf = swr_get_paramf;
-
-   screen->base.resource_create = swr_resource_create;
-   screen->base.resource_destroy = swr_resource_destroy;
-
-   screen->base.flush_frontbuffer = swr_flush_frontbuffer;
-
-   // Pass in "" for architecture for run-time determination
-   screen->hJitMgr = JitCreateContext(KNOB_SIMD_WIDTH, "", "swr");
-
-   swr_fence_init(&screen->base);
-
-   swr_validate_env_options(screen);
-
-   return &screen->base;
-}
diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h
deleted file mode 100644
index e66f5443357..00000000000
--- a/src/gallium/drivers/swr/swr_screen.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#ifndef SWR_SCREEN_H
-#define SWR_SCREEN_H
-
-#include "swr_resource.h"
-
-#include "pipe/p_screen.h"
-#include "pipe/p_defines.h"
-#include "util/u_dl.h"
-#include "util/format/u_format.h"
-#include "api.h"
-
-#include "memory/TilingFunctions.h"
-#include "memory/InitMemory.h"
-#include <stdio.h>
-#include <stdarg.h>
-
-struct sw_winsys;
-
-struct swr_screen {
-   struct pipe_screen base;
-   struct pipe_context *pipe;
-
-   struct pipe_fence_handle *flush_fence;
-
-   struct sw_winsys *winsys;
-
-   /* Configurable environment settings */
-   bool msaa_force_enable;
-   uint8_t msaa_max_count;
-   uint32_t client_copy_limit;
-
-   HANDLE hJitMgr;
-
-   /* Dynamic backend implementations */
-   util_dl_library *pLibrary;
-   PFNSwrGetInterface pfnSwrGetInterface;
-   PFNSwrGetTileInterface pfnSwrGetTileInterface;
-
-   /* Do we run on Xeon Phi? */
-   bool is_knl;
-};
-
-static INLINE struct swr_screen *
-swr_screen(struct pipe_screen *pipe)
-{
-   return (struct swr_screen *)pipe;
-}
-
-SWR_FORMAT
-mesa_to_swr_format(enum pipe_format format);
-
-INLINE void swr_print_info(const char *format, ...)
-{
-   static bool print_info = debug_get_bool_option("SWR_PRINT_INFO", false);
-   if(print_info) {
-      va_list args;
-      va_start(args, format);
-      vfprintf(stderr, format, args);
-      va_end(args);
-   }
-}
-
-#endif
diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp
deleted file mode 100644
index 315036920fb..00000000000
--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ /dev/null
@@ -1,3040 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#include <llvm/Config/llvm-config.h>
-
-#if LLVM_VERSION_MAJOR < 7
-// llvm redefines DEBUG
-#pragma push_macro("DEBUG")
-#undef DEBUG
-#endif
-
-#include "JitManager.h"
-#include "llvm-c/Core.h"
-#include "llvm/Support/CBindingWrapping.h"
-#include "llvm/IR/LegacyPassManager.h"
-
-#if LLVM_VERSION_MAJOR < 7
-#pragma pop_macro("DEBUG")
-#endif
-
-#include "state.h"
-#include "gen_state_llvm.h"
-#include "builder.h"
-#include "functionpasses/passes.h"
-
-#include "tgsi/tgsi_strings.h"
-#include "util/format/u_format.h"
-#include "util/u_prim.h"
-#include "gallivm/lp_bld_init.h"
-#include "gallivm/lp_bld_flow.h"
-#include "gallivm/lp_bld_struct.h"
-#include "gallivm/lp_bld_tgsi.h"
-#include "gallivm/lp_bld_const.h"
-#include "gallivm/lp_bld_printf.h"
-#include "gallivm/lp_bld_logic.h"
-
-#include "swr_context.h"
-#include "gen_surf_state_llvm.h"
-#include "gen_swr_context_llvm.h"
-#include "swr_resource.h"
-#include "swr_state.h"
-#include "swr_screen.h"
-
-
-/////////////////////////////////////////////////////////////////////////
-
-#include <stdio.h>
-#include <inttypes.h>
-
-#include "util/u_debug.h"
-#include "util/u_memory.h"
-#include "util/u_string.h"
-
-#include "gallivm/lp_bld_type.h"
-
-#if defined(DEBUG) && defined(SWR_VERBOSE_SHADER)
-constexpr bool verbose_shader          = true;
-constexpr bool verbose_tcs_shader_in   = true;
-constexpr bool verbose_tcs_shader_out  = true;
-constexpr bool verbose_tcs_shader_loop = true;
-constexpr bool verbose_vs_shader       = true;
-#else
-constexpr bool verbose_shader          = false;
-constexpr bool verbose_tcs_shader_in   = false;
-constexpr bool verbose_tcs_shader_out  = false;
-constexpr bool verbose_tcs_shader_loop = false;
-constexpr bool verbose_vs_shader       = false;
-#endif
-
-using namespace SwrJit;
-
-static unsigned
-locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info);
-
-bool operator==(const swr_jit_fs_key &lhs, const swr_jit_fs_key &rhs)
-{
-   return !memcmp(&lhs, &rhs, sizeof(lhs));
-}
-
-bool operator==(const swr_jit_vs_key &lhs, const swr_jit_vs_key &rhs)
-{
-   return !memcmp(&lhs, &rhs, sizeof(lhs));
-}
-
-bool operator==(const swr_jit_fetch_key &lhs, const swr_jit_fetch_key &rhs)
-{
-   return !memcmp(&lhs, &rhs, sizeof(lhs));
-}
-
-bool operator==(const swr_jit_gs_key &lhs, const swr_jit_gs_key &rhs)
-{
-   return !memcmp(&lhs, &rhs, sizeof(lhs));
-}
-
-bool operator==(const swr_jit_tcs_key &lhs, const swr_jit_tcs_key &rhs)
-{
-   return !memcmp(&lhs, &rhs, sizeof(lhs));
-}
-
-bool operator==(const swr_jit_tes_key &lhs, const swr_jit_tes_key &rhs)
-{
-   return !memcmp(&lhs, &rhs, sizeof(lhs));
-}
-
-
-static void
-swr_generate_sampler_key(const struct lp_tgsi_info &info,
-                         struct swr_context *ctx,
-                         enum pipe_shader_type shader_type,
-                         struct swr_jit_sampler_key &key)
-{
-   key.nr_samplers = info.base.file_max[TGSI_FILE_SAMPLER] + 1;
-
-   for (unsigned i = 0; i < key.nr_samplers; i++) {
-      if (info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
-         lp_sampler_static_sampler_state(
-            &key.sampler[i].sampler_state,
-            ctx->samplers[shader_type][i]);
-      }
-   }
-
-   /*
-    * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes
-    * are dx10-style? Can't really have mixed opcodes, at least not
-    * if we want to skip the holes here (without rescanning tgsi).
-    */
-   if (info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) {
-      key.nr_sampler_views =
-         info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
-      for (unsigned i = 0; i < key.nr_sampler_views; i++) {
-         if (info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1u << (i & 31))) {
-            const struct pipe_sampler_view *view =
-               ctx->sampler_views[shader_type][i];
-            lp_sampler_static_texture_state(
-               &key.sampler[i].texture_state, view);
-            if (view) {
-               struct swr_resource *swr_res = swr_resource(view->texture);
-               const struct util_format_description *desc =
-                  util_format_description(view->format);
-               if (swr_res->has_depth && swr_res->has_stencil &&
-                   !util_format_has_depth(desc))
-                  key.sampler[i].texture_state.format = PIPE_FORMAT_S8_UINT;
-            }
-         }
-      }
-   } else {
-      key.nr_sampler_views = key.nr_samplers;
-      for (unsigned i = 0; i < key.nr_sampler_views; i++) {
-         if (info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
-            const struct pipe_sampler_view *view =
-               ctx->sampler_views[shader_type][i];
-            lp_sampler_static_texture_state(
-               &key.sampler[i].texture_state, view);
-            if (view) {
-               struct swr_resource *swr_res = swr_resource(view->texture);
-               const struct util_format_description *desc =
-                  util_format_description(view->format);
-               if (swr_res->has_depth && swr_res->has_stencil &&
-                   !util_format_has_depth(desc))
-                  key.sampler[i].texture_state.format = PIPE_FORMAT_S8_UINT;
-            }
-         }
-      }
-   }
-}
-
-void
-swr_generate_fs_key(struct swr_jit_fs_key &key,
-                    struct swr_context *ctx,
-                    swr_fragment_shader *swr_fs)
-{
-   memset((void*)&key, 0, sizeof(key));
-
-   key.nr_cbufs = ctx->framebuffer.nr_cbufs;
-   key.light_twoside = ctx->rasterizer->light_twoside;
-   key.sprite_coord_enable = ctx->rasterizer->sprite_coord_enable;
-
-   struct tgsi_shader_info *pPrevShader;
-   if (ctx->gs)
-      pPrevShader = &ctx->gs->info.base;
-   else if (ctx->tes)
-      pPrevShader = &ctx->tes->info.base;
-   else
-      pPrevShader = &ctx->vs->info.base;
-
-   memcpy(&key.vs_output_semantic_name,
-          &pPrevShader->output_semantic_name,
-          sizeof(key.vs_output_semantic_name));
-   memcpy(&key.vs_output_semantic_idx,
-          &pPrevShader->output_semantic_index,
-          sizeof(key.vs_output_semantic_idx));
-
-   swr_generate_sampler_key(swr_fs->info, ctx, PIPE_SHADER_FRAGMENT, key);
-
-   key.poly_stipple_enable = ctx->rasterizer->poly_stipple_enable &&
-      ctx->poly_stipple.prim_is_poly;
-}
-
-void
-swr_generate_vs_key(struct swr_jit_vs_key &key,
-                    struct swr_context *ctx,
-                    swr_vertex_shader *swr_vs)
-{
-   memset((void*)&key, 0, sizeof(key));
-
-   key.clip_plane_mask =
-      swr_vs->info.base.clipdist_writemask ?
-      swr_vs->info.base.clipdist_writemask & ctx->rasterizer->clip_plane_enable :
-      ctx->rasterizer->clip_plane_enable;
-
-   swr_generate_sampler_key(swr_vs->info, ctx, PIPE_SHADER_VERTEX, key);
-}
-
-void
-swr_generate_fetch_key(struct swr_jit_fetch_key &key,
-                       struct swr_vertex_element_state *velems)
-{
-   memset((void*)&key, 0, sizeof(key));
-
-   key.fsState = velems->fsState;
-}
-
-void
-swr_generate_gs_key(struct swr_jit_gs_key &key,
-                    struct swr_context *ctx,
-                    swr_geometry_shader *swr_gs)
-{
-   memset((void*)&key, 0, sizeof(key));
-
-   struct tgsi_shader_info *pPrevShader = nullptr;
-
-   if (ctx->tes) {
-      pPrevShader = &ctx->tes->info.base;
-   } else {
-      pPrevShader = &ctx->vs->info.base;
-   }
-
-   memcpy(&key.vs_output_semantic_name,
-          &pPrevShader->output_semantic_name,
-          sizeof(key.vs_output_semantic_name));
-   memcpy(&key.vs_output_semantic_idx,
-          &pPrevShader->output_semantic_index,
-          sizeof(key.vs_output_semantic_idx));
-
-   swr_generate_sampler_key(swr_gs->info, ctx, PIPE_SHADER_GEOMETRY, key);
-}
-
-void
-swr_generate_tcs_key(struct swr_jit_tcs_key &key,
-                    struct swr_context *ctx,
-                    swr_tess_control_shader *swr_tcs)
-{
-   memset((void*)&key, 0, sizeof(key));
-
-   struct tgsi_shader_info *pPrevShader = &ctx->vs->info.base;
-
-   memcpy(&key.vs_output_semantic_name,
-          &pPrevShader->output_semantic_name,
-          sizeof(key.vs_output_semantic_name));
-   memcpy(&key.vs_output_semantic_idx,
-          &pPrevShader->output_semantic_index,
-          sizeof(key.vs_output_semantic_idx));
-
-   key.clip_plane_mask =
-      swr_tcs->info.base.clipdist_writemask ?
-      swr_tcs->info.base.clipdist_writemask & ctx->rasterizer->clip_plane_enable :
-      ctx->rasterizer->clip_plane_enable;
-
-   swr_generate_sampler_key(swr_tcs->info, ctx, PIPE_SHADER_TESS_CTRL, key);
-}
-
-void
-swr_generate_tes_key(struct swr_jit_tes_key &key,
-                    struct swr_context *ctx,
-                    swr_tess_evaluation_shader *swr_tes)
-{
-   memset((void*)&key, 0, sizeof(key));
-
-   struct tgsi_shader_info *pPrevShader = nullptr;
-
-   if (ctx->tcs) {
-      pPrevShader = &ctx->tcs->info.base;
-   }
-   else {
-      pPrevShader = &ctx->vs->info.base;
-   }
-
-   SWR_ASSERT(pPrevShader != nullptr, "TES: No TCS or VS defined");
-
-   memcpy(&key.prev_output_semantic_name,
-         &pPrevShader->output_semantic_name,
-         sizeof(key.prev_output_semantic_name));
-   memcpy(&key.prev_output_semantic_idx,
-         &pPrevShader->output_semantic_index,
-         sizeof(key.prev_output_semantic_idx));
-
-   key.clip_plane_mask =
-      swr_tes->info.base.clipdist_writemask ?
-      swr_tes->info.base.clipdist_writemask & ctx->rasterizer->clip_plane_enable :
-      ctx->rasterizer->clip_plane_enable;
-
-   swr_generate_sampler_key(swr_tes->info, ctx, PIPE_SHADER_TESS_EVAL, key);
-}
-
-struct BuilderSWR : public Builder {
-   BuilderSWR(JitManager *pJitMgr, const char *pName)
-      : Builder(pJitMgr)
-   {
-      pJitMgr->SetupNewModule();
-      gallivm = gallivm_create(pName, wrap(&JM()->mContext), NULL);
-      pJitMgr->mpCurrentModule = unwrap(gallivm->module);
-   }
-
-   ~BuilderSWR() {
-      gallivm_free_ir(gallivm);
-   }
-
-   void WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput,
-                unsigned slot, unsigned channel);
-
-   struct gallivm_state *gallivm;
-   PFN_VERTEX_FUNC CompileVS(struct swr_context *ctx, swr_jit_vs_key &key);
-   PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_fs_key &key);
-   PFN_GS_FUNC CompileGS(struct swr_context *ctx, swr_jit_gs_key &key);
-   PFN_TCS_FUNC CompileTCS(struct swr_context *ctx, swr_jit_tcs_key &key);
-   PFN_TES_FUNC CompileTES(struct swr_context *ctx, swr_jit_tes_key &key);
-
-   // GS-specific emit functions
-   LLVMValueRef
-   swr_gs_llvm_fetch_input(const struct lp_build_gs_iface *gs_iface,
-                           struct lp_build_context * bld,
-                           boolean is_vindex_indirect,
-                           LLVMValueRef vertex_index,
-                           boolean is_aindex_indirect,
-                           LLVMValueRef attrib_index,
-                           LLVMValueRef swizzle_index);
-   void
-   swr_gs_llvm_emit_vertex(const struct lp_build_gs_iface *gs_base,
-                           struct lp_build_context * bld,
-                           LLVMValueRef (*outputs)[4],
-                           LLVMValueRef emitted_vertices_vec,
-                           LLVMValueRef stream_id);
-
-   void
-   swr_gs_llvm_end_primitive(const struct lp_build_gs_iface *gs_base,
-                             struct lp_build_context * bld,
-                             LLVMValueRef total_emitted_vertices_vec_ptr,
-                             LLVMValueRef verts_per_prim_vec,
-                             LLVMValueRef emitted_prims_vec,
-                             LLVMValueRef mask_vec);
-
-   void
-   swr_gs_llvm_epilogue(const struct lp_build_gs_iface *gs_base,
-                        LLVMValueRef total_emitted_vertices_vec,
-                        LLVMValueRef emitted_prims_vec, unsigned stream);
-
-   // TCS-specific emit functions
-   void swr_tcs_llvm_emit_prologue(struct lp_build_tgsi_soa_context* bld);
-   void swr_tcs_llvm_emit_epilogue(struct lp_build_tgsi_soa_context* bld);
-
-   LLVMValueRef
-   swr_tcs_llvm_fetch_input(const struct lp_build_tcs_iface *tcs_iface,
-                            struct lp_build_tgsi_context * bld_base,
-                            boolean is_vindex_indirect,
-                            LLVMValueRef vertex_index,
-                            boolean is_aindex_indirect,
-                            LLVMValueRef attrib_index,
-                            LLVMValueRef swizzle_index);
-
-   LLVMValueRef
-   swr_tcs_llvm_fetch_output(const struct lp_build_tcs_iface *tcs_iface,
-                             struct lp_build_tgsi_context * bld_base,
-                             boolean is_vindex_indirect,
-                             LLVMValueRef vertex_index,
-                             boolean is_aindex_indirect,
-                             LLVMValueRef attrib_index,
-                             LLVMValueRef swizzle_index,
-                             uint32_t name);
-
-   void
-   swr_tcs_llvm_store_output(const struct lp_build_tcs_iface *tcs_iface,
-                            struct lp_build_tgsi_context * bld_base,
-                            unsigned name,
-                            boolean is_vindex_indirect,
-                            LLVMValueRef vertex_index,
-                            boolean is_aindex_indirect,
-                            LLVMValueRef attrib_index,
-                            LLVMValueRef swizzle_index,
-                            LLVMValueRef value,
-                            LLVMValueRef mask_vec);
-
-   // Barrier implementation (available only in TCS)
-   void
-   swr_tcs_llvm_emit_barrier(const struct lp_build_tcs_iface *tcs_iface,
-                             struct lp_build_tgsi_context *bld_base);
-
-   // TES-specific emit functions
-   LLVMValueRef
-   swr_tes_llvm_fetch_vtx_input(const struct lp_build_tes_iface *tes_iface,
-                            struct lp_build_tgsi_context * bld_base,
-                            boolean is_vindex_indirect,
-                            LLVMValueRef vertex_index,
-                            boolean is_aindex_indirect,
-                            LLVMValueRef attrib_index,
-                            LLVMValueRef swizzle_index);
-
-   LLVMValueRef
-   swr_tes_llvm_fetch_patch_input(const struct lp_build_tes_iface *tes_iface,
-                            struct lp_build_tgsi_context * bld_base,
-                            boolean is_aindex_indirect,
-                            LLVMValueRef attrib_index,
-                            LLVMValueRef swizzle_index);
-};
-
-// State shared between the GS callbacks in this file. `base` is the first
-// member; the callbacks receive a pointer to it and cast that pointer back
-// to swr_gs_llvm_iface.
-struct swr_gs_llvm_iface {
-   struct lp_build_gs_iface base;
-   struct tgsi_shader_info *info; // output_semantic_name[] and writes_position are read on vertex emit
-
-   BuilderSWR *pBuilder; // builder the static trampolines forward to
-
-   Value *pGsCtx; // GS context value; SWR_GS_CONTEXT_* members are loaded from it
-   SWR_GS_STATE *pGsState; // outputTopology selects the stream-id vs. end-primitive path
-   uint32_t num_outputs; // number of output attributes written per emitted vertex
-   uint32_t num_verts_per_prim;
-
-   Value *pVtxAttribMap; // indexed with the shader's attribute index to obtain the input slot
-};
-
-// State shared between the TCS callbacks in this file. `base` is the first
-// member; the callbacks receive a pointer to it and cast that pointer back
-// to swr_tcs_llvm_iface.
-struct swr_tcs_llvm_iface {
-   struct lp_build_tcs_iface base;
-   struct tgsi_shader_info *info;
-
-   BuilderSWR *pBuilder; // builder the static trampolines forward to
-
-   Value *pTcsCtx; // context value; SWR_HS_CONTEXT_* members are addressed through it
-   SWR_TS_STATE *pTsState;
-
-   uint32_t output_vertices; // bound the loop counter is compared against in the TCS epilogue
-
-   LLVMValueRef loop_var; // alloca created in the TCS prologue; holds the per-lane loop counter
-
-   Value *pVtxAttribMap; // attribute map used when fetching per-vertex inputs
-   Value *pVtxOutputAttribMap; // attribute map used when reading back per-vertex outputs
-   Value *pPatchOutputAttribMap; // attribute map used for TGSI_SEMANTIC_PATCH outputs
-};
-
-// State shared between the TES callbacks in this file. `base` is the first
-// member; the callbacks receive a pointer to it and cast that pointer back
-// to swr_tes_llvm_iface.
-struct swr_tes_llvm_iface {
-   struct lp_build_tes_iface base;
-   struct tgsi_shader_info *info;
-
-   BuilderSWR *pBuilder; // builder the static trampolines forward to
-
-   Value *pTesCtx; // NOTE(review): presumably the TES/DS context value, by analogy with pGsCtx/pTcsCtx - confirm
-   SWR_TS_STATE *pTsState;
-
-   uint32_t num_outputs;
-
-   Value *pVtxAttribMap; // NOTE(review): presumably maps per-vertex input attributes - confirm against the TES fetch code
-   Value *pPatchAttribMap; // NOTE(review): presumably maps per-patch input attributes - confirm against the TES fetch code
-};
-
-// trampoline functions so we can use the builder llvm construction methods
-// Trampoline: recovers the BuilderSWR stored in the GS interface struct and
-// forwards the input fetch to its member function of the same name.
-static LLVMValueRef
-swr_gs_llvm_fetch_input(const struct lp_build_gs_iface *gs_iface,
-                        struct lp_build_context *bld,
-                        boolean is_vindex_indirect,
-                        LLVMValueRef vertex_index,
-                        boolean is_aindex_indirect,
-                        LLVMValueRef attrib_index,
-                        LLVMValueRef swizzle_index)
-{
-   BuilderSWR *builder = ((swr_gs_llvm_iface *)gs_iface)->pBuilder;
-
-   return builder->swr_gs_llvm_fetch_input(gs_iface, bld,
-                                           is_vindex_indirect, vertex_index,
-                                           is_aindex_indirect, attrib_index,
-                                           swizzle_index);
-}
-
-// Trampoline for GS vertex emission. mask_vec is accepted to match the
-// callback signature but is not forwarded: the builder method takes no mask
-// argument.
-static void
-swr_gs_llvm_emit_vertex(const struct lp_build_gs_iface *gs_base,
-                        struct lp_build_context *bld,
-                        LLVMValueRef (*outputs)[4],
-                        LLVMValueRef emitted_vertices_vec,
-                        LLVMValueRef mask_vec,
-                        LLVMValueRef stream_id)
-{
-   BuilderSWR *builder = ((swr_gs_llvm_iface *)gs_base)->pBuilder;
-
-   builder->swr_gs_llvm_emit_vertex(gs_base, bld, outputs,
-                                    emitted_vertices_vec, stream_id);
-}
-
-// Trampoline for GS end-of-primitive. stream_id is accepted to match the
-// callback signature but is not forwarded: the builder method takes no
-// stream argument.
-static void
-swr_gs_llvm_end_primitive(const struct lp_build_gs_iface *gs_base,
-                          struct lp_build_context *bld,
-                          LLVMValueRef total_emitted_vertices_vec_ptr,
-                          LLVMValueRef verts_per_prim_vec,
-                          LLVMValueRef emitted_prims_vec,
-                          LLVMValueRef mask_vec, unsigned stream_id)
-{
-   BuilderSWR *builder = ((swr_gs_llvm_iface *)gs_base)->pBuilder;
-
-   builder->swr_gs_llvm_end_primitive(gs_base, bld,
-                                      total_emitted_vertices_vec_ptr,
-                                      verts_per_prim_vec, emitted_prims_vec,
-                                      mask_vec);
-}
-
-// Trampoline: forwards the GS epilogue to the BuilderSWR stored in the
-// interface struct, passing every argument through unchanged.
-static void
-swr_gs_llvm_epilogue(const struct lp_build_gs_iface *gs_base,
-                     LLVMValueRef total_emitted_vertices_vec,
-                     LLVMValueRef emitted_prims_vec, unsigned stream)
-{
-   BuilderSWR *builder = ((swr_gs_llvm_iface *)gs_base)->pBuilder;
-
-   builder->swr_gs_llvm_epilogue(gs_base, total_emitted_vertices_vec,
-                                 emitted_prims_vec, stream);
-}
-
-// Trampoline for TCS input fetches. The build context is reinterpreted as
-// the TGSI context the builder method expects; is_sindex_indirect is not
-// forwarded because the builder method has no such parameter.
-static LLVMValueRef
-swr_tcs_llvm_fetch_input(const struct lp_build_tcs_iface *tcs_iface,
-                         struct lp_build_context *bld,
-                         boolean is_vindex_indirect,
-                         LLVMValueRef vertex_index,
-                         boolean is_aindex_indirect,
-                         LLVMValueRef attrib_index,
-                         boolean is_sindex_indirect,
-                         LLVMValueRef swizzle_index)
-{
-   BuilderSWR *builder = ((swr_tcs_llvm_iface *)tcs_iface)->pBuilder;
-
-   return builder->swr_tcs_llvm_fetch_input(tcs_iface,
-                                            (struct lp_build_tgsi_context *)bld,
-                                            is_vindex_indirect, vertex_index,
-                                            is_aindex_indirect, attrib_index,
-                                            swizzle_index);
-}
-
-// Trampoline for reading back TCS outputs. The build context is
-// reinterpreted as the TGSI context the builder method expects;
-// is_sindex_indirect is not forwarded because the builder method has no such
-// parameter.
-static LLVMValueRef
-swr_tcs_llvm_fetch_output(const struct lp_build_tcs_iface *tcs_iface,
-                          struct lp_build_context *bld,
-                          boolean is_vindex_indirect,
-                          LLVMValueRef vertex_index,
-                          boolean is_aindex_indirect,
-                          LLVMValueRef attrib_index,
-                          boolean is_sindex_indirect,
-                          LLVMValueRef swizzle_index,
-                          uint32_t name)
-{
-   BuilderSWR *builder = ((swr_tcs_llvm_iface *)tcs_iface)->pBuilder;
-
-   return builder->swr_tcs_llvm_fetch_output(tcs_iface,
-                                             (struct lp_build_tgsi_context *)bld,
-                                             is_vindex_indirect, vertex_index,
-                                             is_aindex_indirect, attrib_index,
-                                             swizzle_index, name);
-}
-
-
-// Trampoline: reinterprets the build context as the TGSI SoA context, finds
-// the TCS interface attached to it and forwards to the builder's prologue.
-static void
-swr_tcs_llvm_emit_prologue(struct lp_build_context* bld)
-{
-   lp_build_tgsi_soa_context *soa = (lp_build_tgsi_soa_context *)bld;
-   BuilderSWR *builder = ((swr_tcs_llvm_iface *)soa->tcs_iface)->pBuilder;
-
-   builder->swr_tcs_llvm_emit_prologue(soa);
-}
-
-// Trampoline: reinterprets the build context as the TGSI SoA context, finds
-// the TCS interface attached to it and forwards to the builder's epilogue.
-static void
-swr_tcs_llvm_emit_epilogue(struct lp_build_context* bld)
-{
-   lp_build_tgsi_soa_context *soa = (lp_build_tgsi_soa_context *)bld;
-   BuilderSWR *builder = ((swr_tcs_llvm_iface *)soa->tcs_iface)->pBuilder;
-
-   builder->swr_tcs_llvm_emit_epilogue(soa);
-}
-
-// Trampoline for TCS output stores. The build context is reinterpreted as
-// the TGSI context the builder method expects; is_sindex_indirect is not
-// forwarded because the builder method has no such parameter.
-static void
-swr_tcs_llvm_store_output(const struct lp_build_tcs_iface *tcs_iface,
-                          struct lp_build_context *bld,
-                          unsigned name,
-                          boolean is_vindex_indirect,
-                          LLVMValueRef vertex_index,
-                          boolean is_aindex_indirect,
-                          LLVMValueRef attrib_index,
-                          boolean is_sindex_indirect,
-                          LLVMValueRef swizzle_index,
-                          LLVMValueRef value,
-                          LLVMValueRef mask_vec)
-{
-   BuilderSWR *builder = ((swr_tcs_llvm_iface *)tcs_iface)->pBuilder;
-
-   builder->swr_tcs_llvm_store_output(tcs_iface,
-                                      (struct lp_build_tgsi_context *)bld,
-                                      name,
-                                      is_vindex_indirect, vertex_index,
-                                      is_aindex_indirect, attrib_index,
-                                      swizzle_index,
-                                      value, mask_vec);
-}
-
-
-// Trampoline: reinterprets the build context as the TGSI SoA context and
-// forwards the barrier request, handing the builder the TCS interface and
-// the embedded TGSI base context.
-static void
-swr_tcs_llvm_emit_barrier(struct lp_build_context *bld)
-{
-   lp_build_tgsi_soa_context *soa = (lp_build_tgsi_soa_context *)bld;
-   BuilderSWR *builder = ((swr_tcs_llvm_iface *)soa->tcs_iface)->pBuilder;
-
-   builder->swr_tcs_llvm_emit_barrier(soa->tcs_iface, &soa->bld_base);
-}
-
-
-// Trampoline for TES per-vertex input fetches. The build context is
-// reinterpreted as the TGSI context the builder method expects;
-// is_sindex_indirect is not forwarded because the builder method has no such
-// parameter.
-static LLVMValueRef
-swr_tes_llvm_fetch_vtx_input(const struct lp_build_tes_iface *tes_iface,
-                             struct lp_build_context *bld,
-                             boolean is_vindex_indirect,
-                             LLVMValueRef vertex_index,
-                             boolean is_aindex_indirect,
-                             LLVMValueRef attrib_index,
-                             boolean is_sindex_indirect,
-                             LLVMValueRef swizzle_index)
-{
-   BuilderSWR *builder = ((swr_tes_llvm_iface *)tes_iface)->pBuilder;
-
-   return builder->swr_tes_llvm_fetch_vtx_input(tes_iface,
-                                                (struct lp_build_tgsi_context *)bld,
-                                                is_vindex_indirect, vertex_index,
-                                                is_aindex_indirect, attrib_index,
-                                                swizzle_index);
-}
-
-// Trampoline for TES per-patch input fetches. The build context is
-// reinterpreted as the TGSI context the builder method expects.
-static LLVMValueRef
-swr_tes_llvm_fetch_patch_input(const struct lp_build_tes_iface *tes_iface,
-                               struct lp_build_context *bld,
-                               boolean is_aindex_indirect,
-                               LLVMValueRef attrib_index,
-                               LLVMValueRef swizzle_index)
-{
-   BuilderSWR *builder = ((swr_tes_llvm_iface *)tes_iface)->pBuilder;
-
-   return builder->swr_tes_llvm_fetch_patch_input(tes_iface,
-                                                  (struct lp_build_tgsi_context *)bld,
-                                                  is_aindex_indirect,
-                                                  attrib_index,
-                                                  swizzle_index);
-}
-
-// Emits IR that fetches one channel (swizzle_index) of a GS input attribute.
-//
-// With direct indices a single load sequence is emitted and its result is
-// returned as-is. If either index is indirect, the sequence is emitted once
-// per element of bld->type: the indirect index is extracted for that element,
-// the load is performed, and the matching element of the loaded value is
-// inserted into the result, which starts out as bld->zero.
-LLVMValueRef
-BuilderSWR::swr_gs_llvm_fetch_input(const struct lp_build_gs_iface *gs_iface,
-                                    struct lp_build_context *bld,
-                                    boolean is_vindex_indirect,
-                                    LLVMValueRef vertex_index,
-                                    boolean is_aindex_indirect,
-                                    LLVMValueRef attrib_index,
-                                    LLVMValueRef swizzle_index)
-{
-   swr_gs_llvm_iface *gsIface = (swr_gs_llvm_iface *)gs_iface;
-   Value *vtxIdx = unwrap(vertex_index);
-   Value *attrIdx = unwrap(attrib_index);
-
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-   // Load sequence shared by both paths: translate the attribute index
-   // through pVtxAttribMap, then address the input vertices at
-   // [vtx * inputVertStride + slot][swizzle_index].
-   auto loadInput = [&](Value *vtx, Value *attr) -> Value * {
-      Value *slot = LOAD(GEP(gsIface->pVtxAttribMap, {C(0), attr}));
-
-      Value *verts = LOAD(gsIface->pGsCtx, {0, SWR_GS_CONTEXT_pVerts});
-      Value *stride = LOAD(gsIface->pGsCtx, {0, SWR_GS_CONTEXT_inputVertStride});
-
-      Value *row = ADD(MUL(vtx, stride), slot);
-      return LOAD(GEP(verts, {row, unwrap(swizzle_index)}));
-   };
-
-   if (!is_vindex_indirect && !is_aindex_indirect) {
-      return wrap(loadInput(vtxIdx, attrIdx));
-   }
-
-   Value *result = unwrap(bld->zero);
-   struct lp_type type = bld->type;
-
-   for (int elem = 0; elem < type.length; elem++) {
-      Value *elemVtx = vtxIdx;
-      if (is_vindex_indirect) {
-         elemVtx = VEXTRACT(vtxIdx, C(elem));
-      }
-
-      Value *elemAttr = attrIdx;
-      if (is_aindex_indirect) {
-         elemAttr = VEXTRACT(attrIdx, C(elem));
-      }
-
-      Value *fetched = loadInput(elemVtx, elemAttr);
-      result = VINSERT(result, VEXTRACT(fetched, C(elem)), C(elem));
-   }
-
-   return wrap(result);
-}
-
-// GS output stream layout
-#define VERTEX_COUNT_SIZE 32
-#define CONTROL_HEADER_SIZE (8*32)
-
-// Emits IR that writes one emitted GS vertex into the per-lane output
-// streams and, for point-list output only, records the vertex's stream id in
-// the stream's control header.
-//
-// outputs              - per-attribute array of four channel pointers; each
-//                        is loaded to obtain the value to store
-// emitted_vertices_vec - per-lane count of vertices emitted so far; used to
-//                        compute where this vertex goes
-// stream_id            - stream id value; only read on the point-list path
-void
-BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_gs_iface *gs_base,
-                           struct lp_build_context * bld,
-                           LLVMValueRef (*outputs)[4],
-                           LLVMValueRef emitted_vertices_vec,
-                           LLVMValueRef stream_id)
-{
-    swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base;
-
-    IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-    // Stream layout used below: vertex-count header, control header, then
-    // fixed-size vertices of SWR_VTX_NUM_SLOTS four-float attributes each.
-    const uint32_t headerSize = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE;
-    const uint32_t attribSize = 4 * sizeof(float);
-    const uint32_t vertSize = attribSize * SWR_VTX_NUM_SLOTS;
-    Value *pVertexOffset = MUL(unwrap(emitted_vertices_vec), VIMMED1(vertSize));
-
-    // Lane mask comes from the GS context, narrowed to one bit per lane.
-    Value *vMask = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_mask});
-    Value *vMask1 = TRUNC(vMask, getVectorType(mInt1Ty, mVWidth));
-
-    // The scratch alloca is released by STACKRESTORE on both exit paths.
-    Value *pStack = STACKSAVE();
-    Value *pTmpPtr = ALLOCA(mFP32Ty, C(4)); // used for dummy write for lane masking
-
-    for (uint32_t attrib = 0; attrib < iface->num_outputs; ++attrib) {
-       // Pick the destination slot from the output's semantic. Point size,
-       // layer and viewport index share the SGV slot and each occupies one
-       // component of it (sgvChannel).
-       uint32_t attribSlot = attrib;
-       uint32_t sgvChannel = 0;
-       if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) {
-          attribSlot = VERTEX_SGV_SLOT;
-          sgvChannel = VERTEX_SGV_POINT_SIZE_COMP;
-       } else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_LAYER) {
-          attribSlot = VERTEX_SGV_SLOT;
-          sgvChannel = VERTEX_SGV_RTAI_COMP;
-       } else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_VIEWPORT_INDEX) {
-          attribSlot = VERTEX_SGV_SLOT;
-          sgvChannel = VERTEX_SGV_VAI_COMP;
-       } else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_POSITION) {
-          attribSlot = VERTEX_POSITION_SLOT;
-       } else {
-          attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
-          // One slot lower when the shader writes position.
-          // NOTE(review): presumably because position takes an output index
-          // but lives in its own slot - confirm.
-          if (iface->info->writes_position) {
-             attribSlot--;
-          }
-       }
-
-       Value *pOutputOffset = ADD(pVertexOffset, VIMMED1(headerSize + attribSize * attribSlot)); // + sgvChannel ?
-
-       for (uint32_t lane = 0; lane < mVWidth; ++lane) {
-          Value *pLaneOffset = VEXTRACT(pOutputOffset, C(lane));
-          Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
-          Value *pStreamOffset = GEP(pStream, pLaneOffset);
-          pStreamOffset = BITCAST(pStreamOffset, mFP32PtrTy);
-
-          // Masked-off lanes write to the scratch buffer instead of the stream.
-          Value *pLaneMask = VEXTRACT(vMask1, C(lane));
-          pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr);
-
-          for (uint32_t channel = 0; channel < 4; ++channel) {
-             Value *vData;
-
-             // SGV attributes always take their value from output channel 0.
-             if (attribSlot == VERTEX_SGV_SLOT)
-                vData = LOAD(unwrap(outputs[attrib][0]));
-             else
-                vData = LOAD(unwrap(outputs[attrib][channel]));
-
-             // Regular attributes store all four channels. An SGV attribute
-             // stores only when the walk reaches its component; the pointer
-             // still advances every iteration, so the value lands at
-             // component sgvChannel of the SGV slot.
-             if (attribSlot != VERTEX_SGV_SLOT ||
-                 sgvChannel == channel) {
-                vData = VEXTRACT(vData, C(lane));
-                STORE(vData, pStreamOffset);
-             }
-             pStreamOffset = GEP(pStreamOffset, C(1));
-          }
-       }
-    }
-
-    /* When the output type is not points, the geometry shader may not
-     * output data to multiple streams. So early exit here.
-     */
-    if(iface->pGsState->outputTopology != TOP_POINT_LIST) {
-        STACKRESTORE(pStack);
-        return;
-    }
-
-    // The stream id of each vertex is encoded in 2 bits, so one byte ("box")
-    // holds the ids of 4 vertices:
-    // ----------------- ----------------- ----
-    // |d|d|c|c|b|b|a|a| |h|h|g|g|f|f|e|e| |...
-    // ----------------- ----------------- ----
-
-    // Bit position for the current vertex's stream id. This is 2 * vertex
-    // index and is not reduced modulo 8 before being truncated to 8 bits
-    // below.
-    // NOTE(review): confirm the shift stays inside the byte for vertex
-    // indices >= 4.
-    Value *pShiftControl = MUL(unwrap(emitted_vertices_vec), VIMMED1(2));
-
-    // Index of the box that holds the current vertex (vertex index / 4).
-    Value *pOffsetControl = LSHR(unwrap(emitted_vertices_vec), VIMMED1(2));
-
-    // Skip the vertex-count header
-    Value *pStreamIdOffset = ADD(pOffsetControl, VIMMED1(VERTEX_COUNT_SIZE));
-
-    // NOTE(review): this read-modify-write is not redirected to the scratch
-    // buffer for masked-off lanes, and element 0 of stream_id is used for
-    // every lane - confirm both are intended.
-    for (uint32_t lane = 0; lane < mVWidth; ++lane) {
-       Value *pShift = TRUNC(VEXTRACT(pShiftControl, C(lane)), mInt8Ty);
-       Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
-
-       Value *pStreamOffset = GEP(pStream, VEXTRACT(pStreamIdOffset, C(lane)));
-
-       // Clamp the stream id to its two valid bits (0..3)
-       Value *vVal = TRUNC(AND(VEXTRACT(unwrap(stream_id), C(0)), C(0x3)), mInt8Ty);
-
-       // Shift it to its position inside the box
-       vVal = SHL(vVal, pShift);
-
-       // Ids of other vertices may already be stored in this box, so read
-       // the byte and OR the new bits into it.
-       Value *storedValue = LOAD(pStreamOffset);
-       vVal = OR(storedValue, vVal);
-       STORE(vVal, pStreamOffset);
-    }
-
-    STACKRESTORE(pStack);
-}
-
-// Emits IR that marks the end of the current GS primitive by setting one bit
-// in the control header of each active lane's output stream. The bit index
-// is (total emitted vertices - 1): byte (index / 8) after the vertex-count
-// header, bit (index % 8) within that byte.
-//
-// total_emitted_vertices_vec - per-lane number of vertices emitted so far
-// verts_per_prim_vec         - per-lane vertex count of the current
-//                              primitive; lanes where it is zero are skipped
-// emitted_prims_vec          - unused
-// mask_vec                   - per-lane execution mask
-void
-BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_gs_iface *gs_base,
-                             struct lp_build_context * bld,
-                             LLVMValueRef total_emitted_vertices_vec,
-                             LLVMValueRef verts_per_prim_vec,
-                             LLVMValueRef emitted_prims_vec,
-                             LLVMValueRef mask_vec)
-{
-    swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base;
-
-    /* When the output type is points, the geometry shader may output data
-     * to multiple streams, and end_primitive has no effect. Info about
-     * stream id for vertices is stored into the same place in memory where
-     * end primitive info is stored so early exit in this case.
-     */
-    if (iface->pGsState->outputTopology == TOP_POINT_LIST) {
-        return;
-    }
-
-    IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-    // A lane takes part only if it is enabled in mask_vec and its current
-    // primitive holds at least one vertex. The vector width is mVWidth, as
-    // in swr_gs_llvm_emit_vertex, rather than a hard-coded 8.
-    Value *mask = unwrap(mask_vec);
-    Value *cmpMask = VMASK(ICMP_NE(unwrap(verts_per_prim_vec), VIMMED1(0)));
-    mask = AND(mask, cmpMask);
-    Value *vMask1 = TRUNC(mask, getVectorType(mInt1Ty, mVWidth));
-
-    // Index of the last emitted vertex, split into a byte offset past the
-    // vertex-count header and a bit within that byte (8 bits per byte).
-    Value *vCount = SUB(unwrap(total_emitted_vertices_vec), VIMMED1(1));
-    Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), VIMMED1(VERTEX_COUNT_SIZE));
-    Value *vValue = SHL(VIMMED1(1), UREM(vCount, VIMMED1(8)));
-
-    vValue = TRUNC(vValue, getVectorType(mInt8Ty, mVWidth));
-
-    Value *pStack = STACKSAVE();
-    Value *pTmpPtr = ALLOCA(mInt8Ty, C(4)); // used for dummy read/write for lane masking
-
-    for (uint32_t lane = 0; lane < mVWidth; ++lane) {
-       Value *vLaneOffset = VEXTRACT(vOffset, C(lane));
-       Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
-       Value *pStreamOffset = GEP(pStream, vLaneOffset);
-
-       // Masked-off lanes read-modify-write the scratch byte instead.
-       Value *pLaneMask = VEXTRACT(vMask1, C(lane));
-       pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr);
-
-       Value *vVal = LOAD(pStreamOffset);
-       vVal = OR(vVal, VEXTRACT(vValue, C(lane)));
-       STORE(vVal, pStreamOffset);
-    }
-
-    STACKRESTORE(pStack);
-}
-
-// Emits IR that writes each lane's total emitted-vertex count into the first
-// DWORD of that lane's output stream. emitted_prims_vec and stream are not
-// used.
-void
-BuilderSWR::swr_gs_llvm_epilogue(const struct lp_build_gs_iface *gs_base,
-                        LLVMValueRef total_emitted_vertices_vec,
-                        LLVMValueRef emitted_prims_vec, unsigned stream)
-{
-   swr_gs_llvm_iface *gsIface = (swr_gs_llvm_iface *)gs_base;
-   Value *emittedCounts = unwrap(total_emitted_vertices_vec);
-
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-   uint32_t lane = 0;
-   while (lane < mVWidth) {
-      // View the start of the lane's stream as a 32-bit integer
-      Value *streamHead =
-         BITCAST(LOAD(gsIface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}),
-                 mInt32PtrTy);
-
-      STORE(VEXTRACT(emittedCounts, C(lane)), streamHead);
-      ++lane;
-   }
-}
-
-// Emits the TCS prologue: allocates a SIMD loop counter initialised to zero,
-// remembers it in the interface struct, opens an exec-mask loop and
-// publishes the counter's current value as the invocation id system value.
-void
-BuilderSWR::swr_tcs_llvm_emit_prologue(struct lp_build_tgsi_soa_context* bld)
-{
-   swr_tcs_llvm_iface *tcsIface = (swr_tcs_llvm_iface *)bld->tcs_iface;
-
-   Value *counter = ALLOCA(mSimdInt32Ty);
-   STORE(VBROADCAST(C(0)), counter);
-   tcsIface->loop_var = wrap(counter);
-
-   lp_exec_bgnloop(&bld->exec_mask, true);
-
-   // Re-sync with gallivm's builder before emitting the load of the counter
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-   bld->system_values.invocation_id = wrap(LOAD(counter));
-
-   if (!verbose_tcs_shader_loop) {
-      return;
-   }
-
-   lp_build_print_value(gallivm, "Prologue LOOP Iteration BEGIN:", bld->system_values.invocation_id);
-}
-
-// Emits the TCS epilogue: increments the loop counter created by the
-// prologue, breaks out of the exec-mask loop for lanes whose counter has
-// reached output_vertices, and closes the loop.
-void
-BuilderSWR::swr_tcs_llvm_emit_epilogue(struct lp_build_tgsi_soa_context* bld)
-{
-   swr_tcs_llvm_iface *tcsIface = (swr_tcs_llvm_iface *)bld->tcs_iface;
-   Value *counterPtr = unwrap(tcsIface->loop_var);
-
-   STORE(ADD(LOAD(counterPtr), VBROADCAST(C(1))), counterPtr);
-
-   if (verbose_tcs_shader_loop) {
-      lp_build_print_value(gallivm, "Epilogue LOOP: ", wrap(LOAD(counterPtr)));
-   }
-
-   // counter >= output_vertices, compared as unsigned integers
-   LLVMValueRef done =
-      lp_build_cmp(&bld->bld_base.uint_bld, PIPE_FUNC_GEQUAL,
-                   wrap(LOAD(counterPtr)),
-                   wrap(VBROADCAST(C(tcsIface->output_vertices))));
-
-   lp_exec_mask_cond_push(&bld->exec_mask, done);
-   lp_exec_break(&bld->exec_mask, &bld->bld_base.pc, false);
-   lp_exec_mask_cond_pop(&bld->exec_mask);
-
-   lp_exec_endloop(bld->bld_base.base.gallivm, &bld->exec_mask);
-}
-
-// Emits IR that fetches one channel (swizzle_index) of a TCS per-vertex
-// input attribute from the HS context.
-//
-// With direct indices a single load at
-// ctx.vert[vertex].attrib[map[attrib]][swizzle] is emitted and returned.
-// If either index is indirect, one load per element of the base type is
-// emitted: the indirect index is extracted for that element, the address is
-// additionally indexed with the element number, and the loaded value is
-// inserted into the result (initially bld_base->base.zero).
-//
-// When verbose_tcs_shader_in is set, debug prints of the indices and the
-// loaded values are emitted into the generated code as well.
-LLVMValueRef
-BuilderSWR::swr_tcs_llvm_fetch_input(const struct lp_build_tcs_iface *tcs_iface,
-                                     struct lp_build_tgsi_context * bld_base,
-                                     boolean is_vindex_indirect,
-                                     LLVMValueRef vertex_index,
-                                     boolean is_aindex_indirect,
-                                     LLVMValueRef attrib_index,
-                                     LLVMValueRef swizzle_index)
-{
-   swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface;
-
-   Value *vert_index = unwrap(vertex_index);
-   Value *attr_index = unwrap(attrib_index);
-
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-   if (verbose_tcs_shader_in) {
-      lp_build_printf(gallivm, "[TCS IN][VTX] ======================================\n");
-      lp_build_print_value(gallivm, "[TCS IN][VTX] vertex_index: ", vertex_index);
-      lp_build_print_value(gallivm, "[TCS IN][VTX] attrib_index: ", attrib_index);
-      lp_build_printf(gallivm, "[TCS IN][VTX] --------------------------------------\n");
-   }
-
-   Value *res = unwrap(bld_base->base.zero);
-   if (is_vindex_indirect || is_aindex_indirect) {
-      int i;
-      struct lp_type type = bld_base->base.type;
-
-      for (i = 0; i < type.length; i++) {
-         Value *vert_chan_index = vert_index;
-         Value *attr_chan_index = attr_index;
-
-         if (is_vindex_indirect) {
-            vert_chan_index = VEXTRACT(vert_index, C(i));
-         }
-         if (is_aindex_indirect) {
-            attr_chan_index = VEXTRACT(attr_index, C(i));
-         }
-
-         // Translate the shader's attribute index into the vertex slot
-         Value *attrib =
-            LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_chan_index}));
-
-         // The trailing C(i) selects element i of the addressed vector
-         Value *pBase = GEP(iface->pTcsCtx,
-                        { C(0), C(SWR_HS_CONTEXT_vert), vert_chan_index,
-                        C(simdvertex_attrib), attrib, unwrap(swizzle_index), C(i) });
-
-         Value *val = LOAD(pBase);
-
-         if (verbose_tcs_shader_in) {
-            lp_build_print_value(gallivm, "[TCS IN][VTX] vert_chan_index: ", wrap(vert_chan_index));
-            lp_build_print_value(gallivm, "[TCS IN][VTX] attrib_index: ", attrib_index);
-            // Print the index actually used for this element, not the
-            // whole attribute-index vector.
-            lp_build_print_value(gallivm, "[TCS IN][VTX] attr_chan_index: ", wrap(attr_chan_index));
-            lp_build_print_value(gallivm, "[TCS IN][VTX] attrib read from map: ", wrap(attrib));
-            lp_build_print_value(gallivm, "[TCS IN][VTX] swizzle_index: ", swizzle_index);
-            lp_build_print_value(gallivm, "[TCS IN][VTX] Loaded: ", wrap(val));
-         }
-         res = VINSERT(res, val, C(i));
-      }
-   } else {
-      Value *attrib = LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_index}));
-
-      Value *pBase = GEP(iface->pTcsCtx,
-                        { C(0), C(SWR_HS_CONTEXT_vert), vert_index,
-                        C(simdvertex_attrib), attrib, unwrap(swizzle_index) });
-
-      res = LOAD(pBase);
-
-      if (verbose_tcs_shader_in) {
-         lp_build_print_value(gallivm, "[TCS IN][VTX] attrib_index: ", attrib_index);
-         lp_build_print_value(gallivm, "[TCS IN][VTX] attr_chan_index: ", wrap(attr_index));
-         lp_build_print_value(gallivm, "[TCS IN][VTX] attrib read from map: ", wrap(attrib));
-         lp_build_print_value(gallivm, "[TCS IN][VTX] swizzle_index: ", swizzle_index);
-         lp_build_print_value(gallivm, "[TCS IN][VTX] Loaded: ", wrap(res));
-      }
-   }
-   if (verbose_tcs_shader_in) {
-      lp_build_print_value(gallivm, "[TCS IN][VTX] returning: ", wrap(res));
-   }
-   return wrap(res);
-}
-
-// Emits IR that reads back one channel (swizzle_index) of a TCS output, one
-// SIMD lane at a time, from that lane's entry in the HS context's pCPout
-// array. `name` selects what is read:
-//   - TGSI_SEMANTIC_TESSOUTER / TESSINNER: the outer / inner tess factors
-//   - TGSI_SEMANTIC_PATCH: patch data, slot taken from pPatchOutputAttribMap
-//   - anything else: control-point attribute, slot taken from
-//     pVtxOutputAttribMap
-// Each lane's value is inserted into the result, which starts out as
-// bld_base->base.zero. Debug prints are emitted when verbose_tcs_shader_in
-// is set.
-LLVMValueRef
-BuilderSWR::swr_tcs_llvm_fetch_output(const struct lp_build_tcs_iface *tcs_iface,
-                                      struct lp_build_tgsi_context * bld_base,
-                                      boolean is_vindex_indirect,
-                                      LLVMValueRef vertex_index,
-                                      boolean is_aindex_indirect,
-                                      LLVMValueRef attrib_index,
-                                      LLVMValueRef swizzle_index,
-                                      uint32_t name)
-{
-   swr_tcs_llvm_iface *tcsIface = (swr_tcs_llvm_iface *)tcs_iface;
-
-   Value *vtxIdx = unwrap(vertex_index);
-   Value *attrIdx = unwrap(attrib_index);
-
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-   if (verbose_tcs_shader_in) {
-      lp_build_print_value(gallivm, "[TCS INOUT] Vertex index: ", vertex_index);
-      lp_build_print_value(gallivm, "[TCS INOUT] Attrib index: ", wrap(attrIdx));
-      lp_build_print_value(gallivm, "[TCS INOUT] Swizzle index: ", swizzle_index);
-   }
-
-   // The semantic is a compile-time property, so classify it once
-   const bool isOuterFactor = (name == TGSI_SEMANTIC_TESSOUTER);
-   const bool isTessFactor = isOuterFactor || (name == TGSI_SEMANTIC_TESSINNER);
-   const bool isPatchData = (name == TGSI_SEMANTIC_PATCH);
-
-   Value *result = unwrap(bld_base->base.zero);
-
-   for (uint32_t lane = 0; lane < mVWidth; lane++) {
-      Value *cpOutBase = LOAD(tcsIface->pTcsCtx, {0, SWR_HS_CONTEXT_pCPout});
-      Value *lanePatch = GEP(cpOutBase, {lane});
-
-      Value *laneVtx = vtxIdx;
-      if (is_vindex_indirect) {
-         laneVtx = VEXTRACT(vtxIdx, C(lane));
-         if (verbose_tcs_shader_in) {
-            lp_build_print_value(gallivm, "[TCS INOUT] Extracted vertex index: ", wrap(laneVtx));
-         }
-      }
-
-      Value *laneAttr = attrIdx;
-      if (is_aindex_indirect) {
-         laneAttr = VEXTRACT(attrIdx, C(lane));
-         if (verbose_tcs_shader_in) {
-            lp_build_print_value(gallivm, "[TCS INOUT] Extracted attrib index: ", wrap(laneAttr));
-         }
-      }
-
-      if (isTessFactor) {
-         Value *factors = GEP(lanePatch, {(uint32_t)0, ScalarPatch_tessFactors});
-         Value *factorArray = isOuterFactor
-            ? GEP(factors, {(uint32_t)0, SWR_TESSELLATION_FACTORS_OuterTessFactors})
-            : GEP(factors, {(uint32_t)0, SWR_TESSELLATION_FACTORS_InnerTessFactors});
-         Value *factorPtr = GEP(factorArray, {C(0), unwrap(swizzle_index)});
-
-         result = VINSERT(result, LOAD(factorPtr), C(lane));
-         if (verbose_tcs_shader_in) {
-            lp_build_print_value(gallivm, "[TCS INOUT][FACTOR] lane (patch-id): ", wrap(C(lane)));
-            lp_build_print_value(gallivm, "[TCS INOUT][FACTOR] loaded value: ", wrap(result));
-         }
-         continue;
-      }
-
-      if (isPatchData) {
-         Value *patchSlot = LOAD(GEP(tcsIface->pPatchOutputAttribMap, {C(0), laneAttr}));
-         Value *patchPtr = GEP(lanePatch, {C(0), C(ScalarPatch_patchData), C(ScalarCPoint_attrib), patchSlot, unwrap(swizzle_index)});
-
-         result = VINSERT(result, LOAD(patchPtr), C(lane));
-         if (verbose_tcs_shader_in) {
-            lp_build_print_value(gallivm, "[TCS INOUT][PATCH] attr index loaded from map: ", wrap(patchSlot));
-            lp_build_print_value(gallivm, "[TCS INOUT][PATCH] lane (patch-id): ", wrap(C(lane)));
-            lp_build_print_value(gallivm, "[TCS INOUT][PATCH] loaded value: ", wrap(result));
-         }
-         continue;
-      }
-
-      // Generic per-control-point attribute
-      Value *vtxSlot = LOAD(GEP(tcsIface->pVtxOutputAttribMap, {C(0), laneAttr}));
-      if (verbose_tcs_shader_in) {
-         lp_build_print_value(gallivm, "[TCS INOUT][VTX] Attrib index from map: ", wrap(vtxSlot));
-      }
-      Value *chanPtr = GEP(lanePatch, {C(0), C(ScalarPatch_cp), laneVtx,
-                                       C(ScalarCPoint_attrib), vtxSlot, unwrap(swizzle_index)});
-
-      result = VINSERT(result, LOAD(chanPtr), C(lane));
-      if (verbose_tcs_shader_in) {
-         lp_build_print_value(gallivm, "[TCS INOUT][VTX] loaded value: ", wrap(result));
-      }
-   }
-
-   return wrap(result);
-}
-
-/**
- * Emit the IR that stores one channel of a TCS output for every SIMD lane.
- *
- * For each lane the per-lane output patch pointer is read from the HS
- * context (SWR_HS_CONTEXT_pCPout) and the destination is chosen by semantic:
- *  - TESSOUTER / TESSINNER: the matching tessellation factor array,
- *  - PATCH: the per-patch data, slot translated via pPatchOutputAttribMap,
- *  - anything else: the control point selected by lane 0 of the current
- *    invocation id, slot translated via pVtxOutputAttribMap.
- *
- * \param tcs_iface          interface object; cast to swr_tcs_llvm_iface
- * \param bld_base           TGSI build context; cast to the SoA context
- * \param name               TGSI semantic name of the output being written
- * \param is_vindex_indirect when set, lane 0 of vertex_index is extracted
- *                           (in this function the result only feeds the
- *                           debug prints)
- * \param vertex_index       vertex index value
- * \param is_aindex_indirect when set, lane 0 of attrib_index is extracted
- * \param attrib_index       output slot, looked up in the attribute maps
- * \param swizzle_index      channel to write
- * \param value              SIMD vector holding one value per lane
- * \param mask_vec           optional per-lane write mask; when non-null, a
- *                           lane whose (truncated) mask bit is clear keeps
- *                           the value already in memory
- */
-void
-BuilderSWR::swr_tcs_llvm_store_output(const struct lp_build_tcs_iface *tcs_iface,
-                                      struct lp_build_tgsi_context *bld_base,
-                                      unsigned name,
-                                      boolean is_vindex_indirect,
-                                      LLVMValueRef vertex_index,
-                                      boolean is_aindex_indirect,
-                                      LLVMValueRef attrib_index,
-                                      LLVMValueRef swizzle_index,
-                                      LLVMValueRef value,
-                                      LLVMValueRef mask_vec)
-{
-   swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface;
-   struct lp_build_tgsi_soa_context* bld = (struct lp_build_tgsi_soa_context*)bld_base;
-
-   // Keep the C++ IRBuilder in sync with the gallivm (C API) builder.
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-   if (verbose_tcs_shader_out) {
-      lp_build_printf(gallivm, "[TCS OUT] =============================================\n");
-      lp_build_print_value(gallivm, "[TCS OUT] Store mask: ", bld->exec_mask.exec_mask);
-      lp_build_print_value(gallivm, "[TCS OUT] Store value: ", value);
-   }
-
-   Value *vert_index = unwrap(vertex_index);
-   Value *attr_index = unwrap(attrib_index);
-
-   if (verbose_tcs_shader_out) {
-      lp_build_print_value(gallivm, "[TCS OUT] Vertex index: ", vertex_index);
-      lp_build_print_value(gallivm, "[TCS OUT] Attrib index: ", wrap(attr_index));
-      lp_build_print_value(gallivm, "[TCS OUT] Swizzle index: ", swizzle_index);
-   }
-
-   // Indirect indices arrive as vectors; only lane 0 is used.
-   if (is_vindex_indirect) {
-      vert_index = VEXTRACT(vert_index, C(0));
-      if (verbose_tcs_shader_out) {
-         // Print the extracted scalar, not the incoming vector.
-         lp_build_print_value(gallivm, "[TCS OUT] Extracted vertex index: ", wrap(vert_index));
-      }
-   }
-
-   if (is_aindex_indirect) {
-      attr_index = VEXTRACT(attr_index, C(0));
-      if (verbose_tcs_shader_out) {
-         lp_build_print_value(gallivm, "[TCS OUT] Extracted attrib index: ", wrap(attr_index));
-      }
-   }
-
-   if (verbose_tcs_shader_out) {
-      if (bld->exec_mask.has_mask) {
-         lp_build_print_value(gallivm, "[TCS OUT] Exec mask: ", bld->exec_mask.exec_mask);
-      }
-      else {
-         lp_build_printf(gallivm, "[TCS OUT] has no mask\n");
-      }
-   }
-
-   for (uint32_t lane = 0; lane < mVWidth; lane++) {
-      // Output patch belonging to this SIMD lane.
-      Value* p1 = LOAD(iface->pTcsCtx, {0, SWR_HS_CONTEXT_pCPout});
-      Value* pCpOut = GEP(p1, {lane});
-
-      if (name == TGSI_SEMANTIC_TESSOUTER || name == TGSI_SEMANTIC_TESSINNER) {
-         Value* tessFactors = GEP(pCpOut, {(uint32_t)0, ScalarPatch_tessFactors});
-         Value* tessFactorArray = nullptr;
-         if (name == TGSI_SEMANTIC_TESSOUTER) {
-            tessFactorArray = GEP(tessFactors, {(uint32_t)0, SWR_TESSELLATION_FACTORS_OuterTessFactors});
-         } else {
-            tessFactorArray = GEP(tessFactors, {(uint32_t)0, SWR_TESSELLATION_FACTORS_InnerTessFactors});
-         }
-         Value* tessFactor = GEP(tessFactorArray, {C(0), unwrap(swizzle_index)});
-         Value* valueToStore = VEXTRACT(unwrap(value), C(lane));
-         valueToStore = BITCAST(valueToStore, mFP32Ty);
-         if (mask_vec) {
-            // Masked-off lanes write back what is already stored.
-            Value *originalVal = LOAD(tessFactor);
-            Value *vMask = TRUNC(VEXTRACT(unwrap(mask_vec), C(lane)), mInt1Ty);
-            valueToStore = SELECT(vMask, valueToStore, originalVal);
-         }
-         STORE(valueToStore, tessFactor);
-         if (verbose_tcs_shader_out)
-         {
-            // mask_vec is optional (see the check above); only print it when present.
-            if (mask_vec) {
-               lp_build_print_value(gallivm, "[TCS OUT][FACTOR] Mask_vec mask: ", mask_vec);
-            }
-            lp_build_print_value(gallivm, "[TCS OUT][FACTOR] Stored value: ", wrap(valueToStore));
-         }
-      } else if (name == TGSI_SEMANTIC_PATCH) {
-         // Translate the shader's slot into the per-patch attribute slot.
-         Value* attrib = LOAD(GEP(iface->pPatchOutputAttribMap, {C(0), attr_index}));
-         if (verbose_tcs_shader_out) {
-            lp_build_print_value(gallivm, "[TCS OUT][PATCH] vert_index: ", wrap(vert_index));
-            lp_build_print_value(gallivm, "[TCS OUT][PATCH] attr_index: ", wrap(attr_index));
-            lp_build_print_value(gallivm, "[TCS OUT][PATCH] vert_index_indirect: ", wrap(C(is_vindex_indirect)));
-            lp_build_print_value(gallivm, "[TCS OUT][PATCH] attr_index_indirect: ", wrap(C(is_aindex_indirect)));
-            lp_build_print_value(gallivm, "[TCS OUT][PATCH] attr index loaded from map: ", wrap(attrib));
-         }
-         Value* attr = GEP(pCpOut, {C(0), C(ScalarPatch_patchData), C(ScalarCPoint_attrib), attrib});
-         Value* value_to_store = VEXTRACT(unwrap(value), C(lane));
-         if (verbose_tcs_shader_out) {
-            lp_build_print_value(gallivm, "[TCS OUT][PATCH] lane (patch-id): ", wrap(C(lane)));
-            lp_build_print_value(gallivm, "[TCS OUT][PATCH] value to store: ", value);
-            lp_build_print_value(gallivm, "[TCS OUT][PATCH] per-patch value to store: ", wrap(value_to_store));
-            lp_build_print_value(gallivm, "[TCS OUT][PATCH] chan_index: ", swizzle_index);
-         }
-         value_to_store = BITCAST(value_to_store, mFP32Ty);
-         if (mask_vec) {
-            Value *originalVal = LOADV(attr, {C(0), unwrap(swizzle_index)});
-            Value *vMask = TRUNC(VEXTRACT(unwrap(mask_vec), C(lane)), mInt1Ty);
-            value_to_store = SELECT(vMask, value_to_store, originalVal);
-            if (verbose_tcs_shader_out) {
-               lp_build_print_value(gallivm, "[TCS OUT][PATCH] store mask: ", mask_vec);
-               lp_build_print_value(gallivm, "[TCS OUT][PATCH] loaded original value: ", wrap(originalVal));
-               lp_build_print_value(gallivm, "[TCS OUT][PATCH] vMask: ", wrap(vMask));
-               lp_build_print_value(gallivm, "[TCS OUT][PATCH] selected value to store: ", wrap(value_to_store));
-            }
-         }
-         STOREV(value_to_store, attr, {C(0), unwrap(swizzle_index)});
-         if (verbose_tcs_shader_out) {
-            lp_build_print_value(gallivm, "[TCS OUT][PATCH] stored value: ", wrap(value_to_store));
-         }
-      } else {
-         Value* value_to_store = VEXTRACT(unwrap(value), C(lane));
-         // Translate the shader's slot into the per-vertex attribute slot.
-         Value* attrib = LOAD(GEP(iface->pVtxOutputAttribMap, {C(0), attr_index}));
-
-         if (verbose_tcs_shader_out) {
-            lp_build_printf(gallivm, "[TCS OUT] Writting attribute\n");
-            lp_build_print_value(gallivm, "[TCS OUT][VTX] invocation_id: ", bld->system_values.invocation_id);
-            lp_build_print_value(gallivm, "[TCS OUT][VTX] attribIndex: ", wrap(attr_index));
-            lp_build_print_value(gallivm, "[TCS OUT][VTX] attrib read from map: ", wrap(attrib));
-            lp_build_print_value(gallivm, "[TCS OUT][VTX] chan_index: ", swizzle_index);
-            lp_build_print_value(gallivm, "[TCS OUT][VTX] value: ", value);
-            lp_build_print_value(gallivm, "[TCS OUT][VTX] value_to_store: ", wrap(value_to_store));
-         }
-
-         // The control point written is picked by lane 0 of the invocation id.
-         Value* attr_chan = GEP(pCpOut, {C(0), C(ScalarPatch_cp),
-                                    VEXTRACT(unwrap(bld->system_values.invocation_id), C(0)),
-                                    C(ScalarCPoint_attrib), attrib, unwrap(swizzle_index)});
-
-         // Mask output values if needed
-         value_to_store = BITCAST(value_to_store, mFP32Ty);
-         if (mask_vec) {
-            Value *originalVal = LOAD(attr_chan);
-            Value *vMask = TRUNC(VEXTRACT(unwrap(mask_vec), C(lane)), mInt1Ty);
-            value_to_store = SELECT(vMask, value_to_store, originalVal);
-         }
-         STORE(value_to_store, attr_chan);
-         if (verbose_tcs_shader_out) {
-            // mask_vec is optional (see the check above); only print it when present.
-            if (mask_vec) {
-               lp_build_print_value(gallivm, "[TCS OUT][VTX] Mask_vec mask: ", mask_vec);
-            }
-            lp_build_print_value(gallivm, "[TCS OUT][VTX] stored: ", wrap(value_to_store));
-         }
-      }
-   }
-}
-
-/**
- * Emit the IR for a TCS barrier.
- *
- * Closes the invocation loop that is currently open (advance the loop
- * counter, break once it reaches output_vertices, end the loop), then resets
- * the counter to zero and opens a fresh loop whose invocation id is read
- * back from the counter.
- *
- * \param tcs_iface  interface object; cast to swr_tcs_llvm_iface
- * \param bld_base   TGSI build context; cast to the SoA context
- */
-void
-BuilderSWR::swr_tcs_llvm_emit_barrier(const struct lp_build_tcs_iface *tcs_iface,
-                                      struct lp_build_tgsi_context *bld_base)
-{
-   swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface;
-   struct lp_build_tgsi_soa_context* bld = (struct lp_build_tgsi_soa_context*)bld_base;
-
-   if (verbose_tcs_shader_loop) {
-      lp_build_print_value(gallivm, "Barrier LOOP: Iteration %d END\n", iface->loop_var);
-   }
-
-   struct lp_build_context *uint_bld = &bld->bld_base.uint_bld;
-
-   // loop_var += 1 in every lane
-   STORE(ADD(LOAD(unwrap(iface->loop_var)), VBROADCAST(C(1))), unwrap(iface->loop_var));
-
-   // Leave the current loop once loop_var >= output_vertices
-   LLVMValueRef tmp = lp_build_cmp(uint_bld, PIPE_FUNC_GEQUAL, wrap(LOAD(unwrap(iface->loop_var))),
-                                   wrap(VBROADCAST(C(iface->output_vertices))));
-
-   lp_exec_mask_cond_push(&bld->exec_mask, tmp);
-   lp_exec_break(&bld->exec_mask, &bld->bld_base.pc, false);
-   lp_exec_mask_cond_pop(&bld->exec_mask);
-   lp_exec_endloop(bld->bld_base.base.gallivm, &bld->exec_mask);
-
-   // The gallivm helpers may have moved to a new basic block; follow them.
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-   // Restart the counter and open the loop that follows the barrier
-   STORE(VBROADCAST(C(0)), unwrap(iface->loop_var));
-   lp_exec_bgnloop(&bld->exec_mask, true);
-
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-   // Inside the new loop the invocation id is the current counter value
-   bld->system_values.invocation_id = wrap((LOAD(unwrap(iface->loop_var))));
-
-   if (verbose_tcs_shader_loop) {
-      lp_build_print_value(gallivm, "Barrier LOOP: Iteration BEGIN: ", iface->loop_var);
-      lp_build_print_value(gallivm, "Barrier LOOP: InvocationId: \n", bld->system_values.invocation_id);
-   }
-}
-
-
-/**
- * Emit the IR that reads one channel of a per-patch TES input.
- *
- * The shader slot is translated through pPatchAttribMap and the value is
- * read from the patch data of the input patch (SWR_DS_CONTEXT_pCpIn).
- * With an indirect attribute index the lookup is repeated for every element
- * of the build type and the results are inserted one by one; otherwise the
- * single loaded value is broadcast.
- *
- * \param tes_iface          interface object; cast to swr_tes_llvm_iface
- * \param bld_base           TGSI build context (supplies the zero vector and
- *                           the vector length)
- * \param is_aindex_indirect true when attrib_index is a per-lane vector
- * \param attrib_index       input slot index (scalar or vector)
- * \param swizzle_index      channel to read
- * \return the fetched values as an LLVMValueRef
- */
-LLVMValueRef
-BuilderSWR::swr_tes_llvm_fetch_patch_input(const struct lp_build_tes_iface *tes_iface,
-                                     struct lp_build_tgsi_context * bld_base,
-                                     boolean is_aindex_indirect,
-                                     LLVMValueRef attrib_index,
-                                     LLVMValueRef swizzle_index)
-{
-   swr_tes_llvm_iface *iface = (swr_tes_llvm_iface*)tes_iface;
-   Value *attr_index = unwrap(attrib_index);
-   Value *res = unwrap(bld_base->base.zero);
-
-   // Keep the C++ IRBuilder in sync with the gallivm (C API) builder.
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-   if (verbose_shader) {
-      lp_build_printf(gallivm, "[TES IN][PATCH] --------------------------------------\n");
-   }
-
-   if (is_aindex_indirect) {
-      int i;
-      struct lp_type type = bld_base->base.type;
-
-      for (i = 0; i < type.length; i++) {
-         // Each lane carries its own attribute index.
-         Value *attr_chan_index = VEXTRACT(attr_index, C(i));
-
-         Value *attrib =
-            LOAD(GEP(iface->pPatchAttribMap, {C(0), attr_chan_index}));
-
-         Value *pCpIn = LOAD(iface->pTesCtx, {0, SWR_DS_CONTEXT_pCpIn}, "pCpIn");
-         Value *pPatchData = GEP(pCpIn, {(uint32_t)0, ScalarPatch_patchData});
-         Value *pAttr = GEP(pPatchData, {(uint32_t)0, ScalarCPoint_attrib});
-         Value *Val = LOADV(pAttr, {C(0), attrib, unwrap(swizzle_index)});
-         if (verbose_shader) {
-            lp_build_print_value(gallivm, "[TES IN][PATCH] attrib_index: ", attrib_index);
-            lp_build_print_value(gallivm, "[TES IN][PATCH] attr_chan_index: ", wrap(attr_chan_index));
-            lp_build_print_value(gallivm, "[TES IN][PATCH] attrib read from map: ", wrap(attrib));
-            lp_build_print_value(gallivm, "[TES IN][PATCH] swizzle_index: ", swizzle_index);
-            lp_build_print_value(gallivm, "[TES IN][PATCH] Loaded: ", wrap(Val));
-         }
-         res = VINSERT(res, Val, C(i));
-      }
-   } else {
-      Value *attrib = LOAD(GEP(iface->pPatchAttribMap, {C(0), attr_index}));
-
-      Value *pCpIn = LOAD(iface->pTesCtx, {(uint32_t)0, SWR_DS_CONTEXT_pCpIn}, "pCpIn");
-      Value *pPatchData = GEP(pCpIn, {(uint32_t)0, ScalarPatch_patchData});
-      Value *pAttr = GEP(pPatchData, {(uint32_t)0, ScalarCPoint_attrib});
-      Value *Val = LOADV(pAttr, {C(0), attrib, unwrap(swizzle_index)});
-      if (verbose_shader) {
-         lp_build_print_value(gallivm, "[TES IN][PATCH] attrib_index: ", attrib_index);
-         lp_build_print_value(gallivm, "[TES IN][PATCH] attr_chan_index: ", wrap(attr_index));
-         lp_build_print_value(gallivm, "[TES IN][PATCH] attrib read from map: ", wrap(attrib));
-         lp_build_print_value(gallivm, "[TES IN][PATCH] swizzle_index: ", swizzle_index);
-         lp_build_print_value(gallivm, "[TES IN][PATCH] Loaded: ", wrap(Val));
-      }
-      // Same value for every lane.
-      res = VBROADCAST(Val);
-   }
-   if (verbose_shader) {
-      lp_build_print_value(gallivm, "[TES IN][PATCH] returning: ", wrap(res));
-   }
-   return wrap(res);
-}
-
-
-
-/**
- * Emit the IR that reads one channel of a per-vertex TES input.
- *
- * The shader slot is translated through pVtxAttribMap and the value is read
- * from the selected control point of the input patch (SWR_DS_CONTEXT_pCpIn).
- * If either index is indirect the lookup is repeated for every element of
- * the build type, extracting the per-lane index where needed; otherwise the
- * single loaded value is broadcast.
- *
- * \param tes_iface          interface object; cast to swr_tes_llvm_iface
- * \param bld_base           TGSI build context (supplies the zero vector and
- *                           the vector length)
- * \param is_vindex_indirect true when vertex_index is a per-lane vector
- * \param vertex_index       control point index (scalar or vector)
- * \param is_aindex_indirect true when attrib_index is a per-lane vector
- * \param attrib_index       input slot index (scalar or vector)
- * \param swizzle_index      channel to read
- * \return the fetched values as an LLVMValueRef
- */
-LLVMValueRef
-BuilderSWR::swr_tes_llvm_fetch_vtx_input(const struct lp_build_tes_iface *tes_iface,
-                                     struct lp_build_tgsi_context * bld_base,
-                                     boolean is_vindex_indirect,
-                                     LLVMValueRef vertex_index,
-                                     boolean is_aindex_indirect,
-                                     LLVMValueRef attrib_index,
-                                     LLVMValueRef swizzle_index)
-{
-   swr_tes_llvm_iface *iface = (swr_tes_llvm_iface*)tes_iface;
-   Value *vert_index = unwrap(vertex_index);
-   Value *attr_index = unwrap(attrib_index);
-
-   // Keep the C++ IRBuilder in sync with the gallivm (C API) builder.
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-   if (verbose_shader) {
-      lp_build_printf(gallivm, "[TES IN][VTX] --------------------------------------\n");
-   }
-
-   Value *res = unwrap(bld_base->base.zero);
-   if (is_vindex_indirect || is_aindex_indirect) {
-      int i;
-      struct lp_type type = bld_base->base.type;
-
-      for (i = 0; i < type.length; i++) {
-         // Start from the shared values; replace whichever index is per-lane.
-         Value *vert_chan_index = vert_index;
-         Value *attr_chan_index = attr_index;
-
-         if (is_vindex_indirect) {
-            vert_chan_index = VEXTRACT(vert_index, C(i));
-         }
-         if (is_aindex_indirect) {
-            attr_chan_index = VEXTRACT(attr_index, C(i));
-         }
-
-         Value *attrib =
-            LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_chan_index}));
-
-         Value *pCpIn = LOAD(iface->pTesCtx, {0, SWR_DS_CONTEXT_pCpIn}, "pCpIn");
-         Value *pCp = GEP(pCpIn, {0, ScalarPatch_cp});
-         Value *pVertex = GEP(pCp, {(Value*)C(0), vert_chan_index});
-         Value *pAttrTab = GEP(pVertex, {uint32_t(0), uint32_t(0)});
-         Value *pAttr = GEP(pAttrTab, {(Value*)C(0), attrib});
-         Value *Val = LOADV(pAttr, {C(0), unwrap(swizzle_index)});
-         if (verbose_shader) {
-            lp_build_print_value(gallivm, "[TES IN][VTX] attrib_index: ", attrib_index);
-            // Report the index actually used for this lane.
-            lp_build_print_value(gallivm, "[TES IN][VTX] attr_chan_index: ", wrap(attr_chan_index));
-            lp_build_print_value(gallivm, "[TES IN][VTX] attrib read from map: ", wrap(attrib));
-            lp_build_print_value(gallivm, "[TES IN][VTX] swizzle_index: ", swizzle_index);
-            lp_build_print_value(gallivm, "[TES IN][VTX] Loaded: ", wrap(Val));
-         }
-         res = VINSERT(res, Val, C(i));
-      }
-   } else {
-      Value *attrib = LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_index}));
-
-      Value *pCpIn = LOAD(iface->pTesCtx, {0, SWR_DS_CONTEXT_pCpIn}, "pCpIn");
-      Value *pCp = GEP(pCpIn, {0, ScalarPatch_cp});
-      Value *pVertex = GEP(pCp, {(Value*)C(0), vert_index});
-      Value *pAttrTab = GEP(pVertex, {uint32_t(0), uint32_t(0)});
-      Value *pAttr = GEP(pAttrTab, {(Value*)C(0), attrib});
-      Value *Val = LOADV(pAttr, {C(0), unwrap(swizzle_index)});
-      if (verbose_shader) {
-         lp_build_print_value(gallivm, "[TES IN][VTX] attrib_index: ", attrib_index);
-         lp_build_print_value(gallivm, "[TES IN][VTX] attr_chan_index: ", wrap(attr_index));
-         lp_build_print_value(gallivm, "[TES IN][VTX] attrib read from map: ", wrap(attrib));
-         lp_build_print_value(gallivm, "[TES IN][VTX] swizzle_index: ", swizzle_index);
-         lp_build_print_value(gallivm, "[TES IN][VTX] Loaded: ", wrap(Val));
-      }
-      // Same value for every lane.
-      res = VBROADCAST(Val);
-   }
-   if (verbose_shader) {
-      lp_build_print_value(gallivm, "[TES IN][VTX] returning: ", wrap(res));
-   }
-   return wrap(res);
-}
-
-
-
-
-/**
- * Fill in the SWR geometry shader state for the bound GS and JIT-compile it.
- *
- * Clears ctx->gs->gsState and derives its fields from the shader's TGSI
- * properties, builds an LLVM function named "GS" from the shader's TGSI
- * tokens, then verifies, compiles and jits it.
- *
- * \param ctx  driver context; ctx->gs and ctx->vs are read
- * \param key  JIT key; key.sampler is passed to the sampler generator
- * \return the jitted geometry shader entry point
- */
-PFN_GS_FUNC
-BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
-{
-   SWR_GS_STATE *pGS = &ctx->gs->gsState;
-   struct tgsi_shader_info *info = &ctx->gs->info.base;
-
-   memset(pGS, 0, sizeof(*pGS));
-
-   pGS->gsEnable = true;
-
-   pGS->numInputAttribs = (VERTEX_ATTRIB_START_SLOT - VERTEX_POSITION_SLOT) + info->num_inputs;
-   pGS->outputTopology =
-      swr_convert_prim_topology(info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM], 0);
-
-   /* It's +1 because emit_vertex in swr is always called exactly one time more
-    * than max_vertices passed in Geometry Shader. We need to allocate more memory
-    * to avoid crash/memory overwritten.
-    */
-   pGS->maxNumVerts = info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] + 1;
-   pGS->instanceCount = info->properties[TGSI_PROPERTY_GS_INVOCATIONS];
-
-   // If point primitive then assume to use multiple streams
-   if(pGS->outputTopology == TOP_POINT_LIST) {
-      pGS->isSingleStream = false;
-   } else {
-      pGS->isSingleStream = true;
-      pGS->singleStreamID = 0;
-   }
-
-   pGS->vertexAttribOffset = VERTEX_POSITION_SLOT;
-   pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset;
-   pGS->outputVertexSize = SWR_VTX_NUM_SLOTS;
-   pGS->controlDataSize = 8; // GS outputs max of 8 32B units
-   pGS->controlDataOffset = VERTEX_COUNT_SIZE;
-   pGS->outputVertexOffset = pGS->controlDataOffset + CONTROL_HEADER_SIZE;
-
-   pGS->allocationSize =
-      VERTEX_COUNT_SIZE + // vertex count
-      CONTROL_HEADER_SIZE + // control header
-      (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex
-      pGS->maxNumVerts; // num verts
-
-   struct swr_geometry_shader *gs = ctx->gs;
-
-   LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
-   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
-
-   memset(outputs, 0, sizeof(outputs));
-
-   AttrBuilder attrBuilder;
-   attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));
-
-   // Signature: void GS(swr_draw_context *, i8 *, SWR_GS_CONTEXT *)
-   std::vector<Type *> gsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0),
-                              PointerType::get(mInt8Ty, 0),
-                              PointerType::get(Gen_SWR_GS_CONTEXT(JM()), 0)};
-   FunctionType *gsFuncType =
-      FunctionType::get(Type::getVoidTy(JM()->mContext), gsArgs, false);
-
-   // create new geometry shader function
-   auto pFunction = Function::Create(gsFuncType,
-                                     GlobalValue::ExternalLinkage,
-                                     "GS",
-                                     JM()->mpCurrentModule);
-#if LLVM_VERSION_MAJOR < 5
-   AttributeSet attrSet = AttributeSet::get(
-      JM()->mContext, AttributeSet::FunctionIndex, attrBuilder);
-   pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet);
-#else
-   pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder);
-#endif
-
-   // Point both the C++ IRBuilder and the gallivm builder at the entry block.
-   BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction);
-   IRB()->SetInsertPoint(block);
-   LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block));
-
-   auto argitr = pFunction->arg_begin();
-   Value *hPrivateData = &*argitr++;
-   hPrivateData->setName("hPrivateData");
-   Value *pWorkerData = &*argitr++;
-   pWorkerData->setName("pWorkerData");
-   Value *pGsCtx = &*argitr++;
-   pGsCtx->setName("gsCtx");
-
-   Value *consts_ptr =
-      GEP(hPrivateData, {C(0), C(swr_draw_context_constantGS)});
-   consts_ptr->setName("gs_constants");
-   Value *const_sizes_ptr =
-      GEP(hPrivateData, {0, swr_draw_context_num_constantsGS});
-   const_sizes_ptr->setName("num_gs_constants");
-
-   struct lp_build_sampler_soa *sampler =
-      swr_sampler_soa_create(key.sampler, PIPE_SHADER_GEOMETRY);
-   assert(sampler != nullptr);
-
-   struct lp_bld_tgsi_system_values system_values;
-   memset(&system_values, 0, sizeof(system_values));
-   system_values.prim_id = wrap(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_PrimitiveID}));
-   system_values.invocation_id = wrap(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_InstanceID}));
-
-   // Build the GS-input-slot -> vertex-attribute-slot table read by the
-   // input fetch callback.
-   Value *vtxAttribMap = ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS));
-   for (unsigned slot = 0; slot < info->num_inputs; slot++) {
-      ubyte semantic_name = info->input_semantic_name[slot];
-      ubyte semantic_idx = info->input_semantic_index[slot];
-
-      unsigned vs_slot = locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base);
-      assert(vs_slot < PIPE_MAX_SHADER_OUTPUTS);
-
-      vs_slot += VERTEX_ATTRIB_START_SLOT;
-
-      // When the VS writes position as output 0, the slots after it shift down.
-      if (ctx->vs->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION)
-         vs_slot--;
-
-      if (semantic_name == TGSI_SEMANTIC_POSITION)
-         vs_slot = VERTEX_POSITION_SLOT;
-
-      STORE(C(vs_slot), vtxAttribMap, {0, slot});
-   }
-
-   struct lp_build_mask_context mask;
-   Value *mask_val = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_mask}, "gsMask");
-   lp_build_mask_begin(&mask, gallivm,
-                       lp_type_float_vec(32, 32 * 8), wrap(mask_val));
-
-   // zero out cut buffer so we can load/modify/store bits
-   for (uint32_t lane = 0; lane < mVWidth; ++lane)
-   {
-      Value* pStream = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
-#if LLVM_VERSION_MAJOR >= 10
-      MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, MaybeAlign(sizeof(float) * KNOB_SIMD_WIDTH));
-#else
-      MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, sizeof(float) * KNOB_SIMD_WIDTH);
-#endif
-   }
-
-   // Callbacks and state handed to the TGSI translator for GS-specific ops.
-   struct swr_gs_llvm_iface gs_iface;
-   gs_iface.base.fetch_input = ::swr_gs_llvm_fetch_input;
-   gs_iface.base.emit_vertex = ::swr_gs_llvm_emit_vertex;
-   gs_iface.base.end_primitive = ::swr_gs_llvm_end_primitive;
-   gs_iface.base.gs_epilogue = ::swr_gs_llvm_epilogue;
-   gs_iface.pBuilder = this;
-   gs_iface.pGsCtx = pGsCtx;
-   gs_iface.pGsState = pGS;
-   gs_iface.num_outputs = gs->info.base.num_outputs;
-   gs_iface.num_verts_per_prim =
-      u_vertices_per_prim((pipe_prim_type)info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]);
-   gs_iface.info = info;
-   gs_iface.pVtxAttribMap = vtxAttribMap;
-
-   struct lp_build_tgsi_params params;
-   memset(&params, 0, sizeof(params));
-   params.type = lp_type_float_vec(32, 32 * 8);
-   params.mask = & mask;
-   params.consts_ptr = wrap(consts_ptr);
-   params.const_sizes_ptr = wrap(const_sizes_ptr);
-   params.system_values = &system_values;
-   params.inputs = inputs;
-   params.context_ptr = wrap(hPrivateData);
-   params.sampler = sampler;
-   params.info = &gs->info.base;
-   params.gs_iface = &gs_iface.base;
-
-   // Translate the TGSI tokens into LLVM IR
-   lp_build_tgsi_soa(gallivm,
-                     gs->pipe.tokens,
-                     &params,
-                     outputs);
-
-   lp_build_mask_end(&mask);
-
-   sampler->destroy(sampler);
-
-   // Translation may have ended in a different block; follow it before
-   // emitting the return.
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-   RET_VOID();
-
-   gallivm_verify_function(gallivm, wrap(pFunction));
-   gallivm_compile_module(gallivm);
-
-   PFN_GS_FUNC pFunc =
-      (PFN_GS_FUNC)gallivm_jit_function(gallivm, wrap(pFunction));
-
-   debug_printf("geom shader  %p\n", pFunc);
-   assert(pFunc && "Error: GeomShader = NULL");
-
-   JM()->mIsModuleFinalized = true;
-
-   return pFunc;
-}
-
-PFN_TES_FUNC
-BuilderSWR::CompileTES(struct swr_context *ctx, swr_jit_tes_key &key)
-{
-   SWR_TS_STATE *pTS = &ctx->tsState;
-   struct tgsi_shader_info *info = &ctx->tes->info.base;
-
-   // tessellation is enabled if TES is present
-   // clear tessellation state here then
-   memset(pTS, 0, sizeof(*pTS));
-
-   pTS->tsEnable = true;
-
-   unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE];
-   unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING];
-   bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW];
-   bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE];
-   SWR_TS_DOMAIN type = SWR_TS_ISOLINE;
-   SWR_TS_PARTITIONING partitioning = SWR_TS_EVEN_FRACTIONAL;
-   SWR_TS_OUTPUT_TOPOLOGY topology = SWR_TS_OUTPUT_POINT;
-   PRIMITIVE_TOPOLOGY postDSTopology = TOP_POINT_LIST;
-
-   // TESS_TODO: move this to helper functions to improve readability
-   switch (tes_prim_mode) {
-   case PIPE_PRIM_LINES:
-      type = SWR_TS_ISOLINE;
-      postDSTopology = TOP_LINE_LIST;
-      break;
-   case PIPE_PRIM_TRIANGLES:
-      type = SWR_TS_TRI;
-      postDSTopology = TOP_TRIANGLE_LIST;
-      break;
-   case PIPE_PRIM_QUADS:
-      type = SWR_TS_QUAD;
-      // See OpenGL spec - quads are tessellated into triangles
-      postDSTopology = TOP_TRIANGLE_LIST;
-      break;
-   default:
-      assert(0);
-   }
-
-   switch (tes_spacing) {
-   case PIPE_TESS_SPACING_FRACTIONAL_ODD:
-      partitioning = SWR_TS_ODD_FRACTIONAL;
-      break;
-   case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
-      partitioning = SWR_TS_EVEN_FRACTIONAL;
-      break;
-   case PIPE_TESS_SPACING_EQUAL:
-      partitioning = SWR_TS_INTEGER;
-      break;
-   default:
-      assert(0);
-   }
-
-   if (tes_point_mode) {
-      topology = SWR_TS_OUTPUT_POINT;
-      postDSTopology = TOP_POINT_LIST;
-   }
-   else if (tes_prim_mode == PIPE_PRIM_LINES) {
-      topology = SWR_TS_OUTPUT_LINE;
-   }
-   else if (tes_vertex_order_cw) {
-      topology = SWR_TS_OUTPUT_TRI_CW;
-   }
-   else {
-      topology = SWR_TS_OUTPUT_TRI_CCW;
-   }
-
-   pTS->domain = type;
-   pTS->tsOutputTopology = topology;
-   pTS->partitioning = partitioning;
-   pTS->numDsOutputAttribs = info->num_outputs;
-   pTS->postDSTopology = postDSTopology;
-
-   pTS->dsAllocationSize = SWR_VTX_NUM_SLOTS * MAX_NUM_VERTS_PER_PRIM;
-   pTS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT;
-   pTS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT;
-   pTS->dsOutVtxAttribOffset = VERTEX_ATTRIB_START_SLOT;
-
-   struct swr_tess_evaluation_shader *tes = ctx->tes;
-
-   LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
-   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
-
-   memset(outputs, 0, sizeof(outputs));
-
-   AttrBuilder attrBuilder;
-   attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));
-
-   std::vector<Type *> tesArgs{PointerType::get(Gen_swr_draw_context(JM()), 0),
-                               PointerType::get(mInt8Ty, 0),
-                               PointerType::get(Gen_SWR_DS_CONTEXT(JM()), 0)};
-   FunctionType *tesFuncType =
-      FunctionType::get(Type::getVoidTy(JM()->mContext), tesArgs, false);
-
-   // create new vertex shader function
-   auto pFunction = Function::Create(tesFuncType,
-                                     GlobalValue::ExternalLinkage,
-                                     "TES",
-                                     JM()->mpCurrentModule);
-
-#if LLVM_VERSION_MAJOR < 5
-   AttributeSet attrSet = AttributeSet::get(
-      JM()->mContext, AttributeSet::FunctionIndex, attrBuilder);
-   pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet);
-#else
-   pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder);
-#endif
-
-   BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction);
-   IRB()->SetInsertPoint(block);
-   LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block));
-
-   auto argitr = pFunction->arg_begin();
-   Value *hPrivateData = &*argitr++;
-   hPrivateData->setName("hPrivateData");
-   Value *pWorkerData = &*argitr++;
-   pWorkerData->setName("pWorkerData");
-   Value *pTesCtx = &*argitr++;
-   pTesCtx->setName("tesCtx");
-
-   Value *consts_ptr =
-      GEP(hPrivateData, {C(0), C(swr_draw_context_constantTES)});
-   consts_ptr->setName("tes_constants");
-   Value *const_sizes_ptr =
-      GEP(hPrivateData, {0, swr_draw_context_num_constantsTES});
-   const_sizes_ptr->setName("num_tes_constants");
-
-   struct lp_build_sampler_soa *sampler =
-      swr_sampler_soa_create(key.sampler, PIPE_SHADER_TESS_EVAL);
-   assert(sampler != nullptr);
-
-   struct lp_bld_tgsi_system_values system_values;
-   memset(&system_values, 0, sizeof(system_values));
-
-   // Load and calculate system values
-   // Tessellation coordinates (gl_TessCoord)
-   Value *vecOffset = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_vectorOffset}, "vecOffset");
-   Value *vecStride = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_vectorStride}, "vecStride");
-   Value *vecIndex  = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_vectorOffset});
-
-   Value* tess_coord = ALLOCA(ArrayType::get(mSimdFP32Ty, 3));
-
-   Value *tessCoordU = LOADV(LOAD(pTesCtx, {0, SWR_DS_CONTEXT_pDomainU}), {vecIndex}, "tessCoordU");
-   STORE(tessCoordU, tess_coord, {0, 0});
-   Value *tessCoordV = LOADV(LOAD(pTesCtx, {0, SWR_DS_CONTEXT_pDomainV}), {vecIndex}, "tessCoordV");
-   STORE(tessCoordV, tess_coord, {0, 1});
-   Value *tessCoordW = FSUB(FSUB(VIMMED1(1.0f), tessCoordU), tessCoordV, "tessCoordW");
-   STORE(tessCoordW, tess_coord, {0, 2});
-   system_values.tess_coord = wrap(tess_coord);
-
-   // Primitive ID
-   system_values.prim_id = wrap(VBROADCAST(LOAD(pTesCtx, {0, SWR_DS_CONTEXT_PrimitiveID}), "PrimitiveID"));
-
-   // Tessellation factors
-   Value* pPatch = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_pCpIn});
-   Value* pTessFactors = GEP(pPatch, {C(0), C(ScalarPatch_tessFactors)});
-
-   assert(SWR_NUM_OUTER_TESS_FACTORS == 4);
-   Value* sys_value_outer_factors = UndefValue::get(getVectorType(mFP32Ty, 4));
-   for (unsigned i = 0; i < SWR_NUM_OUTER_TESS_FACTORS; i++) {
-      Value* v = LOAD(pTessFactors, {0, SWR_TESSELLATION_FACTORS_OuterTessFactors, i});
-      sys_value_outer_factors = VINSERT(sys_value_outer_factors, v, i, "gl_TessLevelOuter");
-   }
-   system_values.tess_outer = wrap(sys_value_outer_factors);
-
-   assert(SWR_NUM_INNER_TESS_FACTORS == 2);
-   Value* sys_value_inner_factors = UndefValue::get(getVectorType(mFP32Ty, 4));
-   for (unsigned i = 0; i < SWR_NUM_INNER_TESS_FACTORS; i++) {
-      Value* v = LOAD(pTessFactors, {0, SWR_TESSELLATION_FACTORS_InnerTessFactors, i});
-      sys_value_inner_factors = VINSERT(sys_value_inner_factors, v, i, "gl_TessLevelInner");
-   }
-   system_values.tess_inner = wrap(sys_value_inner_factors);
-
-   if (verbose_shader)
-   {
-      lp_build_print_value(gallivm, "tess_coord = ", system_values.tess_coord);
-   }
-
-   struct tgsi_shader_info *pPrevShader = nullptr;
-
-   if (ctx->tcs) {
-      pPrevShader = &ctx->tcs->info.base;
-   }
-   else {
-      pPrevShader = &ctx->vs->info.base;
-   }
-
-   // Figure out how many per-patch attributes we have
-   unsigned perPatchAttrs = 0;
-   unsigned genericAttrs = 0;
-   unsigned tessLevelAttrs = 0;
-   unsigned sgvAttrs = 0;
-   for (unsigned slot = 0; slot < pPrevShader->num_outputs; slot++) {
-      switch (pPrevShader->output_semantic_name[slot]) {
-      case TGSI_SEMANTIC_PATCH:
-         perPatchAttrs++;
-         break;
-      case TGSI_SEMANTIC_GENERIC:
-         genericAttrs++;
-         break;
-      case TGSI_SEMANTIC_TESSINNER:
-      case TGSI_SEMANTIC_TESSOUTER:
-         tessLevelAttrs++;
-         break;
-      case TGSI_SEMANTIC_POSITION:
-      case TGSI_SEMANTIC_CLIPDIST:
-      case TGSI_SEMANTIC_PSIZE:
-         sgvAttrs++;
-         break;
-      default:
-         assert(!"Unknown semantic input in TES");
-      }
-   }
-
-   std::vector<Constant *> mapConstants;
-   Value *vtxAttribMap = ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS));
-   Value *patchAttribMap = ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS));
-   for (unsigned slot = 0; slot < info->num_inputs; slot++) {
-      ubyte semantic_name = info->input_semantic_name[slot];
-      ubyte semantic_idx = info->input_semantic_index[slot];
-
-      // Where in TCS output is my attribute?
-      // TESS_TODO: revisit after implement pass-through TCS
-      unsigned tcs_slot = locate_linkage(semantic_name, semantic_idx, pPrevShader);
-      assert(tcs_slot < PIPE_MAX_SHADER_OUTPUTS);
-
-      // Skip tessellation levels - these go to the tessellator, not TES
-      switch (semantic_name) {
-      case TGSI_SEMANTIC_GENERIC:
-         tcs_slot = tcs_slot + VERTEX_ATTRIB_START_SLOT - sgvAttrs - tessLevelAttrs;
-         break;
-      case TGSI_SEMANTIC_PATCH:
-         tcs_slot = semantic_idx;
-         break;
-      case TGSI_SEMANTIC_POSITION:
-         tcs_slot = VERTEX_POSITION_SLOT;
-         break;
-      case TGSI_SEMANTIC_CLIPDIST:
-      case TGSI_SEMANTIC_PSIZE:
-         break;
-      default:
-         assert(!"Unexpected semantic found while building TES input map");
-      }
-      if (semantic_name == TGSI_SEMANTIC_PATCH) {
-         STORE(C(tcs_slot), patchAttribMap, {0, slot});
-      } else {
-         STORE(C(tcs_slot), vtxAttribMap, {0, slot});
-      }
-      mapConstants.push_back(C(tcs_slot));
-   }
-
-   // Build execution mask
-   struct lp_build_mask_context mask;
-   Value *mask_val = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_mask}, "tesMask");
-
-   if (verbose_shader)
-      lp_build_print_value(gallivm, "TES execution mask: ", wrap(mask_val));
-
-   lp_build_mask_begin(&mask, gallivm,
-                       lp_type_float_vec(32, 32 * 8), wrap(mask_val));
-
-   struct swr_tes_llvm_iface tes_iface;
-
-   tes_iface.base.fetch_vertex_input = ::swr_tes_llvm_fetch_vtx_input;
-   tes_iface.base.fetch_patch_input = ::swr_tes_llvm_fetch_patch_input;
-
-   tes_iface.pBuilder = this;
-   tes_iface.pTesCtx = pTesCtx;
-   tes_iface.pTsState = pTS;
-   tes_iface.num_outputs = tes->info.base.num_outputs;
-   tes_iface.info = info;
-   tes_iface.pVtxAttribMap = vtxAttribMap;
-   tes_iface.pPatchAttribMap = patchAttribMap;
-
-   struct lp_build_tgsi_params params;
-   memset(&params, 0, sizeof(params));
-   params.type = lp_type_float_vec(32, 32 * 8);
-   params.mask = & mask;
-   params.consts_ptr = wrap(consts_ptr);
-   params.const_sizes_ptr = wrap(const_sizes_ptr);
-   params.system_values = &system_values;
-   params.inputs = inputs;
-   params.context_ptr = wrap(hPrivateData);
-   params.sampler = sampler;
-   params.info = &tes->info.base;
-   params.tes_iface = &tes_iface.base;
-
-   // Build LLVM IR
-   lp_build_tgsi_soa(gallivm,
-                     tes->pipe.tokens,
-                     &params,
-                     outputs);
-
-   lp_build_mask_end(&mask);
-
-   sampler->destroy(sampler);
-
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-   // Write output attributes
-   Value *dclOut = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_pOutputData}, "dclOut");
-
-   for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) {
-      for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) {
-         if (!outputs[attrib][channel])
-            continue;
-
-         Value *val = LOAD(unwrap(outputs[attrib][channel]));;
-         Value *attribOffset =
-            LOAD(pTesCtx, {0, SWR_DS_CONTEXT_outVertexAttribOffset});
-
-         // Assume we write position
-         Value* outputSlot = C(VERTEX_POSITION_SLOT);
-         if (tes->info.base.output_semantic_name[attrib] != TGSI_SEMANTIC_POSITION) {
-            // No, it's a generic attribute, not a position - let's calculate output slot
-            uint32_t outSlot = attrib;
-            if (tes->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION) {
-               // this shader will write position, so in shader's term
-               // output starts at attrib 1, but we will handle that separately,
-               // so let's fix the outSlot
-               outSlot--;
-            }
-            outputSlot = ADD(attribOffset, C(outSlot));
-         }
-
-         Value *attribVecIndex =
-            ADD(MUL(vecStride, MUL(outputSlot, C(4))), vecOffset);
-
-         uint32_t outputComponent = 0;
-         uint32_t curComp = outputComponent + channel;
-         auto outValIndex = ADD(attribVecIndex, MUL(vecStride, C(curComp)));
-         STOREV(val, dclOut, {outValIndex});
-
-         if (verbose_shader) {
-             lp_build_printf(gallivm,
-                            "TES output [%d][%d]",
-                            C(attrib),
-                            C(channel));
-            lp_build_print_value(gallivm, " = ", wrap(val));
-         }
-      }
-   }
-
-   RET_VOID();
-
-   JM()->DumpToFile(pFunction, "src");
-   gallivm_verify_function(gallivm, wrap(pFunction));
-
-   gallivm_compile_module(gallivm);
-   JM()->DumpToFile(pFunction, "optimized");
-
-   PFN_TES_FUNC pFunc =
-      (PFN_TES_FUNC)gallivm_jit_function(gallivm, wrap(pFunction));
-
-   debug_printf("tess evaluation shader  %p\n", pFunc);
-   assert(pFunc && "Error: TessEvaluationShader = NULL");
-
-   JM()->DumpAsm(pFunction, "asm");
-
-   JM()->mIsModuleFinalized = true;
-
-   return pFunc;
-}
-/* Build, verify and JIT-compile the LLVM function "TCS" for the tessellation
- * control shader in ctx->tcs, using the sampler state from key.sampler.
- *
- * Side effects visible here: attribute counts, allocation size and attribute
- * offsets are written into ctx->tsState, and the JIT manager's module is
- * marked as finalized.
- *
- * Returns the pointer obtained from gallivm_jit_function(); a NULL result is
- * only caught by assert().
- */
-PFN_TCS_FUNC
-BuilderSWR::CompileTCS(struct swr_context *ctx, swr_jit_tcs_key &key)
-{
-   SWR_TS_STATE *pTS = &ctx->tsState;
-   struct tgsi_shader_info *info = &ctx->tcs->info.base;
-
-   pTS->numHsInputAttribs = info->num_inputs;
-   pTS->numHsOutputAttribs = info->num_outputs;
-
-   pTS->hsAllocationSize = sizeof(ScalarPatch);
-
-   pTS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT;
-   pTS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT;
-
-   struct swr_tess_control_shader *tcs = ctx->tcs;
-
-   LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];  // never filled in this function; presumably inputs are fetched via tcs_iface.emit_fetch_input - TODO confirm
-   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
-
-   memset(outputs, 0, sizeof(outputs));
-
-   AttrBuilder attrBuilder;
-   attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));  // stack aligned to one full float vector
-
-   std::vector<Type *> tcsArgs{  // (swr_draw_context *, i8 *, SWR_HS_CONTEXT *)
-      PointerType::get(Gen_swr_draw_context(JM()), 0),
-      PointerType::get(mInt8Ty, 0),
-      PointerType::get(Gen_SWR_HS_CONTEXT(JM()), 0)};
-   FunctionType *tcsFuncType =
-      FunctionType::get(Type::getVoidTy(JM()->mContext), tcsArgs, false);
-
-   // create new tess control shader function
-   auto pFunction = Function::Create(tcsFuncType,
-                                     GlobalValue::ExternalLinkage,
-                                     "TCS",
-                                     JM()->mpCurrentModule);
-
-#if LLVM_VERSION_MAJOR < 5  // function attributes go through AttributeSet here, AttributeList otherwise
-   AttributeSet attrSet = AttributeSet::get(
-      JM()->mContext, AttributeSet::FunctionIndex, attrBuilder);
-   pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet);
-#else
-   pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder);
-#endif
-
-   BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction);
-   IRB()->SetInsertPoint(block);
-   LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block));  // keep the gallivm C builder at the same position as IRB()
-
-   auto argitr = pFunction->arg_begin();
-   Value *hPrivateData = &*argitr++;
-   hPrivateData->setName("hPrivateData");
-   Value *pWorkerData = &*argitr++;  // only named; not referenced again in this function
-   pWorkerData->setName("pWorkerData");
-   Value *pTcsCtx = &*argitr++;
-   pTcsCtx->setName("tcsCtx");
-
-   Value *consts_ptr =
-      GEP(hPrivateData, {C(0), C(swr_draw_context_constantTCS)});
-   consts_ptr->setName("tcs_constants");
-   Value *const_sizes_ptr =
-      GEP(hPrivateData, {0, swr_draw_context_num_constantsTCS});
-   const_sizes_ptr->setName("num_tcs_constants");
-
-   struct lp_build_sampler_soa *sampler =
-      swr_sampler_soa_create(key.sampler, PIPE_SHADER_TESS_CTRL);
-   assert(sampler != nullptr);
-
-   struct lp_bld_tgsi_system_values system_values;
-   memset(&system_values, 0, sizeof(system_values));
-
-   system_values.prim_id =
-      wrap(LOAD(pTcsCtx, {0, SWR_HS_CONTEXT_PrimitiveID}));
-
-   system_values.invocation_id = wrap(VBROADCAST(C(0)));  // constant 0 broadcast to all lanes
-   system_values.vertices_in = wrap(C(tcs->vertices_per_patch));
-
-   if (verbose_shader) {
-      lp_build_print_value(gallivm, "TCS::prim_id = ", system_values.prim_id);
-      lp_build_print_value(gallivm, "TCS::invocation_id = ", system_values.invocation_id);
-      lp_build_print_value(gallivm, "TCS::vertices_in = ", system_values.vertices_in);
-   }
-
-   std::vector<Constant *> mapConstants;  // appended to below but never read in this function
-   Value *vtxAttribMap =
-      ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS));
-
-   for (unsigned slot = 0; slot < info->num_inputs; slot++) {  // map each TCS input to the slot of the matching VS output
-      ubyte semantic_name = info->input_semantic_name[slot];
-      ubyte semantic_idx = info->input_semantic_index[slot];
-
-      unsigned vs_slot =
-         locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base);
-      assert(vs_slot < PIPE_MAX_SHADER_OUTPUTS);
-
-      vs_slot += VERTEX_ATTRIB_START_SLOT;
-
-      if (ctx->vs->info.base.output_semantic_name[0]
-          == TGSI_SEMANTIC_POSITION)
-         vs_slot--;  // VS output 0 is position, which has its own slot, so the other outputs shift down by one
-
-      if (semantic_name == TGSI_SEMANTIC_POSITION)
-         vs_slot = VERTEX_POSITION_SLOT;
-
-      STORE(C(vs_slot), vtxAttribMap, {0, slot});
-      mapConstants.push_back(C(vs_slot));
-   }
-
-   // Prepare map of output attributes. Needed when shader instance wants
-   // to read own output or output of other instance, which is allowed in TCS
-   Value *vtxOutputAttribMap =
-      ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS));  // NOTE(review): sized with the INPUTS limit but indexed by output slot - confirm the limits match
-   // Map for per-patch attributes
-   Value *patchOutputAttribMap =
-      ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS));
-   for (unsigned slot = 0; slot < info->num_outputs; slot++) {
-      ubyte name = info->output_semantic_name[slot];
-      int32_t idx = info->output_semantic_index[slot];
-      if (name == TGSI_SEMANTIC_PATCH) {
-         STORE(C(idx), patchOutputAttribMap, {0, slot});  // per-patch outputs map to their semantic index
-      } else {
-         int32_t target_slot = slot;
-         if (name == TGSI_SEMANTIC_GENERIC) {
-            target_slot += VERTEX_ATTRIB_START_SLOT;
-         }
-         // Now normalize target slot: subtract one for every earlier output
-         // that is a tess level, per-patch or position output
-         for (ubyte as = 0; as < slot; as++) {
-            ubyte name = info->output_semantic_name[as];  // shadows the outer `name` inside this loop only
-            switch (name) {
-               case TGSI_SEMANTIC_TESSOUTER:
-               case TGSI_SEMANTIC_TESSINNER:
-               case TGSI_SEMANTIC_PATCH:
-               case TGSI_SEMANTIC_POSITION:
-                  target_slot--;
-            }
-         }
-         if (name == TGSI_SEMANTIC_POSITION) {  // outer `name` again: position always goes to its fixed slot
-            target_slot = VERTEX_POSITION_SLOT;
-         }
-         STORE(C(target_slot), vtxOutputAttribMap, {0, slot});
-         mapConstants.push_back(C(target_slot));
-      }
-   }
-
-   struct lp_build_mask_context mask;
-   Value *mask_val = LOAD(pTcsCtx, {0, SWR_HS_CONTEXT_mask}, "tcsMask");  // execution mask taken from the HS context
-   lp_build_mask_begin(
-      &mask, gallivm, lp_type_float_vec(32, 32 * 8), wrap(mask_val));  // width is hard-coded as 32 * 8 here, while CompileVS derives it from mVWidth/mVWidth16
-
-   struct swr_tcs_llvm_iface tcs_iface;
-
-   tcs_iface.base.emit_store_output = ::swr_tcs_llvm_store_output;
-   tcs_iface.base.emit_fetch_input = ::swr_tcs_llvm_fetch_input;
-   tcs_iface.base.emit_fetch_output = ::swr_tcs_llvm_fetch_output;
-   tcs_iface.base.emit_barrier = ::swr_tcs_llvm_emit_barrier;
-   tcs_iface.base.emit_prologue = ::swr_tcs_llvm_emit_prologue;
-   tcs_iface.base.emit_epilogue = ::swr_tcs_llvm_emit_epilogue;
-
-   tcs_iface.pBuilder = this;
-   tcs_iface.pTcsCtx = pTcsCtx;
-   tcs_iface.pTsState = pTS;
-   tcs_iface.output_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
-   tcs_iface.info = info;
-   tcs_iface.pVtxAttribMap = vtxAttribMap;
-   tcs_iface.pVtxOutputAttribMap = vtxOutputAttribMap;
-   tcs_iface.pPatchOutputAttribMap = patchOutputAttribMap;
-
-   struct lp_build_tgsi_params params;
-   memset(&params, 0, sizeof(params));  // members not assigned below stay zero
-   params.type = lp_type_float_vec(32, 32 * 8);
-   params.mask = &mask;
-   params.consts_ptr = wrap(consts_ptr);
-   params.const_sizes_ptr = wrap(const_sizes_ptr);
-   params.system_values = &system_values;
-   params.inputs = inputs;
-   params.context_ptr = wrap(hPrivateData);
-   params.sampler = sampler;
-   params.info = &tcs->info.base;
-   params.tcs_iface = &tcs_iface.base;
-
-   lp_build_tgsi_soa(gallivm, tcs->pipe.tokens, &params, outputs);  // translate the TGSI tokens into LLVM IR
-
-   lp_build_mask_end(&mask);
-
-   sampler->destroy(sampler);
-
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));  // continue in the block where the gallivm builder stopped
-   RET_VOID();
-
-   JM()->DumpToFile(pFunction, "src");
-   gallivm_verify_function(gallivm, wrap(pFunction));
-   gallivm_compile_module(gallivm);
-   JM()->DumpToFile(pFunction, "optimized");
-
-   PFN_TCS_FUNC pFunc =
-      (PFN_TCS_FUNC)gallivm_jit_function(gallivm, wrap(pFunction));
-
-   debug_printf("tess control shader  %p\n", pFunc);
-   assert(pFunc && "Error: TessControlShader = NULL");
-   JM()->DumpAsm(pFunction, "asm");
-
-   JM()->mIsModuleFinalized = true;
-
-   return pFunc;
-}
-
-
-// Compiles the geometry shader variant for `key` with a fresh BuilderSWR
-// named "GS", stores the result in ctx->gs->map under `key`, and returns
-// the function pointer produced by BuilderSWR::CompileGS().
-PFN_GS_FUNC
-swr_compile_gs(struct swr_context *ctx, swr_jit_gs_key &key)
-{
-   JitManager *jitMgr =
-      reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr);
-   BuilderSWR gsBuilder(jitMgr, "GS");
-
-   PFN_GS_FUNC gsFunc = gsBuilder.CompileGS(ctx, key);
-
-   // The variant object takes the builder's gallivm state together with
-   // the compiled entry point.
-   std::unique_ptr<VariantGS> variant(
-      new VariantGS(gsBuilder.gallivm, gsFunc));
-   ctx->gs->map.insert(std::make_pair(key, std::move(variant)));
-
-   return gsFunc;
-}
-
-// Compiles the tessellation control shader variant for `key` with a fresh
-// BuilderSWR named "TCS", stores the result in ctx->tcs->map under `key`,
-// and returns the function pointer produced by BuilderSWR::CompileTCS().
-PFN_TCS_FUNC
-swr_compile_tcs(struct swr_context *ctx, swr_jit_tcs_key &key)
-{
-   JitManager *jitMgr =
-      reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr);
-   BuilderSWR tcsBuilder(jitMgr, "TCS");
-
-   PFN_TCS_FUNC tcsFunc = tcsBuilder.CompileTCS(ctx, key);
-
-   // The variant object takes the builder's gallivm state together with
-   // the compiled entry point.
-   std::unique_ptr<VariantTCS> variant(
-      new VariantTCS(tcsBuilder.gallivm, tcsFunc));
-   ctx->tcs->map.insert(std::make_pair(key, std::move(variant)));
-
-   return tcsFunc;
-}
-
-// Compiles the tessellation evaluation shader variant for `key` with a
-// fresh BuilderSWR named "TES", stores the result in ctx->tes->map under
-// `key`, and returns the function pointer produced by
-// BuilderSWR::CompileTES().
-PFN_TES_FUNC
-swr_compile_tes(struct swr_context *ctx, swr_jit_tes_key &key)
-{
-   JitManager *jitMgr =
-      reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr);
-   BuilderSWR tesBuilder(jitMgr, "TES");
-
-   PFN_TES_FUNC tesFunc = tesBuilder.CompileTES(ctx, key);
-
-   // The variant object takes the builder's gallivm state together with
-   // the compiled entry point.
-   std::unique_ptr<VariantTES> variant(
-      new VariantTES(tesBuilder.gallivm, tesFunc));
-   ctx->tes->map.insert(std::make_pair(key, std::move(variant)));
-
-   return tesFunc;
-}
-
-// Emits IR that stores one vertex shader output value.
-//
-//   pVal        value to store
-//   pVsContext  VS context pointer (only read in the SIMD16-frontend path)
-//   pVtxOutput  pointer to the output vertex structure
-//   slot        destination attribute slot
-//   channel     destination component within the slot
-void
-BuilderSWR::WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput, unsigned slot, unsigned channel)
-{
-#if USE_SIMD16_FRONTEND && !USE_SIMD16_VS
-   // The destination is a simd16vertex while the shader produces simdvertex
-   // components, so slot and component indices are doubled and the
-   // alternate offset from the VS context selects the interleaved half:
-   //   slot16offset = slot8offset * 2
-   //   comp16offset = comp8offset * 2 + alternateOffset
-
-   Value *altOffset = LOAD(pVsContext, { 0, SWR_VS_CONTEXT_AlternateOffset });
-   Value *pSlotBase = GEP(pVtxOutput, { C(0), C(0), C(slot * 2), altOffset } );
-   STORE(pVal, pSlotBase, {channel * 2});
-#else
-   Value *pSlotBase = GEP(pVtxOutput, {0, 0, slot});
-   STORE(pVal, pSlotBase, {0, channel});
-
-   // Optional trace of every store, emitted into the generated shader.
-   if (!verbose_vs_shader)
-      return;
-
-   lp_build_printf(gallivm, "VS: Storing on slot %d, channel %d: ", C(slot), C(channel));
-   lp_build_print_value(gallivm, "", wrap(pVal));
-#endif
-}
-/* Build, verify and JIT-compile the LLVM function "VS" for the vertex shader
- * in ctx->vs, using the sampler state from key.sampler.
- *
- * After the TGSI body, the generated code stores every written output
- * channel through WriteVS(). When user clip planes are enabled or the shader
- * writes cull distances, it also fills the clip/cull distance slots.
- *
- * Returns the pointer obtained from gallivm_jit_function(); a NULL result is
- * only caught by assert(). Marks the JIT manager's module as finalized.
- */
-PFN_VERTEX_FUNC
-BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
-{
-   struct swr_vertex_shader *swr_vs = ctx->vs;
-
-   LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];  // not zeroed; only channels in the usage mask are filled below
-   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
-
-   memset(outputs, 0, sizeof(outputs));
-
-   AttrBuilder attrBuilder;
-   attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));  // stack aligned to one full float vector
-
-   std::vector<Type *> vsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0),  // (swr_draw_context *, i8 *, SWR_VS_CONTEXT *)
-                              PointerType::get(mInt8Ty, 0),
-                              PointerType::get(Gen_SWR_VS_CONTEXT(JM()), 0)};
-   FunctionType *vsFuncType =
-      FunctionType::get(Type::getVoidTy(JM()->mContext), vsArgs, false);
-
-   // create new vertex shader function
-   auto pFunction = Function::Create(vsFuncType,
-                                     GlobalValue::ExternalLinkage,
-                                     "VS",
-                                     JM()->mpCurrentModule);
-#if LLVM_VERSION_MAJOR < 5  // function attributes go through AttributeSet here, AttributeList otherwise
-   AttributeSet attrSet = AttributeSet::get(
-      JM()->mContext, AttributeSet::FunctionIndex, attrBuilder);
-   pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet);
-#else
-   pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder);
-#endif
-
-   BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction);
-   IRB()->SetInsertPoint(block);
-   LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block));  // keep the gallivm C builder at the same position as IRB()
-
-   auto argitr = pFunction->arg_begin();
-   Value *hPrivateData = &*argitr++;
-   hPrivateData->setName("hPrivateData");
-   Value *pWorkerData = &*argitr++;  // only named; not referenced again in this function
-   pWorkerData->setName("pWorkerData");
-   Value *pVsCtx = &*argitr++;
-   pVsCtx->setName("vsCtx");
-
-   Value *consts_ptr = GEP(hPrivateData, {C(0), C(swr_draw_context_constantVS)});
-
-   consts_ptr->setName("vs_constants");
-   Value *const_sizes_ptr =
-      GEP(hPrivateData, {0, swr_draw_context_num_constantsVS});
-   const_sizes_ptr->setName("num_vs_constants");
-
-   Value *vtxInput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVin});
-#if USE_SIMD16_VS
-   vtxInput = BITCAST(vtxInput, PointerType::get(Gen_simd16vertex(JM()), 0));
-#endif
-
-   for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) {  // load every input channel the shader actually uses
-      const unsigned mask = swr_vs->info.base.input_usage_mask[attrib];
-      for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) {
-         if (mask & (1 << channel)) {
-            inputs[attrib][channel] =
-               wrap(LOAD(vtxInput, {0, 0, attrib, channel}));
-         }
-      }
-   }
-
-   struct lp_build_sampler_soa *sampler =
-      swr_sampler_soa_create(key.sampler, PIPE_SHADER_VERTEX);
-   assert(sampler != nullptr);
-
-   struct lp_bld_tgsi_system_values system_values;
-   memset(&system_values, 0, sizeof(system_values));
-   system_values.instance_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_InstanceID}));
-
-#if USE_SIMD16_VS
-   system_values.vertex_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_VertexID16}));
-#else
-   system_values.vertex_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_VertexID}));
-#endif
-
-#if USE_SIMD16_VS
-   uint32_t vectorWidth = mVWidth16;
-#else
-   uint32_t vectorWidth = mVWidth;
-#endif
-
-   struct lp_build_tgsi_params params;
-   memset(&params, 0, sizeof(params));  // members not assigned below (e.g. mask) stay zero
-   params.type = lp_type_float_vec(32, 32 * vectorWidth);
-   params.consts_ptr = wrap(consts_ptr);
-   params.const_sizes_ptr = wrap(const_sizes_ptr);
-   params.system_values = &system_values;
-   params.inputs = inputs;
-   params.context_ptr = wrap(hPrivateData);
-   params.sampler = sampler;
-   params.info = &swr_vs->info.base;
-
-   lp_build_tgsi_soa(gallivm,  // translate the TGSI tokens into LLVM IR
-                     swr_vs->pipe.tokens,
-                     &params,
-                     outputs);
-
-   sampler->destroy(sampler);
-
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));  // continue in the block where the gallivm builder stopped
-
-   Value *vtxOutput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVout});
-#if USE_SIMD16_VS
-   vtxOutput = BITCAST(vtxOutput, PointerType::get(Gen_simd16vertex(JM()), 0));
-#endif
-
-   for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) {
-      for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) {
-         if (!outputs[attrib][channel])  // entry still zero after lp_build_tgsi_soa: nothing to store
-            continue;
-
-         Value *val;
-         uint32_t outSlot;
-
-         if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) {
-            if (channel != VERTEX_SGV_POINT_SIZE_COMP)  // point size is stored once, into its dedicated component
-               continue;
-            val = LOAD(unwrap(outputs[attrib][0]));  // the value itself is read from channel 0 of the output
-            outSlot = VERTEX_SGV_SLOT;
-         } else if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_POSITION) {
-            val = LOAD(unwrap(outputs[attrib][channel]));
-            outSlot = VERTEX_POSITION_SLOT;
-         } else {
-            val = LOAD(unwrap(outputs[attrib][channel]));
-            outSlot = VERTEX_ATTRIB_START_SLOT + attrib;
-            if (swr_vs->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION)
-               outSlot--;  // output 0 is position, which has its own slot, so the other outputs shift down by one
-         }
-
-         WriteVS(val, pVsCtx, vtxOutput, outSlot, channel);
-      }
-   }
-
-   if (ctx->rasterizer->clip_plane_enable ||
-       swr_vs->info.base.culldist_writemask) {
-      unsigned clip_mask = ctx->rasterizer->clip_plane_enable;
-
-      unsigned cv = 0;  // output index used as the clip vertex; stays 0 if no position output is found below
-      if (swr_vs->info.base.writes_clipvertex) {
-         cv = locate_linkage(TGSI_SEMANTIC_CLIPVERTEX, 0,
-                             &swr_vs->info.base);
-      } else {
-         for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) {  // no clip vertex written: fall back to the position output
-            if (swr_vs->info.base.output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
-                swr_vs->info.base.output_semantic_index[i] == 0) {
-               cv = i;
-               break;
-            }
-         }
-      }
-      assert(cv < PIPE_MAX_SHADER_OUTPUTS);
-      LLVMValueRef cx = LLVMBuildLoad(gallivm->builder, outputs[cv][0], "");
-      LLVMValueRef cy = LLVMBuildLoad(gallivm->builder, outputs[cv][1], "");
-      LLVMValueRef cz = LLVMBuildLoad(gallivm->builder, outputs[cv][2], "");
-      LLVMValueRef cw = LLVMBuildLoad(gallivm->builder, outputs[cv][3], "");
-
-      tgsi_shader_info *pLastFE = &ctx->vs->info.base;  // info of the last bound stage, checked in the order GS, TES, TCS, else this VS
-
-      if (ctx->gs) {
-         pLastFE = &ctx->gs->info.base;
-      }
-      else if (ctx->tes) {
-         pLastFE = &ctx->tes->info.base;
-      }
-      else if (ctx->tcs) {
-         pLastFE = &ctx->tcs->info.base;
-      }
-
-      for (unsigned val = 0; val < PIPE_MAX_CLIP_PLANES; val++) {
-         // clip distance overrides user clip planes
-         if ((pLastFE->clipdist_writemask & clip_mask & (1 << val)) ||
-             ((pLastFE->culldist_writemask << pLastFE->num_written_clipdistance) & (1 << val))) {
-            unsigned cv = locate_linkage(TGSI_SEMANTIC_CLIPDIST, val < 4 ? 0 : 1, pLastFE);  // shadows the outer cv; NOTE(review): index comes from pLastFE but is applied to this VS's outputs[] - confirm when pLastFE is not the VS
-            assert(cv < PIPE_MAX_SHADER_OUTPUTS);
-            if (val < 4) {
-               LLVMValueRef dist = LLVMBuildLoad(gallivm->builder, outputs[cv][val], "");
-               WriteVS(unwrap(dist), pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_LO_SLOT, val);
-            } else {
-               LLVMValueRef dist = LLVMBuildLoad(gallivm->builder, outputs[cv][val - 4], "");
-               WriteVS(unwrap(dist), pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4);
-            }
-            continue;
-         }
-
-         if (!(clip_mask & (1 << val)))  // this user clip plane is disabled
-            continue;
-
-         Value *px = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 0}));
-         Value *py = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 1}));
-         Value *pz = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 2}));
-         Value *pw = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 3}));
-#if USE_SIMD16_VS
-         Value *bpx = VBROADCAST_16(px);
-         Value *bpy = VBROADCAST_16(py);
-         Value *bpz = VBROADCAST_16(pz);
-         Value *bpw = VBROADCAST_16(pw);
-#else
-         Value *bpx = VBROADCAST(px);
-         Value *bpy = VBROADCAST(py);
-         Value *bpz = VBROADCAST(pz);
-         Value *bpw = VBROADCAST(pw);
-#endif
-         Value *dist = FADD(FMUL(unwrap(cx), bpx),  // dot product of the clip vertex with the plane coefficients
-                            FADD(FMUL(unwrap(cy), bpy),
-                                 FADD(FMUL(unwrap(cz), bpz),
-                                      FMUL(unwrap(cw), bpw))));
-
-         if (val < 4)  // planes 0-3 go to the LO slot, 4 and up to the HI slot
-            WriteVS(dist, pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_LO_SLOT, val);
-         else
-            WriteVS(dist, pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4);
-      }
-   }
-
-   RET_VOID();
-
-   JM()->DumpToFile(pFunction, "vs_function1");
-   gallivm_verify_function(gallivm, wrap(pFunction));
-   gallivm_compile_module(gallivm);
-   JM()->DumpToFile(pFunction, "vs_function2");
-
-   //   lp_debug_dump_value(func);
-
-   PFN_VERTEX_FUNC pFunc =
-      (PFN_VERTEX_FUNC)gallivm_jit_function(gallivm, wrap(pFunction));
-
-   JM()->DumpAsm(pFunction, "vs_function_asm");
-   debug_printf("vert shader  %p\n", pFunc);
-   assert(pFunc && "Error: VertShader = NULL");
-
-   JM()->mIsModuleFinalized = true;
-
-   return pFunc;
-}
-
-// Compiles the vertex shader variant for `key` with a fresh BuilderSWR
-// named "VS", stores the result in ctx->vs->map under `key`, and returns
-// the function pointer produced by BuilderSWR::CompileVS().
-// Returns NULL without compiling when the shader has no TGSI tokens.
-PFN_VERTEX_FUNC
-swr_compile_vs(struct swr_context *ctx, swr_jit_vs_key &key)
-{
-   if (ctx->vs->pipe.tokens) {
-      JitManager *jitMgr =
-         reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr);
-      BuilderSWR vsBuilder(jitMgr, "VS");
-
-      PFN_VERTEX_FUNC vsFunc = vsBuilder.CompileVS(ctx, key);
-
-      // The variant object takes the builder's gallivm state together
-      // with the compiled entry point.
-      std::unique_ptr<VariantVS> variant(
-         new VariantVS(vsBuilder.gallivm, vsFunc));
-      ctx->vs->map.insert(std::make_pair(key, std::move(variant)));
-
-      return vsFunc;
-   }
-
-   return NULL;
-}
-
-// Translates a shader output index into the vertex slot it occupies.
-//
-// Without a vertex shader the result is simply
-// in_attrib + VERTEX_ATTRIB_START_SLOT. With one, position maps to
-// VERTEX_POSITION_SLOT, point size and layer map to VERTEX_SGV_SLOT, and
-// any other output uses the generic slot, lowered by one when the shader
-// writes position.
-unsigned
-swr_so_adjust_attrib(unsigned in_attrib,
-                     swr_vertex_shader *swr_vs)
-{
-   const unsigned genericSlot = in_attrib + VERTEX_ATTRIB_START_SLOT;
-
-   if (!swr_vs)
-      return genericSlot;
-
-   switch (swr_vs->info.base.output_semantic_name[in_attrib]) {
-   case TGSI_SEMANTIC_POSITION:
-      return VERTEX_POSITION_SLOT;
-   case TGSI_SEMANTIC_PSIZE:
-   case TGSI_SEMANTIC_LAYER:
-      return VERTEX_SGV_SLOT;
-   default:
-      break;
-   }
-
-   return swr_vs->info.base.writes_position ? genericSlot - 1 : genericSlot;
-}
-
-// Returns the index of the first output in `info` whose semantic name and
-// semantic index equal `name` and `index`. All PIPE_MAX_SHADER_OUTPUTS
-// entries are scanned; 0xFFFFFFFF is returned when none matches.
-static unsigned
-locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info)
-{
-   unsigned slot = 0;
-
-   while (slot < PIPE_MAX_SHADER_OUTPUTS) {
-      const bool sameName = info->output_semantic_name[slot] == name;
-      if (sameName && info->output_semantic_index[slot] == index)
-         return slot;
-      ++slot;
-   }
-
-   return 0xFFFFFFFF;
-}
-
-PFN_PIXEL_KERNEL
-BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key)
-{
-   struct swr_fragment_shader *swr_fs = ctx->fs;
-
-   struct tgsi_shader_info *pPrevShader;
-   if (ctx->gs)
-      pPrevShader = &ctx->gs->info.base;
-   else if (ctx->tes)
-      pPrevShader = &ctx->tes->info.base;
-   else
-      pPrevShader = &ctx->vs->info.base;
-
-   LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
-   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
-
-   memset(inputs, 0, sizeof(inputs));
-   memset(outputs, 0, sizeof(outputs));
-
-   struct lp_build_sampler_soa *sampler = NULL;
-
-   AttrBuilder attrBuilder;
-   attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));
-
-   std::vector<Type *> fsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0),
-                              PointerType::get(mInt8Ty, 0),
-                              PointerType::get(Gen_SWR_PS_CONTEXT(JM()), 0)};
-   FunctionType *funcType =
-      FunctionType::get(Type::getVoidTy(JM()->mContext), fsArgs, false);
-
-   auto pFunction = Function::Create(funcType,
-                                     GlobalValue::ExternalLinkage,
-                                     "FS",
-                                     JM()->mpCurrentModule);
-#if LLVM_VERSION_MAJOR < 5
-   AttributeSet attrSet = AttributeSet::get(
-      JM()->mContext, AttributeSet::FunctionIndex, attrBuilder);
-   pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet);
-#else
-   pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder);
-#endif
-
-   BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction);
-   IRB()->SetInsertPoint(block);
-   LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block));
-
-   auto args = pFunction->arg_begin();
-   Value *hPrivateData = &*args++;
-   hPrivateData->setName("hPrivateData");
-   Value *pWorkerData = &*args++;
-   pWorkerData->setName("pWorkerData");
-   Value *pPS = &*args++;
-   pPS->setName("psCtx");
-
-   Value *consts_ptr = GEP(hPrivateData, {0, swr_draw_context_constantFS});
-   consts_ptr->setName("fs_constants");
-   Value *const_sizes_ptr =
-      GEP(hPrivateData, {0, swr_draw_context_num_constantsFS});
-   const_sizes_ptr->setName("num_fs_constants");
-
-   // load *pAttribs, *pPerspAttribs
-   Value *pRawAttribs = LOAD(pPS, {0, SWR_PS_CONTEXT_pAttribs}, "pRawAttribs");
-   Value *pPerspAttribs =
-      LOAD(pPS, {0, SWR_PS_CONTEXT_pPerspAttribs}, "pPerspAttribs");
-
-   swr_fs->constantMask = 0;
-   swr_fs->flatConstantMask = 0;
-   swr_fs->pointSpriteMask = 0;
-
-   for (int attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) {
-      const unsigned mask = swr_fs->info.base.input_usage_mask[attrib];
-      const unsigned interpMode = swr_fs->info.base.input_interpolate[attrib];
-      const unsigned interpLoc = swr_fs->info.base.input_interpolate_loc[attrib];
-
-      if (!mask)
-         continue;
-
-      // load i,j
-      Value *vi = nullptr, *vj = nullptr;
-      switch (interpLoc) {
-      case TGSI_INTERPOLATE_LOC_CENTER:
-         vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI, PixelPositions_center}, "i");
-         vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ, PixelPositions_center}, "j");
-         break;
-      case TGSI_INTERPOLATE_LOC_CENTROID:
-         vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI, PixelPositions_centroid}, "i");
-         vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ, PixelPositions_centroid}, "j");
-         break;
-      case TGSI_INTERPOLATE_LOC_SAMPLE:
-         vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI, PixelPositions_sample}, "i");
-         vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ, PixelPositions_sample}, "j");
-         break;
-      }
-
-      // load/compute w
-      Value *vw = nullptr, *pAttribs;
-      if (interpMode == TGSI_INTERPOLATE_PERSPECTIVE ||
-          interpMode == TGSI_INTERPOLATE_COLOR) {
-         pAttribs = pPerspAttribs;
-         switch (interpLoc) {
-         case TGSI_INTERPOLATE_LOC_CENTER:
-            vw = VRCP(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_center}));
-            break;
-         case TGSI_INTERPOLATE_LOC_CENTROID:
-            vw = VRCP(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_centroid}));
-            break;
-         case TGSI_INTERPOLATE_LOC_SAMPLE:
-            vw = VRCP(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_sample}));
-            break;
-         }
-      } else {
-         pAttribs = pRawAttribs;
-         vw = VIMMED1(1.f);
-      }
-
-      vw->setName("w");
-
-      ubyte semantic_name = swr_fs->info.base.input_semantic_name[attrib];
-      ubyte semantic_idx = swr_fs->info.base.input_semantic_index[attrib];
-
-      if (semantic_name == TGSI_SEMANTIC_FACE) {
-         Value *ff =
-            UI_TO_FP(LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), mFP32Ty);
-         ff = FSUB(FMUL(ff, C(2.0f)), C(1.0f));
-         ff = VECTOR_SPLAT(JM()->mVWidth, ff, "vFrontFace");
-
-         inputs[attrib][0] = wrap(ff);
-         inputs[attrib][1] = wrap(VIMMED1(0.0f));
-         inputs[attrib][2] = wrap(VIMMED1(0.0f));
-         inputs[attrib][3] = wrap(VIMMED1(1.0f));
-         continue;
-      } else if (semantic_name == TGSI_SEMANTIC_POSITION) { // gl_FragCoord
-         if (swr_fs->info.base.properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] ==
-             TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER) {
-            inputs[attrib][0] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vX, PixelPositions_center}, "vX"));
-            inputs[attrib][1] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vY, PixelPositions_center}, "vY"));
-         } else {
-            inputs[attrib][0] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vX, PixelPositions_UL}, "vX"));
-            inputs[attrib][1] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vY, PixelPositions_UL}, "vY"));
-         }
-         inputs[attrib][2] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vZ}, "vZ"));
-         inputs[attrib][3] =
-            wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_center}, "vOneOverW"));
-         continue;
-      } else if (semantic_name == TGSI_SEMANTIC_LAYER) { // gl_Layer
-         Value *ff = LOAD(pPS, {0, SWR_PS_CONTEXT_renderTargetArrayIndex});
-         ff = VECTOR_SPLAT(JM()->mVWidth, ff, "vRenderTargetArrayIndex");
-         inputs[attrib][0] = wrap(ff);
-         inputs[attrib][1] = wrap(VIMMED1(0.0f));
-         inputs[attrib][2] = wrap(VIMMED1(0.0f));
-         inputs[attrib][3] = wrap(VIMMED1(0.0f));
-         continue;
-      } else if (semantic_name == TGSI_SEMANTIC_VIEWPORT_INDEX) { // gl_ViewportIndex
-         Value *ff = LOAD(pPS, {0, SWR_PS_CONTEXT_viewportIndex});
-         ff = VECTOR_SPLAT(JM()->mVWidth, ff, "vViewportIndex");
-         inputs[attrib][0] = wrap(ff);
-         inputs[attrib][1] = wrap(VIMMED1(0.0f));
-         inputs[attrib][2] = wrap(VIMMED1(0.0f));
-         inputs[attrib][3] = wrap(VIMMED1(0.0f));
-         continue;
-      }
-      unsigned linkedAttrib =
-         locate_linkage(semantic_name, semantic_idx, pPrevShader) - 1;
-
-      uint32_t extraAttribs = 0;
-      if (semantic_name == TGSI_SEMANTIC_PRIMID && !ctx->gs) {
-         /* non-gs generated primID - need to grab from swizzleMap override */
-         linkedAttrib = pPrevShader->num_outputs - 1;
-         swr_fs->constantMask |= 1 << linkedAttrib;
-         extraAttribs++;
-      } else if (semantic_name == TGSI_SEMANTIC_GENERIC &&
-          key.sprite_coord_enable & (1 << semantic_idx)) {
-         /* we add an extra attrib to the backendState in swr_update_derived. */
-         linkedAttrib = pPrevShader->num_outputs + extraAttribs - 1;
-         swr_fs->pointSpriteMask |= (1 << linkedAttrib);
-         extraAttribs++;
-      } else if (linkedAttrib + 1 == 0xFFFFFFFF) {
-         inputs[attrib][0] = wrap(VIMMED1(0.0f));
-         inputs[attrib][1] = wrap(VIMMED1(0.0f));
-         inputs[attrib][2] = wrap(VIMMED1(0.0f));
-         inputs[attrib][3] = wrap(VIMMED1(1.0f));
-         /* If we're reading in color and 2-sided lighting is enabled, we have
-          * to keep going.
-          */
-         if (semantic_name != TGSI_SEMANTIC_COLOR || !key.light_twoside)
-            continue;
-      } else {
-         if (interpMode == TGSI_INTERPOLATE_CONSTANT) {
-            swr_fs->constantMask |= 1 << linkedAttrib;
-         } else if (interpMode == TGSI_INTERPOLATE_COLOR) {
-            swr_fs->flatConstantMask |= 1 << linkedAttrib;
-         }
-      }
-
-      unsigned bcolorAttrib = 0xFFFFFFFF;
-      Value *offset = NULL;
-      if (semantic_name == TGSI_SEMANTIC_COLOR && key.light_twoside) {
-         bcolorAttrib = locate_linkage(
-               TGSI_SEMANTIC_BCOLOR, semantic_idx, pPrevShader);
-         /* Neither front nor back colors were available. Nothing to load. */
-         if (bcolorAttrib == 0xFFFFFFFF && linkedAttrib == 0xFFFFFFFF)
-            continue;
-         /* If there is no front color, just always use the back color. */
-         if (linkedAttrib + 1 == 0xFFFFFFFF)
-            linkedAttrib = bcolorAttrib;
-
-         if (bcolorAttrib != 0xFFFFFFFF) {
-            bcolorAttrib -= 1;
-            if (interpMode == TGSI_INTERPOLATE_CONSTANT) {
-               swr_fs->constantMask |= 1 << bcolorAttrib;
-            } else if (interpMode == TGSI_INTERPOLATE_COLOR) {
-               swr_fs->flatConstantMask |= 1 << bcolorAttrib;
-            }
-
-            unsigned diff = 12 * (bcolorAttrib - linkedAttrib);
-
-            if (diff) {
-               Value *back =
-                  XOR(C(1), LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), "backFace");
-
-               offset = MUL(back, C(diff));
-               offset->setName("offset");
-            }
-         }
-      }
-
-      for (int channel = 0; channel < TGSI_NUM_CHANNELS; channel++) {
-         if (mask & (1 << channel)) {
-            Value *indexA = C(linkedAttrib * 12 + channel);
-            Value *indexB = C(linkedAttrib * 12 + channel + 4);
-            Value *indexC = C(linkedAttrib * 12 + channel + 8);
-
-            if (offset) {
-               indexA = ADD(indexA, offset);
-               indexB = ADD(indexB, offset);
-               indexC = ADD(indexC, offset);
-            }
-
-            Value *va = VBROADCAST(LOAD(GEP(pAttribs, indexA)));
-            Value *vb = VBROADCAST(LOAD(GEP(pAttribs, indexB)));
-            Value *vc = VBROADCAST(LOAD(GEP(pAttribs, indexC)));
-
-            if (interpMode == TGSI_INTERPOLATE_CONSTANT) {
-               inputs[attrib][channel] = wrap(va);
-            } else {
-               Value *vk = FSUB(FSUB(VIMMED1(1.0f), vi), vj);
-
-               vc = FMUL(vk, vc);
-
-               Value *interp = FMUL(va, vi);
-               Value *interp1 = FMUL(vb, vj);
-               interp = FADD(interp, interp1);
-               interp = FADD(interp, vc);
-               if (interpMode == TGSI_INTERPOLATE_PERSPECTIVE ||
-                   interpMode == TGSI_INTERPOLATE_COLOR)
-                  interp = FMUL(interp, vw);
-               inputs[attrib][channel] = wrap(interp);
-            }
-         }
-      }
-   }
-
-   sampler = swr_sampler_soa_create(key.sampler, PIPE_SHADER_FRAGMENT);
-   assert(sampler != nullptr);
-
-   struct lp_bld_tgsi_system_values system_values;
-   memset(&system_values, 0, sizeof(system_values));
-
-   struct lp_build_mask_context mask;
-   bool uses_mask = false;
-
-   if (swr_fs->info.base.uses_kill ||
-       key.poly_stipple_enable) {
-      Value *vActiveMask = NULL;
-      if (swr_fs->info.base.uses_kill) {
-         vActiveMask = LOAD(pPS, {0, SWR_PS_CONTEXT_activeMask}, "activeMask");
-      }
-      if (key.poly_stipple_enable) {
-         // first get fragment xy coords and clip to stipple bounds
-         Value *vXf = LOAD(pPS, {0, SWR_PS_CONTEXT_vX, PixelPositions_UL});
-         Value *vYf = LOAD(pPS, {0, SWR_PS_CONTEXT_vY, PixelPositions_UL});
-         Value *vXu = FP_TO_UI(vXf, mSimdInt32Ty);
-         Value *vYu = FP_TO_UI(vYf, mSimdInt32Ty);
-
-         // stipple pattern is 32x32, which means that one line of stipple
-         // is stored in one word:
-         // vXstipple is bit offset inside 32-bit stipple word
-         // vYstipple is word index is stipple array
-         Value *vXstipple = AND(vXu, VIMMED1(0x1f)); // & (32-1)
-         Value *vYstipple = AND(vYu, VIMMED1(0x1f)); // & (32-1)
-
-         // grab stipple pattern base address
-         Value *stipplePtr = GEP(hPrivateData, {0, swr_draw_context_polyStipple, 0});
-         stipplePtr = BITCAST(stipplePtr, mInt8PtrTy);
-
-         // peform a gather to grab stipple words for each lane
-         Value *vStipple = GATHERDD(VUNDEF_I(), stipplePtr, vYstipple,
-                                    VIMMED1(0xffffffff), 4);
-
-         // create a mask with one bit corresponding to the x stipple
-         // and AND it with the pattern, to see if we have a bit
-         Value *vBitMask = LSHR(VIMMED1(0x80000000), vXstipple);
-         Value *vStippleMask = AND(vStipple, vBitMask);
-         vStippleMask = ICMP_NE(vStippleMask, VIMMED1(0));
-         vStippleMask = VMASK(vStippleMask);
-
-         if (swr_fs->info.base.uses_kill) {
-            vActiveMask = AND(vActiveMask, vStippleMask);
-         } else {
-            vActiveMask = vStippleMask;
-         }
-      }
-      lp_build_mask_begin(
-         &mask, gallivm, lp_type_float_vec(32, 32 * 8), wrap(vActiveMask));
-      uses_mask = true;
-   }
-
-   struct lp_build_tgsi_params params;
-   memset(&params, 0, sizeof(params));
-   params.type = lp_type_float_vec(32, 32 * 8);
-   params.mask = uses_mask ? &mask : NULL;
-   params.consts_ptr = wrap(consts_ptr);
-   params.const_sizes_ptr = wrap(const_sizes_ptr);
-   params.system_values = &system_values;
-   params.inputs = inputs;
-   params.context_ptr = wrap(hPrivateData);
-   params.sampler = sampler;
-   params.info = &swr_fs->info.base;
-
-   lp_build_tgsi_soa(gallivm,
-                     swr_fs->pipe.tokens,
-                     &params,
-                     outputs);
-
-   sampler->destroy(sampler);
-
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-   for (uint32_t attrib = 0; attrib < swr_fs->info.base.num_outputs;
-        attrib++) {
-      switch (swr_fs->info.base.output_semantic_name[attrib]) {
-      case TGSI_SEMANTIC_POSITION: {
-         // write z
-         LLVMValueRef outZ =
-            LLVMBuildLoad(gallivm->builder, outputs[attrib][2], "");
-         STORE(unwrap(outZ), pPS, {0, SWR_PS_CONTEXT_vZ});
-         break;
-      }
-      case TGSI_SEMANTIC_COLOR: {
-         for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) {
-            if (!outputs[attrib][channel])
-               continue;
-
-            LLVMValueRef out =
-               LLVMBuildLoad(gallivm->builder, outputs[attrib][channel], "");
-            if (swr_fs->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
-                swr_fs->info.base.output_semantic_index[attrib] == 0) {
-               for (uint32_t rt = 0; rt < key.nr_cbufs; rt++) {
-                  STORE(unwrap(out),
-                        pPS,
-                        {0, SWR_PS_CONTEXT_shaded, rt, channel});
-               }
-            } else {
-               STORE(unwrap(out),
-                     pPS,
-                     {0,
-                           SWR_PS_CONTEXT_shaded,
-                           swr_fs->info.base.output_semantic_index[attrib],
-                           channel});
-            }
-         }
-         break;
-      }
-      default: {
-         fprintf(stderr,
-                 "unknown output from FS %s[%d]\n",
-                 tgsi_semantic_names[swr_fs->info.base
-                                        .output_semantic_name[attrib]],
-                 swr_fs->info.base.output_semantic_index[attrib]);
-         break;
-      }
-      }
-   }
-
-   LLVMValueRef mask_result = 0;
-   if (uses_mask) {
-      mask_result = lp_build_mask_end(&mask);
-   }
-
-   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-
-   if (uses_mask) {
-      STORE(unwrap(mask_result), pPS, {0, SWR_PS_CONTEXT_activeMask});
-   }
-
-   RET_VOID();
-
-   gallivm_verify_function(gallivm, wrap(pFunction));
-
-   gallivm_compile_module(gallivm);
-
-   // after the gallivm passes, we have to lower the core's intrinsics
-   llvm::legacy::FunctionPassManager lowerPass(JM()->mpCurrentModule);
-   lowerPass.add(createLowerX86Pass(this));
-   lowerPass.run(*pFunction);
-
-   PFN_PIXEL_KERNEL kernel =
-      (PFN_PIXEL_KERNEL)gallivm_jit_function(gallivm, wrap(pFunction));
-   debug_printf("frag shader  %p\n", kernel);
-   assert(kernel && "Error: FragShader = NULL");
-
-   JM()->mIsModuleFinalized = true;
-
-   return kernel;
-}
-
-PFN_PIXEL_KERNEL
-swr_compile_fs(struct swr_context *ctx, swr_jit_fs_key &key)
-{
-   if (!ctx->fs->pipe.tokens)
-      return NULL;
-
-   BuilderSWR builder(
-      reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr),
-      "FS");
-   PFN_PIXEL_KERNEL func = builder.CompileFS(ctx, key);
-
-   ctx->fs->map.insert(std::make_pair(key, std::unique_ptr<VariantFS>(new VariantFS(builder.gallivm, func))));
-   return func;
-}
diff --git a/src/gallium/drivers/swr/swr_shader.h b/src/gallium/drivers/swr/swr_shader.h
deleted file mode 100644
index cabe915f312..00000000000
--- a/src/gallium/drivers/swr/swr_shader.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#pragma once
-
-struct swr_vertex_shader;
-struct swr_fragment_shader;
-struct swr_geometry_shader;
-struct swr_tess_control_shader;
-struct swr_tess_evaluation_shader;
-
-struct swr_jit_fs_key;
-struct swr_jit_vs_key;
-struct swr_jit_gs_key;
-struct swr_jit_tcs_key;
-struct swr_jit_tes_key;
-
-using PFN_TCS_FUNC = PFN_HS_FUNC;
-using PFN_TES_FUNC = PFN_DS_FUNC;
-
-unsigned swr_so_adjust_attrib(unsigned in_attrib,
-                              swr_vertex_shader *swr_vs);
-
-PFN_VERTEX_FUNC
-swr_compile_vs(struct swr_context *ctx, swr_jit_vs_key &key);
-
-PFN_PIXEL_KERNEL
-swr_compile_fs(struct swr_context *ctx, swr_jit_fs_key &key);
-
-PFN_GS_FUNC
-swr_compile_gs(struct swr_context *ctx, swr_jit_gs_key &key);
-
-PFN_TCS_FUNC
-swr_compile_tcs(struct swr_context *ctx, swr_jit_tcs_key &key);
-
-PFN_TES_FUNC
-swr_compile_tes(struct swr_context *ctx, swr_jit_tes_key &key);
-
-void swr_generate_fs_key(struct swr_jit_fs_key &key,
-                         struct swr_context *ctx,
-                         swr_fragment_shader *swr_fs);
-
-void swr_generate_vs_key(struct swr_jit_vs_key &key,
-                         struct swr_context *ctx,
-                         swr_vertex_shader *swr_vs);
-
-void swr_generate_fetch_key(struct swr_jit_fetch_key &key,
-                            struct swr_vertex_element_state *velems);
-
-void swr_generate_gs_key(struct swr_jit_gs_key &key,
-                         struct swr_context *ctx,
-                         swr_geometry_shader *swr_gs);
-
-void swr_generate_tcs_key(struct swr_jit_tcs_key &key,
-                          struct swr_context *ctx,
-                          swr_tess_control_shader *swr_tcs);
-
-void swr_generate_tes_key(struct swr_jit_tes_key &key,
-                          struct swr_context *ctx,
-                          swr_tess_evaluation_shader *swr_tes);
-
-struct swr_jit_sampler_key {
-   unsigned nr_samplers;
-   unsigned nr_sampler_views;
-   struct swr_sampler_static_state sampler[PIPE_MAX_SHADER_SAMPLER_VIEWS];
-};
-
-struct swr_jit_fs_key : swr_jit_sampler_key {
-   unsigned nr_cbufs;
-   unsigned light_twoside;
-   unsigned sprite_coord_enable;
-   ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS];
-   ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS];
-   bool poly_stipple_enable;
-};
-
-struct swr_jit_vs_key : swr_jit_sampler_key {
-   unsigned clip_plane_mask; // from rasterizer state & vs_info
-};
-
-struct swr_jit_fetch_key {
-   FETCH_COMPILE_STATE fsState;
-};
-
-struct swr_jit_gs_key : swr_jit_sampler_key {
-   ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS];
-   ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS];
-};
-
-// TESS_TODO: revisit this - we probably need to use
-// primitive modes, number of vertices emitted, etc.
-struct swr_jit_tcs_key : swr_jit_sampler_key {
-   ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS];
-   ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS];
-   unsigned clip_plane_mask; // from rasterizer state & tcs_info
-};
-
-// TESS_TODO: revisit this
-struct swr_jit_tes_key : swr_jit_sampler_key {
-   ubyte prev_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS];
-   ubyte prev_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS];
-   unsigned clip_plane_mask; // from rasterizer state & tes_info
-};
-
-namespace std
-{
-template <> struct hash<swr_jit_fs_key> {
-   std::size_t operator()(const swr_jit_fs_key &k) const
-   {
-      return util_hash_crc32(&k, sizeof(k));
-   }
-};
-
-template <> struct hash<swr_jit_vs_key> {
-   std::size_t operator()(const swr_jit_vs_key &k) const
-   {
-      return util_hash_crc32(&k, sizeof(k));
-   }
-};
-
-template <> struct hash<swr_jit_fetch_key> {
-   std::size_t operator()(const swr_jit_fetch_key &k) const
-   {
-      return util_hash_crc32(&k, sizeof(k));
-   }
-};
-
-template <> struct hash<swr_jit_gs_key> {
-   std::size_t operator()(const swr_jit_gs_key &k) const
-   {
-      return util_hash_crc32(&k, sizeof(k));
-   }
-};
-
-template <> struct hash<swr_jit_tcs_key> {
-   std::size_t operator()(const swr_jit_tcs_key &k) const
-   {
-      return util_hash_crc32(&k, sizeof(k));
-   }
-};
-
-template <> struct hash<swr_jit_tes_key> {
-   std::size_t operator()(const swr_jit_tes_key &k) const
-   {
-      return util_hash_crc32(&k, sizeof(k));
-   }
-};
-};
-
-bool operator==(const swr_jit_fs_key &lhs, const swr_jit_fs_key &rhs);
-bool operator==(const swr_jit_vs_key &lhs, const swr_jit_vs_key &rhs);
-bool operator==(const swr_jit_fetch_key &lhs, const swr_jit_fetch_key &rhs);
-bool operator==(const swr_jit_gs_key &lhs, const swr_jit_gs_key &rhs);
-bool operator==(const swr_jit_tcs_key &lhs, const swr_jit_tcs_key &rhs);
-bool operator==(const swr_jit_tes_key &lhs, const swr_jit_tes_key &rhs);
diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp
deleted file mode 100644
index 5f1464e6d0e..00000000000
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ /dev/null
@@ -1,2243 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#include <llvm/Config/llvm-config.h>
-
-#if LLVM_VERSION_MAJOR < 7
-// llvm redefines DEBUG
-#pragma push_macro("DEBUG")
-#undef DEBUG
-#endif
-
-#include <rasterizer/core/state.h>
-#include "JitManager.h"
-
-#if LLVM_VERSION_MAJOR < 7
-#pragma pop_macro("DEBUG")
-#endif
-
-#include "common/os.h"
-#include "jit_api.h"
-#include "gen_state_llvm.h"
-#include "core/multisample.h"
-#include "core/state_funcs.h"
-
-#include "gallivm/lp_bld_tgsi.h"
-#include "util/format/u_format.h"
-
-#include "util/u_memory.h"
-#include "util/u_inlines.h"
-#include "util/u_helpers.h"
-#include "util/u_framebuffer.h"
-#include "util/u_viewport.h"
-#include "util/u_prim.h"
-
-#include "swr_state.h"
-#include "swr_context.h"
-#include "gen_surf_state_llvm.h"
-#include "gen_swr_context_llvm.h"
-#include "swr_screen.h"
-#include "swr_resource.h"
-#include "swr_tex_sample.h"
-#include "swr_scratch.h"
-#include "swr_shader.h"
-#include "swr_fence.h"
-
-/* These should be pulled out into separate files as necessary
- * Just initializing everything here to get going. */
-
-static void *
-swr_create_blend_state(struct pipe_context *pipe,
-                       const struct pipe_blend_state *blend)
-{
-   struct swr_blend_state *state = CALLOC_STRUCT(swr_blend_state);
-   assert(state != nullptr);
-
-   memcpy(&state->pipe, blend, sizeof(*blend));
-
-   struct pipe_blend_state *pipe_blend = &state->pipe;
-
-   for (int target = 0;
-        target < std::min(SWR_NUM_RENDERTARGETS, PIPE_MAX_COLOR_BUFS);
-        target++) {
-
-      struct pipe_rt_blend_state *rt_blend = &pipe_blend->rt[target];
-      SWR_RENDER_TARGET_BLEND_STATE &blendState =
-         state->blendState.renderTarget[target];
-      RENDER_TARGET_BLEND_COMPILE_STATE &compileState =
-         state->compileState[target];
-
-      if (target != 0 && !pipe_blend->independent_blend_enable) {
-         memcpy(&compileState,
-                &state->compileState[0],
-                sizeof(RENDER_TARGET_BLEND_COMPILE_STATE));
-         continue;
-      }
-
-      compileState.blendEnable = rt_blend->blend_enable;
-      if (compileState.blendEnable) {
-         compileState.sourceAlphaBlendFactor =
-            swr_convert_blend_factor(rt_blend->alpha_src_factor);
-         compileState.destAlphaBlendFactor =
-            swr_convert_blend_factor(rt_blend->alpha_dst_factor);
-         compileState.sourceBlendFactor =
-            swr_convert_blend_factor(rt_blend->rgb_src_factor);
-         compileState.destBlendFactor =
-            swr_convert_blend_factor(rt_blend->rgb_dst_factor);
-
-         compileState.colorBlendFunc =
-            swr_convert_blend_func(rt_blend->rgb_func);
-         compileState.alphaBlendFunc =
-            swr_convert_blend_func(rt_blend->alpha_func);
-      }
-      compileState.logicOpEnable = state->pipe.logicop_enable;
-      if (compileState.logicOpEnable) {
-         compileState.logicOpFunc =
-            swr_convert_logic_op(state->pipe.logicop_func);
-      }
-
-      blendState.writeDisableRed =
-         (rt_blend->colormask & PIPE_MASK_R) ? 0 : 1;
-      blendState.writeDisableGreen =
-         (rt_blend->colormask & PIPE_MASK_G) ? 0 : 1;
-      blendState.writeDisableBlue =
-         (rt_blend->colormask & PIPE_MASK_B) ? 0 : 1;
-      blendState.writeDisableAlpha =
-         (rt_blend->colormask & PIPE_MASK_A) ? 0 : 1;
-
-      if (rt_blend->colormask == 0)
-         compileState.blendEnable = false;
-   }
-
-   return state;
-}
-
-static void
-swr_bind_blend_state(struct pipe_context *pipe, void *blend)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   if (ctx->blend == blend)
-      return;
-
-   ctx->blend = (swr_blend_state *)blend;
-
-   ctx->dirty |= SWR_NEW_BLEND;
-}
-
-static void
-swr_delete_blend_state(struct pipe_context *pipe, void *blend)
-{
-   FREE(blend);
-}
-
-static void
-swr_set_blend_color(struct pipe_context *pipe,
-                    const struct pipe_blend_color *color)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   ctx->blend_color = *color;
-
-   ctx->dirty |= SWR_NEW_BLEND;
-}
-
-static void
-swr_set_stencil_ref(struct pipe_context *pipe,
-                    const struct pipe_stencil_ref ref)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   ctx->stencil_ref = ref;
-
-   ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA;
-}
-
-static void *
-swr_create_depth_stencil_state(
-   struct pipe_context *pipe,
-   const struct pipe_depth_stencil_alpha_state *depth_stencil)
-{
-   struct pipe_depth_stencil_alpha_state *state;
-
-   state = (pipe_depth_stencil_alpha_state *)mem_dup(depth_stencil,
-                                                     sizeof *depth_stencil);
-
-   return state;
-}
-
-static void
-swr_bind_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   if (ctx->depth_stencil == (pipe_depth_stencil_alpha_state *)depth_stencil)
-      return;
-
-   ctx->depth_stencil = (pipe_depth_stencil_alpha_state *)depth_stencil;
-
-   ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA;
-}
-
-static void
-swr_delete_depth_stencil_state(struct pipe_context *pipe, void *depth)
-{
-   FREE(depth);
-}
-
-
-static void *
-swr_create_rasterizer_state(struct pipe_context *pipe,
-                            const struct pipe_rasterizer_state *rast)
-{
-   struct pipe_rasterizer_state *state;
-   state = (pipe_rasterizer_state *)mem_dup(rast, sizeof *rast);
-
-   return state;
-}
-
-static void
-swr_bind_rasterizer_state(struct pipe_context *pipe, void *handle)
-{
-   struct swr_context *ctx = swr_context(pipe);
-   const struct pipe_rasterizer_state *rasterizer =
-      (const struct pipe_rasterizer_state *)handle;
-
-   if (ctx->rasterizer == (pipe_rasterizer_state *)rasterizer)
-      return;
-
-   ctx->rasterizer = (pipe_rasterizer_state *)rasterizer;
-
-   ctx->dirty |= SWR_NEW_RASTERIZER;
-}
-
-static void
-swr_delete_rasterizer_state(struct pipe_context *pipe, void *rasterizer)
-{
-   FREE(rasterizer);
-}
-
-
-static void *
-swr_create_sampler_state(struct pipe_context *pipe,
-                         const struct pipe_sampler_state *sampler)
-{
-   struct pipe_sampler_state *state =
-      (pipe_sampler_state *)mem_dup(sampler, sizeof *sampler);
-
-   return state;
-}
-
-static void
-swr_bind_sampler_states(struct pipe_context *pipe,
-                        enum pipe_shader_type shader,
-                        unsigned start,
-                        unsigned num,
-                        void **samplers)
-{
-   struct swr_context *ctx = swr_context(pipe);
-   unsigned i;
-
-   assert(shader < PIPE_SHADER_TYPES);
-   assert(start + num <= ARRAY_SIZE(ctx->samplers[shader]));
-
-   /* set the new samplers */
-   ctx->num_samplers[shader] = num;
-   for (i = 0; i < num; i++) {
-      ctx->samplers[shader][start + i] = (pipe_sampler_state *)samplers[i];
-   }
-
-   ctx->dirty |= SWR_NEW_SAMPLER;
-}
-
-static void
-swr_delete_sampler_state(struct pipe_context *pipe, void *sampler)
-{
-   FREE(sampler);
-}
-
-
-static struct pipe_sampler_view *
-swr_create_sampler_view(struct pipe_context *pipe,
-                        struct pipe_resource *texture,
-                        const struct pipe_sampler_view *templ)
-{
-   struct pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view);
-
-   if (view) {
-      *view = *templ;
-      view->reference.count = 1;
-      view->texture = NULL;
-      pipe_resource_reference(&view->texture, texture);
-      view->context = pipe;
-   }
-
-   return view;
-}
-
-static void
-swr_set_sampler_views(struct pipe_context *pipe,
-                      enum pipe_shader_type shader,
-                      unsigned start,
-                      unsigned num,
-                      unsigned unbind_num_trailing_slots,
-                      bool take_ownership,
-                      struct pipe_sampler_view **views)
-{
-   struct swr_context *ctx = swr_context(pipe);
-   uint i;
-
-   assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS);
-
-   assert(shader < PIPE_SHADER_TYPES);
-   assert(start + num <= ARRAY_SIZE(ctx->sampler_views[shader]));
-
-   /* set the new sampler views */
-   ctx->num_sampler_views[shader] = num;
-   for (i = 0; i < num; i++) {
-      if (take_ownership) {
-         pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i],
-                                     NULL);
-         ctx->sampler_views[shader][start + i] = views[i];
-      } else {
-         pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i],
-                                     views[i]);
-      }
-   }
-   for (; i < num + unbind_num_trailing_slots; i++) {
-      pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i],
-                                  NULL);
-   }
-
-   ctx->dirty |= SWR_NEW_SAMPLER_VIEW;
-}
-
-static void
-swr_sampler_view_destroy(struct pipe_context *pipe,
-                         struct pipe_sampler_view *view)
-{
-   pipe_resource_reference(&view->texture, NULL);
-   FREE(view);
-}
-
-static void *
-swr_create_vs_state(struct pipe_context *pipe,
-                    const struct pipe_shader_state *vs)
-{
-   struct swr_vertex_shader *swr_vs = new swr_vertex_shader;
-   if (!swr_vs)
-      return NULL;
-
-   swr_vs->pipe.tokens = tgsi_dup_tokens(vs->tokens);
-   swr_vs->pipe.stream_output = vs->stream_output;
-
-   lp_build_tgsi_info(vs->tokens, &swr_vs->info);
-
-   swr_vs->soState = {0};
-
-   if (swr_vs->pipe.stream_output.num_outputs) {
-      pipe_stream_output_info *stream_output = &swr_vs->pipe.stream_output;
-
-      swr_vs->soState.soEnable = true;
-      // soState.rasterizerDisable set on state dirty
-      // soState.streamToRasterizer not used
-
-      for (uint32_t i = 0; i < stream_output->num_outputs; i++) {
-         unsigned attrib_slot = stream_output->output[i].register_index;
-         attrib_slot = swr_so_adjust_attrib(attrib_slot, swr_vs);
-         swr_vs->soState.streamMasks[stream_output->output[i].stream] |=
-            (1 << attrib_slot);
-      }
-      for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) {
-        swr_vs->soState.streamNumEntries[i] =
-             _mm_popcnt_u32(swr_vs->soState.streamMasks[i]);
-       }
-   }
-
-   return swr_vs;
-}
-
-static void
-swr_bind_vs_state(struct pipe_context *pipe, void *vs)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   if (ctx->vs == vs)
-      return;
-
-   ctx->vs = (swr_vertex_shader *)vs;
-   ctx->dirty |= SWR_NEW_VS;
-}
-
-static void
-swr_delete_vs_state(struct pipe_context *pipe, void *vs)
-{
-   struct swr_vertex_shader *swr_vs = (swr_vertex_shader *)vs;
-   FREE((void *)swr_vs->pipe.tokens);
-   struct swr_screen *screen = swr_screen(pipe->screen);
-
-   /* Defer deletion of vs state */
-   swr_fence_work_delete_vs(screen->flush_fence, swr_vs);
-}
-
-static void *
-swr_create_fs_state(struct pipe_context *pipe,
-                    const struct pipe_shader_state *fs)
-{
-   struct swr_fragment_shader *swr_fs = new swr_fragment_shader;
-   if (!swr_fs)
-      return NULL;
-
-   swr_fs->pipe.tokens = tgsi_dup_tokens(fs->tokens);
-
-   lp_build_tgsi_info(fs->tokens, &swr_fs->info);
-
-   return swr_fs;
-}
-
-
-static void
-swr_bind_fs_state(struct pipe_context *pipe, void *fs)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   if (ctx->fs == fs)
-      return;
-
-   ctx->fs = (swr_fragment_shader *)fs;
-   ctx->dirty |= SWR_NEW_FS;
-}
-
-static void
-swr_delete_fs_state(struct pipe_context *pipe, void *fs)
-{
-   struct swr_fragment_shader *swr_fs = (swr_fragment_shader *)fs;
-   FREE((void *)swr_fs->pipe.tokens);
-   struct swr_screen *screen = swr_screen(pipe->screen);
-
-   /* Defer deleton of fs state */
-   swr_fence_work_delete_fs(screen->flush_fence, swr_fs);
-}
-
-static void *
-swr_create_gs_state(struct pipe_context *pipe,
-                    const struct pipe_shader_state *gs)
-{
-   struct swr_geometry_shader *swr_gs = new swr_geometry_shader;
-   if (!swr_gs)
-      return NULL;
-
-   swr_gs->pipe.tokens = tgsi_dup_tokens(gs->tokens);
-   lp_build_tgsi_info(gs->tokens, &swr_gs->info);
-   return swr_gs;
-}
-
-static void
-swr_bind_gs_state(struct pipe_context *pipe, void *gs)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   if (ctx->gs == gs)
-      return;
-
-   ctx->gs = (swr_geometry_shader *)gs;
-   ctx->dirty |= SWR_NEW_GS;
-}
-
-static void
-swr_delete_gs_state(struct pipe_context *pipe, void *gs)
-{
-   struct swr_geometry_shader *swr_gs = (swr_geometry_shader *)gs;
-   FREE((void *)swr_gs->pipe.tokens);
-   struct swr_screen *screen = swr_screen(pipe->screen);
-
-   /* Defer deleton of fs state */
-   swr_fence_work_delete_gs(screen->flush_fence, swr_gs);
-}
-
-static void *
-swr_create_tcs_state(struct pipe_context *pipe,
-                     const struct pipe_shader_state *tcs)
-{
-   struct swr_tess_control_shader *swr_tcs = new swr_tess_control_shader;
-   if (!swr_tcs)
-      return NULL;
-
-   swr_tcs->pipe.tokens = tgsi_dup_tokens(tcs->tokens);
-   lp_build_tgsi_info(tcs->tokens, &swr_tcs->info);
-   return swr_tcs;
-}
-
-static void
-swr_bind_tcs_state(struct pipe_context *pipe, void *tcs)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   if (ctx->tcs == tcs)
-      return;
-
-   ctx->tcs = (swr_tess_control_shader *)tcs;
-   ctx->dirty |= SWR_NEW_TCS;
-   ctx->dirty |= SWR_NEW_TS;
-}
-
-static void
-swr_delete_tcs_state(struct pipe_context *pipe, void *tcs)
-{
-   struct swr_tess_control_shader *swr_tcs = (swr_tess_control_shader *)tcs;
-   FREE((void *)swr_tcs->pipe.tokens);
-   struct swr_screen *screen = swr_screen(pipe->screen);
-
-   /* Defer deleton of tcs state */
-   swr_fence_work_delete_tcs(screen->flush_fence, swr_tcs);
-}
-
-static void *
-swr_create_tes_state(struct pipe_context *pipe,
-                     const struct pipe_shader_state *tes)
-{
-   struct swr_tess_evaluation_shader *swr_tes = new swr_tess_evaluation_shader;
-   if (!swr_tes)
-      return NULL;
-
-   swr_tes->pipe.tokens = tgsi_dup_tokens(tes->tokens);
-   lp_build_tgsi_info(tes->tokens, &swr_tes->info);
-   return swr_tes;
-}
-
-static void
-swr_bind_tes_state(struct pipe_context *pipe, void *tes)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   if (ctx->tes == tes)
-      return;
-
-   // Save current tessellator state first
-   if (ctx->tes != nullptr) {
-      ctx->tes->ts_state = ctx->tsState;
-   }
-
-   ctx->tes = (swr_tess_evaluation_shader *)tes;
-
-   ctx->dirty |= SWR_NEW_TES;
-   ctx->dirty |= SWR_NEW_TS;
-}
-
-static void
-swr_delete_tes_state(struct pipe_context *pipe, void *tes)
-{
-   struct swr_screen *screen = swr_screen(pipe->screen);
-   struct swr_tess_evaluation_shader *shader =
-      (swr_tess_evaluation_shader *)tes;
-
-   /* The token array is released immediately. */
-   FREE((void *)shader->pipe.tokens);
-
-   /* Deletion of the tes state object itself is deferred: it is queued
-    * as work on the screen's flush fence. */
-   swr_fence_work_delete_tes(screen->flush_fence, shader);
-}
-
-static void
-swr_set_constant_buffer(struct pipe_context *pipe,
-                        enum pipe_shader_type shader,
-                        uint index, bool take_ownership,
-                        const struct pipe_constant_buffer *cb)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   /* Remember the incoming buffer pointer (if any) before the copy. */
-   struct pipe_resource *constants = NULL;
-   if (cb)
-      constants = cb->buffer;
-
-   assert(shader < PIPE_SHADER_TYPES);
-   assert(index < ARRAY_SIZE(ctx->constants[shader]));
-
-   /* note: reference counting */
-   util_copy_constant_buffer(&ctx->constants[shader][index], cb, take_ownership);
-
-   /* Flag the constants of the affected stage; other stages set no bit. */
-   switch (shader) {
-   case PIPE_SHADER_VERTEX:
-      ctx->dirty |= SWR_NEW_VSCONSTANTS;
-      break;
-   case PIPE_SHADER_FRAGMENT:
-      ctx->dirty |= SWR_NEW_FSCONSTANTS;
-      break;
-   case PIPE_SHADER_GEOMETRY:
-      ctx->dirty |= SWR_NEW_GSCONSTANTS;
-      break;
-   case PIPE_SHADER_TESS_CTRL:
-      ctx->dirty |= SWR_NEW_TCSCONSTANTS;
-      break;
-   case PIPE_SHADER_TESS_EVAL:
-      ctx->dirty |= SWR_NEW_TESCONSTANTS;
-      break;
-   default:
-      break;
-   }
-
-   /* When a user buffer was supplied, release the reference held through
-    * the local pointer captured above. */
-   if (cb && cb->user_buffer) {
-      pipe_resource_reference(&constants, NULL);
-   }
-}
-
-
-static void *
-swr_create_vertex_elements_state(struct pipe_context *pipe,
-                                 unsigned num_elements,
-                                 const struct pipe_vertex_element *attribs)
-{
-   assert(num_elements <= PIPE_MAX_ATTRIBS);
-
-   struct swr_vertex_element_state *velems = new swr_vertex_element_state;
-   if (!velems)
-      return velems;
-
-   /* Start from an all-zero fetch-shader state description. */
-   memset((void*)&velems->fsState, 0, sizeof(velems->fsState));
-   velems->fsState.bVertexIDOffsetEnable = true;
-   velems->fsState.numAttribs = num_elements;
-
-   for (unsigned n = 0; n < num_elements; n++) {
-      // XXX: we should do this keyed on the VS usage info
-
-      const struct pipe_vertex_element &attr = attribs[n];
-      auto &elem = velems->fsState.layout[n];
-      const enum pipe_format fmt = (enum pipe_format)attr.src_format;
-      const unsigned stream = attr.vertex_buffer_index;
-      const unsigned divisor = attr.instance_divisor;
-
-      const struct util_format_description *desc =
-         util_format_description(fmt);
-
-      elem.AlignedByteOffset = attr.src_offset;
-      elem.Format = mesa_to_swr_format(fmt);
-      elem.StreamIndex = stream;
-      elem.InstanceEnable = divisor != 0;
-
-      /* Channels present in the format are fetched from the source;
-       * absent x/y/z channels store 0 and an absent w stores 1.0. */
-      elem.ComponentControl0 =
-         desc->channel[0].type != UTIL_FORMAT_TYPE_VOID
-         ? ComponentControl::StoreSrc
-         : ComponentControl::Store0;
-      elem.ComponentControl1 =
-         desc->channel[1].type != UTIL_FORMAT_TYPE_VOID
-         ? ComponentControl::StoreSrc
-         : ComponentControl::Store0;
-      elem.ComponentControl2 =
-         desc->channel[2].type != UTIL_FORMAT_TYPE_VOID
-         ? ComponentControl::StoreSrc
-         : ComponentControl::Store0;
-      elem.ComponentControl3 =
-         desc->channel[3].type != UTIL_FORMAT_TYPE_VOID
-         ? ComponentControl::StoreSrc
-         : ComponentControl::Store1Fp;
-      elem.ComponentPacking = ComponentEnable::XYZW;
-      elem.InstanceAdvancementState = divisor;
-
-      /* Calculate the pitch of each stream */
-      const SWR_FORMAT_INFO &swr_desc =
-         GetFormatInfo(mesa_to_swr_format(fmt));
-      velems->stream_pitch[stream] += swr_desc.Bpp;
-
-      /* Track which streams are instanced and the smallest non-zero
-       * divisor seen for each of them. */
-      if (divisor != 0) {
-         velems->instanced_bufs |= 1U << stream;
-         uint32_t *min_div = &velems->min_instance_div[stream];
-         if (!*min_div || divisor < *min_div)
-            *min_div = divisor;
-      }
-   }
-
-   return velems;
-}
-
-static void
-swr_bind_vertex_elements_state(struct pipe_context *pipe, void *velems)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   /* Install the new vertex layout unconditionally and set the
-    * SWR_NEW_VERTEX dirty bit. */
-   ctx->velems = (struct swr_vertex_element_state *)velems;
-   ctx->dirty |= SWR_NEW_VERTEX;
-}
-
-static void
-swr_delete_vertex_elements_state(struct pipe_context *pipe, void *velems)
-{
-   /* The opaque handle is the object allocated with new at creation. */
-   /* XXX Need to destroy fetch shader? */
-   delete (struct swr_vertex_element_state *)velems;
-}
-
-
-static void
-swr_set_vertex_buffers(struct pipe_context *pipe,
-                       unsigned start_slot,
-                       unsigned num_elements,
-                       unsigned unbind_num_trailing_slots,
-                       bool take_ownership,
-                       const struct pipe_vertex_buffer *buffers)
-{
-   struct swr_context *swr = swr_context(pipe);
-
-   assert(num_elements <= PIPE_MAX_ATTRIBS);
-
-   /* The shared helper updates the context's vertex buffer array and
-    * count in place. */
-   util_set_vertex_buffers_count(swr->vertex_buffer, &swr->num_vertex_buffers,
-                                 buffers, start_slot, num_elements,
-                                 unbind_num_trailing_slots, take_ownership);
-
-   swr->dirty |= SWR_NEW_VERTEX;
-}
-
-
-static void
-swr_set_polygon_stipple(struct pipe_context *pipe,
-                        const struct pipe_poly_stipple *stipple)
-{
-   struct swr_context *swr = swr_context(pipe);
-
-   /* Keep a by-value copy of the pattern and flag it for revalidation. */
-   swr->poly_stipple.pipe = *stipple;
-   swr->dirty |= SWR_NEW_STIPPLE;
-}
-
-static void
-swr_set_clip_state(struct pipe_context *pipe,
-                   const struct pipe_clip_state *clip)
-{
-   struct swr_context *swr = swr_context(pipe);
-
-   /* XXX Unimplemented, but prevents crash: the state is only copied
-    * into the context and the dirty bit raised. */
-   swr->clip = *clip;
-   swr->dirty |= SWR_NEW_CLIP;
-}
-
-
-/*
- * Store num_scissors scissor rectangles into slots
- * [start_slot, start_slot + num_scissors) of the context, both in pipe
- * form (ctx->scissors) and in SWR form (ctx->swr_scissors), then set the
- * SWR_NEW_SCISSOR dirty bit.
- *
- * The scissors array holds exactly num_scissors entries, indexed from 0:
- * entry i describes slot start_slot + i.
- */
-static void
-swr_set_scissor_states(struct pipe_context *pipe,
-                       unsigned start_slot,
-                       unsigned num_scissors,
-                       const struct pipe_scissor_state *scissors)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   memcpy(ctx->scissors + start_slot, scissors,
-          sizeof(struct pipe_scissor_state) * num_scissors);
-
-   for (unsigned i = 0; i < num_scissors; i++) {
-      /* Read the source at i, not at the destination slot: indexing the
-       * input with start_slot + i would run past the end of the caller's
-       * array whenever start_slot is non-zero. */
-      const struct pipe_scissor_state *src = &scissors[i];
-      auto *dst = &ctx->swr_scissors[start_slot + i];
-
-      dst->xmin = src->minx;
-      dst->xmax = src->maxx;
-      dst->ymin = src->miny;
-      dst->ymax = src->maxy;
-   }
-   ctx->dirty |= SWR_NEW_SCISSOR;
-}
-
-static void
-swr_set_viewport_states(struct pipe_context *pipe,
-                        unsigned start_slot,
-                        unsigned num_viewports,
-                        const struct pipe_viewport_state *vpt)
-{
-   struct swr_context *swr = swr_context(pipe);
-
-   /* Copy the incoming viewports into the context starting at
-    * start_slot. */
-   memcpy(swr->viewports + start_slot,
-          vpt,
-          sizeof(struct pipe_viewport_state) * num_viewports);
-
-   swr->dirty |= SWR_NEW_VIEWPORT;
-}
-
-
-static void
-swr_set_framebuffer_state(struct pipe_context *pipe,
-                          const struct pipe_framebuffer_state *fb)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   const bool unchanged = util_framebuffer_state_equal(&ctx->framebuffer, fb);
-
-   assert(fb->width <= KNOB_GUARDBAND_WIDTH);
-   assert(fb->height <= KNOB_GUARDBAND_HEIGHT);
-
-   /* Binding an identical framebuffer leaves the context untouched. */
-   if (unchanged)
-      return;
-
-   util_copy_framebuffer_state(&ctx->framebuffer, fb);
-
-   /* 0 and 1 both indicate no msaa.  Core doesn't understand 0 samples */
-   ctx->framebuffer.samples = std::max((ubyte)1, ctx->framebuffer.samples);
-
-   ctx->dirty |= SWR_NEW_FRAMEBUFFER;
-}
-
-
-static void
-swr_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   /* An unchanged mask needs no revalidation. */
-   if (ctx->sample_mask == sample_mask)
-      return;
-
-   /* A new mask is stored and the SWR_NEW_RASTERIZER dirty bit is set. */
-   ctx->sample_mask = sample_mask;
-   ctx->dirty |= SWR_NEW_RASTERIZER;
-}
-
-/*
- * MSAA fixed sample position table
- * used by update_derived and get_sample_position
- * (integer locations on a 16x16 grid)
- *
- * Each entry is an {x, y} pair; users divide by 16.0f to obtain a
- * fractional position.  The patterns for 1, 2, 4, 8 and 16 samples are
- * stored back to back, so the pattern for N samples begins at index N-1
- * and sample i of that pattern is entry [N-1 + i].
- */
-static const uint8_t swr_sample_positions[][2] =
-{ /* 1x*/ { 8, 8},
-  /* 2x*/ {12,12},{ 4, 4},
-  /* 4x*/ { 6, 2},{14, 6},{ 2,10},{10,14},
-  /* 8x*/ { 9, 5},{ 7,11},{13, 9},{ 5, 3},
-          { 3,13},{ 1, 7},{11,15},{15, 1},
-  /*16x*/ { 9, 9},{ 7, 5},{ 5,10},{12, 7},
-          { 3, 6},{10,13},{13,11},{11, 3},
-          { 6,14},{ 8, 1},{ 4, 2},{ 2,12},
-          { 0, 8},{15, 4},{14,15},{ 1, 0} };
-
-static void
-swr_get_sample_position(struct pipe_context *pipe,
-                        unsigned sample_count, unsigned sample_index,
-                        float *out_value)
-{
-   /* validate sample_count */
-   const unsigned count = GetNumSamples(GetSampleCount(sample_count));
-
-   /* The pattern for 'count' samples starts at table index count-1. */
-   const uint8_t *pos = swr_sample_positions[count-1 + sample_index];
-
-   /* Convert the 16x16 grid location into a fractional x/y position. */
-   for (unsigned axis = 0; axis < 2; axis++)
-      out_value[axis] = pos[axis] / 16.0f;
-}
-
-
-/*
- * Update resource in-use status
- * Color/depth-stencil targets and transform feedback buffers are marked
- * as WRITE resources.  Vertex buffers, the index buffer, and the sampler
- * views and constant buffers of the vertex and fragment stages are
- * marked as READ resources.
- */
-void
-swr_update_resource_status(struct pipe_context *pipe,
-                           const struct pipe_draw_info *p_draw_info)
-{
-   struct swr_context *ctx = swr_context(pipe);
-   struct pipe_framebuffer_state *fb = &ctx->framebuffer;
-
-   /* colorbuffer targets */
-   for (uint32_t rt = 0; rt < fb->nr_cbufs; ++rt) {
-      if (fb->cbufs[rt])
-         swr_resource_write(fb->cbufs[rt]->texture);
-   }
-
-   /* depth/stencil target */
-   if (fb->zsbuf)
-      swr_resource_write(fb->zsbuf->texture);
-
-   /* VBO vertex buffers (user buffers are skipped) */
-   for (uint32_t slot = 0; slot < ctx->num_vertex_buffers; slot++) {
-      struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[slot];
-      if (!vb->is_user_buffer && vb->buffer.resource)
-         swr_resource_read(vb->buffer.resource);
-   }
-
-   /* VBO index buffer: only for indexed draws without user indices */
-   if (p_draw_info && p_draw_info->index_size &&
-       !p_draw_info->has_user_indices)
-      swr_resource_read(p_draw_info->index.resource);
-
-   /* transform feedback buffers */
-   for (uint32_t t = 0; t < ctx->num_so_targets; t++) {
-      struct pipe_stream_output_target *target = ctx->so_targets[t];
-      if (target && target->buffer)
-         swr_resource_write(target->buffer);
-   }
-
-   /* texture sampler views */
-   for (uint32_t stage : {PIPE_SHADER_VERTEX, PIPE_SHADER_FRAGMENT}) {
-      for (uint32_t v = 0; v < ctx->num_sampler_views[stage]; v++) {
-         struct pipe_sampler_view *view = ctx->sampler_views[stage][v];
-         if (view)
-            swr_resource_read(view->texture);
-      }
-   }
-
-   /* constant buffers */
-   for (uint32_t stage : {PIPE_SHADER_VERTEX, PIPE_SHADER_FRAGMENT}) {
-      for (uint32_t c = 0; c < PIPE_MAX_CONSTANT_BUFFERS; c++) {
-         struct pipe_constant_buffer *cb = &ctx->constants[stage][c];
-         if (cb->buffer)
-            swr_resource_read(cb->buffer);
-      }
-   }
-}
-
-static void
-swr_update_texture_state(struct swr_context *ctx,
-                         enum pipe_shader_type shader_type,
-                         unsigned num_sampler_views,
-                         swr_jit_texture *textures)
-{
-   for (unsigned slot = 0; slot < num_sampler_views; slot++) {
-      struct pipe_sampler_view *view = ctx->sampler_views[shader_type][slot];
-      struct swr_jit_texture *jit_tex = &textures[slot];
-
-      /* Every slot is cleared first; unbound slots stay all-zero. */
-      memset(jit_tex, 0, sizeof(*jit_tex));
-      if (!view)
-         continue;
-
-      struct pipe_resource *res = view->texture;
-      struct swr_resource *swr_res = swr_resource(res);
-
-      /* For a resource that has both depth and stencil, a view whose
-       * format has no depth reads the secondary surface instead. */
-      const bool use_secondary =
-         swr_res->has_depth && swr_res->has_stencil &&
-         !util_format_has_depth(util_format_description(view->format));
-      SWR_SURFACE_STATE *swr =
-         use_secondary ? &swr_res->secondary : &swr_res->swr;
-      size_t *mip_offsets = use_secondary ? swr_res->secondary_mip_offsets
-                                          : swr_res->mip_offsets;
-
-      jit_tex->width = res->width0;
-      jit_tex->height = res->height0;
-      jit_tex->base_ptr = (uint8_t*)swr->xpBaseAddress;
-      jit_tex->num_samples = swr->numSamples;
-      jit_tex->sample_stride = 0;
-
-      if (view->target == PIPE_BUFFER) {
-         /* Buffer views: width is the element count of the viewed range */
-         unsigned view_blocksize = util_format_get_blocksize(view->format);
-         jit_tex->base_ptr += view->u.buf.offset;
-         jit_tex->width = view->u.buf.size / view_blocksize;
-         jit_tex->depth = 1;
-      } else {
-         jit_tex->first_level = view->u.tex.first_level;
-         jit_tex->last_level = view->u.tex.last_level;
-         if (view->target == PIPE_TEXTURE_3D)
-            jit_tex->depth = res->depth0;
-         else
-            jit_tex->depth =
-               view->u.tex.last_layer - view->u.tex.first_layer + 1;
-         /* Advance the base pointer to the first layer of the view */
-         jit_tex->base_ptr += view->u.tex.first_layer *
-            swr->qpitch * swr->pitch;
-      }
-
-      /* Per-level strides and offsets for the levels the view exposes */
-      for (unsigned level = jit_tex->first_level;
-           level <= jit_tex->last_level;
-           level++) {
-         jit_tex->row_stride[level] = swr->pitch;
-         jit_tex->img_stride[level] = swr->qpitch * swr->pitch;
-         jit_tex->mip_offsets[level] = mip_offsets[level];
-      }
-   }
-}
-
-static void
-swr_update_sampler_state(struct swr_context *ctx,
-                         enum pipe_shader_type shader_type,
-                         unsigned num_samplers,
-                         swr_jit_sampler *samplers)
-{
-   for (unsigned slot = 0; slot < num_samplers; slot++) {
-      const struct pipe_sampler_state *src = ctx->samplers[shader_type][slot];
-
-      /* Unbound slots leave the corresponding jit sampler untouched. */
-      if (!src)
-         continue;
-
-      swr_jit_sampler *dst = &samplers[slot];
-      dst->min_lod = src->min_lod;
-      dst->max_lod = src->max_lod;
-      dst->lod_bias = src->lod_bias;
-      COPY_4V(dst->border_color, src->border_color.f);
-   }
-}
-
-static void
-swr_update_constants(struct swr_context *ctx, enum pipe_shader_type shaderType)
-{
-   swr_draw_context *pDC = &ctx->swrDC;
-
-   const float **constant;
-   uint32_t *num_constants;
-   struct swr_scratch_space *scratch;
-
-   /* Select the draw-context arrays and scratch area of the stage. */
-   switch (shaderType) {
-   case PIPE_SHADER_VERTEX:
-      constant = pDC->constantVS;
-      num_constants = pDC->num_constantsVS;
-      scratch = &ctx->scratch->vs_constants;
-      break;
-   case PIPE_SHADER_FRAGMENT:
-      constant = pDC->constantFS;
-      num_constants = pDC->num_constantsFS;
-      scratch = &ctx->scratch->fs_constants;
-      break;
-   case PIPE_SHADER_GEOMETRY:
-      constant = pDC->constantGS;
-      num_constants = pDC->num_constantsGS;
-      scratch = &ctx->scratch->gs_constants;
-      break;
-   case PIPE_SHADER_TESS_CTRL:
-      constant = pDC->constantTCS;
-      num_constants = pDC->num_constantsTCS;
-      scratch = &ctx->scratch->tcs_constants;
-      break;
-   case PIPE_SHADER_TESS_EVAL:
-      constant = pDC->constantTES;
-      num_constants = pDC->num_constantsTES;
-      scratch = &ctx->scratch->tes_constants;
-      break;
-   default:
-      assert(0 && "Unsupported shader type constants");
-      return;
-   }
-
-   for (UINT slot = 0; slot < PIPE_MAX_CONSTANT_BUFFERS; slot++) {
-      const pipe_constant_buffer *cb = &ctx->constants[shaderType][slot];
-      num_constants[slot] = cb->buffer_size;
-
-      /* Resource-backed constants are referenced in place. */
-      if (cb->buffer) {
-         constant[slot] =
-            (const float *)(swr_resource_data(cb->buffer) +
-                            cb->buffer_offset);
-         continue;
-      }
-
-      /* Without a non-empty user buffer the pointer is left as it was. */
-      if (!cb->user_buffer || !cb->buffer_size)
-         continue;
-
-      /* Need to copy these constants to scratch space */
-      const void *src =
-         ((const uint8_t *)cb->user_buffer + cb->buffer_offset);
-      uint32_t bytes = AlignUp(cb->buffer_size, 4);
-      src = swr_copy_to_scratch_space(ctx, scratch, src, bytes);
-      constant[slot] = (const float *)src;
-   }
-}
-
-/*
- * Point the given attachment of the draw context at surface sf, or
- * detach it when sf (or its texture) is NULL.  The return value is OR-ed
- * by the caller into its "need fence" flag: true when an attachment was
- * detached or an existing one was replaced, false when nothing changed
- * or a previously empty attachment was filled.
- */
-static bool
-swr_change_rt(struct swr_context *ctx,
-              unsigned attachment,
-              const struct pipe_surface *sf)
-{
-   swr_draw_context *pDC = &ctx->swrDC;
-   struct SWR_SURFACE_STATE *rt = &pDC->renderTargets[attachment];
-
-   /* Deal with disabling RT up front */
-   const bool detaching = !sf || !sf->texture;
-   if (detaching) {
-      /* Do nothing if the render target hasn't changed */
-      if ((void*)(rt->xpBaseAddress) == nullptr)
-         return false;
-
-      /* If detaching attachment, mark tiles as RESOLVED so core
-       * won't try to load from non-existent target. */
-      swr_store_render_target(&ctx->pipe, attachment, SWR_TILE_RESOLVED);
-      *rt = {0};
-      return true;
-   }
-
-   const struct swr_resource *swr = swr_resource(sf->texture);
-   const SWR_SURFACE_STATE *swr_surface = &swr->swr;
-   SWR_FORMAT fmt = mesa_to_swr_format(sf->format);
-
-   /* The stencil attachment uses the secondary surface when one exists */
-   if (attachment == SWR_ATTACHMENT_STENCIL && swr->secondary.xpBaseAddress) {
-      swr_surface = &swr->secondary;
-      fmt = swr_surface->format;
-   }
-
-   /* Same surface, format, mip level and layer: nothing to update */
-   const bool same_target =
-      rt->xpBaseAddress == swr_surface->xpBaseAddress &&
-      rt->format == fmt &&
-      rt->lod == sf->u.tex.level &&
-      rt->arrayIndex == sf->u.tex.first_layer;
-   if (same_target)
-      return false;
-
-   /* StoreTile for changed target */
-   const bool need_fence = rt->xpBaseAddress ? true : false;
-   if (need_fence) {
-      /* If changing attachment to a new target, mark tiles as
-       * INVALID so they are reloaded from surface. */
-      swr_store_render_target(&ctx->pipe, attachment, SWR_TILE_INVALID);
-   } else {
-      /* if no previous attachment, invalidate tiles that may be marked
-       * RESOLVED because of an old attachment */
-      swr_invalidate_render_target(&ctx->pipe, attachment, sf->width, sf->height);
-      /* no need to set fence here */
-   }
-
-   /* Make new attachment */
-   *rt = *swr_surface;
-   rt->format = fmt;
-   rt->lod = sf->u.tex.level;
-   rt->arrayIndex = sf->u.tex.first_layer;
-
-   return need_fence;
-}
-
-/*
- * When resources are shared between contexts, invalidate this context's
- * view of any bound render target that was last used by another context
- * and is marked as written, so that it is fetched fresh.  The other
- * context's copy is already stored during a flush.  Every bound target
- * is then recorded as belonging to this context.
- */
-static inline void
-swr_invalidate_buffers_after_ctx_change(struct pipe_context *pipe)
-{
-   struct swr_context *ctx = swr_context(pipe);
-
-   /* color targets */
-   for (uint32_t i = 0; i < ctx->framebuffer.nr_cbufs; i++) {
-      struct pipe_surface *cb = ctx->framebuffer.cbufs[i];
-      if (!cb)
-         continue;
-
-      struct swr_resource *res = swr_resource(cb->texture);
-      if (res->curr_pipe != pipe) {
-         /* if curr_pipe is NULL (first use), status should not be WRITE */
-         assert(res->curr_pipe || !(res->status & SWR_RESOURCE_WRITE));
-         if (res->status & SWR_RESOURCE_WRITE) {
-            swr_invalidate_render_target(pipe, i, cb->width, cb->height);
-         }
-      }
-      res->curr_pipe = pipe;
-   }
-
-   /* depth/stencil target: both attachments are invalidated together */
-   struct pipe_surface *zb = ctx->framebuffer.zsbuf;
-   if (zb) {
-      struct swr_resource *res = swr_resource(zb->texture);
-      if (res->curr_pipe != pipe) {
-         /* if curr_pipe is NULL (first use), status should not be WRITE */
-         assert(res->curr_pipe || !(res->status & SWR_RESOURCE_WRITE));
-         if (res->status & SWR_RESOURCE_WRITE) {
-            swr_invalidate_render_target(pipe, SWR_ATTACHMENT_DEPTH, zb->width, zb->height);
-            swr_invalidate_render_target(pipe, SWR_ATTACHMENT_STENCIL, zb->width, zb->height);
-         }
-      }
-      res->curr_pipe = pipe;
-   }
-}
-
-/*
- * Compute, for user vertex buffer i, the number of elements it must
- * provide (*totelems), the byte offset of the first one used (*base) and
- * the number of bytes used from there (*size).
- */
-static inline void
-swr_user_vbuf_range(const struct pipe_draw_info *info,
-                    const struct swr_vertex_element_state *velems,
-                    const struct pipe_vertex_buffer *vb,
-                    uint32_t i,
-                    uint32_t *totelems,
-                    uint32_t *base,
-                    uint32_t *size,
-                    int index_bias)
-{
-   /* FIXME: The size is too large - we don't access the full extra stride. */
-   unsigned elem_pitch = vb->stride + velems->stream_pitch[i];
-
-   /* Instanced stream: the range is driven by the instance count. */
-   if (velems->instanced_bufs & (1U << i)) {
-      unsigned elems = info->instance_count / velems->min_instance_div[i] + 1;
-      *totelems = info->start_instance + elems;
-      *base = info->start_instance * vb->stride;
-      *size = elems * elem_pitch;
-      return;
-   }
-
-   /* Zero stride: a single constant attribute. */
-   if (!vb->stride) {
-      *totelems = 1;
-      *base = 0;
-      *size = velems->stream_pitch[i];
-      return;
-   }
-
-   /* Per-vertex stream: covers min_index..max_index, with index_bias
-    * applied only to indexed draws. */
-   unsigned elems = info->max_index - info->min_index + 1;
-   *totelems = (info->max_index + (info->index_size ? index_bias : 0)) + 1;
-   *base = (info->min_index + (info->index_size ? index_bias : 0)) * vb->stride;
-   *size = elems * elem_pitch;
-}
-
-static void
-swr_update_poly_stipple(struct swr_context *ctx)
-{
-   struct swr_draw_context *pDC = &ctx->swrDC;
-   const auto &pattern = ctx->poly_stipple.pipe.stipple;
-
-   /* Source and destination must have the same size, because the
-    * pattern is copied into the draw context as raw bytes. */
-   assert(sizeof(pattern) == sizeof(pDC->polyStipple));
-   memcpy(pDC->polyStipple, pattern, sizeof(pattern));
-}
-
-
-/*
- * Return the shader info of the last bound stage, checked in the order
- * geometry, tessellation evaluation, tessellation control; the vertex
- * shader's info is returned when none of those is bound.
- */
-static struct tgsi_shader_info *
-swr_get_last_fe(const struct swr_context *ctx)
-{
-   if (ctx->gs)
-      return &ctx->gs->info.base;
-
-   if (ctx->tes)
-      return &ctx->tes->info.base;
-
-   if (ctx->tcs)
-      return &ctx->tcs->info.base;
-
-   return &ctx->vs->info.base;
-}
-
-
-void
-swr_update_derived(struct pipe_context *pipe,
-                   const struct pipe_draw_info *p_draw_info,
-                   const struct pipe_draw_start_count_bias *draw)
-{
-   struct swr_context *ctx = swr_context(pipe);
-   struct swr_screen *screen = swr_screen(pipe->screen);
-
-   /* When called from swr_clear (p_draw_info = null), set any null
-    * state-objects to the dummy state objects to prevent nullptr dereference
-    * in validation below.
-    *
-    * Important that this remains static for zero initialization.  These
-    * aren't meant to be proper state objects, just empty structs. They will
-    * not be written to.
-    *
-    * Shaders can't be part of the union since they contain std::unordered_map
-    */
-   static struct {
-      union {
-         struct pipe_rasterizer_state rasterizer;
-         struct pipe_depth_stencil_alpha_state depth_stencil;
-         struct swr_blend_state blend;
-      } state;
-      struct swr_vertex_shader vs;
-      struct swr_fragment_shader fs;
-   } swr_dummy;
-
-   if (!p_draw_info) {
-      if (!ctx->rasterizer)
-         ctx->rasterizer = &swr_dummy.state.rasterizer;
-      if (!ctx->depth_stencil)
-         ctx->depth_stencil = &swr_dummy.state.depth_stencil;
-      if (!ctx->blend)
-         ctx->blend = &swr_dummy.state.blend;
-      if (!ctx->vs)
-         ctx->vs = &swr_dummy.vs;
-      if (!ctx->fs)
-         ctx->fs = &swr_dummy.fs;
-   }
-
-   /* Update screen->pipe to current pipe context. */
-   screen->pipe = pipe;
-
-   /* Any state that requires dirty flags to be re-triggered sets this mask */
-   /* For example, user_buffer vertex and index buffers. */
-   unsigned post_update_dirty_flags = 0;
-
-   /* bring resources that changed context up-to-date */
-   swr_invalidate_buffers_after_ctx_change(pipe);
-
-   /* Render Targets */
-   if (ctx->dirty & SWR_NEW_FRAMEBUFFER) {
-      struct pipe_framebuffer_state *fb = &ctx->framebuffer;
-      const struct util_format_description *desc = NULL;
-      bool need_fence = false;
-
-      /* colorbuffer targets */
-      if (fb->nr_cbufs) {
-         for (unsigned i = 0; i < fb->nr_cbufs; ++i)
-            need_fence |= swr_change_rt(
-                  ctx, SWR_ATTACHMENT_COLOR0 + i, fb->cbufs[i]);
-      }
-      for (unsigned i = fb->nr_cbufs; i < SWR_NUM_RENDERTARGETS; ++i)
-         need_fence |= swr_change_rt(ctx, SWR_ATTACHMENT_COLOR0 + i, NULL);
-
-      /* depth/stencil target */
-      if (fb->zsbuf)
-         desc = util_format_description(fb->zsbuf->format);
-      if (fb->zsbuf && util_format_has_depth(desc))
-         need_fence |= swr_change_rt(ctx, SWR_ATTACHMENT_DEPTH, fb->zsbuf);
-      else
-         need_fence |= swr_change_rt(ctx, SWR_ATTACHMENT_DEPTH, NULL);
-
-      if (fb->zsbuf && util_format_has_stencil(desc))
-         need_fence |= swr_change_rt(ctx, SWR_ATTACHMENT_STENCIL, fb->zsbuf);
-      else
-         need_fence |= swr_change_rt(ctx, SWR_ATTACHMENT_STENCIL, NULL);
-
-      /* This fence ensures any attachment changes are resolved before the
-       * next draw */
-      if (need_fence)
-         swr_fence_submit(ctx, screen->flush_fence);
-   }
-
-   /* Raster state */
-   if (ctx->dirty & (SWR_NEW_RASTERIZER |
-                     SWR_NEW_VS | // clipping
-                     SWR_NEW_TES |
-                     SWR_NEW_TCS |
-                     SWR_NEW_FRAMEBUFFER)) {
-      pipe_rasterizer_state *rasterizer = ctx->rasterizer;
-      pipe_framebuffer_state *fb = &ctx->framebuffer;
-
-      SWR_RASTSTATE *rastState = &ctx->derived.rastState;
-      rastState->cullMode = swr_convert_cull_mode(rasterizer->cull_face);
-      rastState->frontWinding = rasterizer->front_ccw
-         ? SWR_FRONTWINDING_CCW
-         : SWR_FRONTWINDING_CW;
-      rastState->scissorEnable = rasterizer->scissor;
-      rastState->pointSize = rasterizer->point_size > 0.0f
-         ? rasterizer->point_size
-         : 1.0f;
-      rastState->lineWidth = rasterizer->line_width > 0.0f
-         ? rasterizer->line_width
-         : 1.0f;
-
-      rastState->pointParam = rasterizer->point_size_per_vertex;
-
-      rastState->pointSpriteEnable = rasterizer->sprite_coord_enable;
-      rastState->pointSpriteTopOrigin =
-         rasterizer->sprite_coord_mode == PIPE_SPRITE_COORD_UPPER_LEFT;
-
-      /* If SWR_MSAA_FORCE_ENABLE is set, turn msaa on */
-      if (screen->msaa_force_enable && !rasterizer->multisample) {
-         /* Force enable and use the value the surface was created with */
-         rasterizer->multisample = true;
-         fb->samples = swr_resource(fb->cbufs[0]->texture)->swr.numSamples;
-         fprintf(stderr,"msaa force enable: %d samples\n", fb->samples);
-      }
-
-      rastState->sampleCount = GetSampleCount(fb->samples);
-      rastState->forcedSampleCount = false;
-      rastState->bIsCenterPattern = !rasterizer->multisample;
-      rastState->pixelLocation = SWR_PIXEL_LOCATION_CENTER;
-
-      /* Only initialize sample positions if msaa is enabled */
-      if (rasterizer->multisample) {
-         for (uint32_t i = 0; i < fb->samples; i++) {
-            const uint8_t *sample = swr_sample_positions[fb->samples-1 + i];
-            rastState->samplePositions.SetXi(i, sample[0] << 4);
-            rastState->samplePositions.SetYi(i, sample[1] << 4);
-            rastState->samplePositions.SetX (i, sample[0] / 16.0f);
-            rastState->samplePositions.SetY (i, sample[1] / 16.0f);
-         }
-         rastState->samplePositions.PrecalcSampleData(fb->samples);
-      }
-
-      bool do_offset = false;
-      switch (rasterizer->fill_front) {
-      case PIPE_POLYGON_MODE_FILL:
-         do_offset = rasterizer->offset_tri;
-         break;
-      case PIPE_POLYGON_MODE_LINE:
-         do_offset = rasterizer->offset_line;
-         break;
-      case PIPE_POLYGON_MODE_POINT:
-         do_offset = rasterizer->offset_point;
-         break;
-      }
-
-      if (do_offset) {
-         rastState->depthBias = rasterizer->offset_units;
-         rastState->slopeScaledDepthBias = rasterizer->offset_scale;
-         rastState->depthBiasClamp = rasterizer->offset_clamp;
-      } else {
-         rastState->depthBias = 0;
-         rastState->slopeScaledDepthBias = 0;
-         rastState->depthBiasClamp = 0;
-      }
-
-      /* translate polygon mode, at least for the front==back case */
-      rastState->fillMode = swr_convert_fill_mode(rasterizer->fill_front);
-
-      struct pipe_surface *zb = fb->zsbuf;
-      if (zb && swr_resource(zb->texture)->has_depth)
-         rastState->depthFormat = swr_resource(zb->texture)->swr.format;
-
-      rastState->depthClipEnable = rasterizer->depth_clip_near;
-      rastState->clipEnable = rasterizer->depth_clip_near | rasterizer->depth_clip_far;
-      rastState->clipHalfZ = rasterizer->clip_halfz;
-
-      ctx->api.pfnSwrSetRastState(ctx->swrContext, rastState);
-   }
-
-   /* Viewport */
-   if (ctx->dirty & (SWR_NEW_VIEWPORT | SWR_NEW_FRAMEBUFFER
-                     | SWR_NEW_RASTERIZER)) {
-      pipe_viewport_state *state = &ctx->viewports[0];
-      pipe_framebuffer_state *fb = &ctx->framebuffer;
-      pipe_rasterizer_state *rasterizer = ctx->rasterizer;
-
-      SWR_VIEWPORT *vp = &ctx->derived.vp[0];
-      SWR_VIEWPORT_MATRICES *vpm = &ctx->derived.vpm;
-
-      for (unsigned i = 0; i < KNOB_NUM_VIEWPORTS_SCISSORS; i++) {
-         vp->x = state->translate[0] - state->scale[0];
-         vp->width = 2 * state->scale[0];
-         vp->y = state->translate[1] - fabs(state->scale[1]);
-         vp->height = 2 * fabs(state->scale[1]);
-         util_viewport_zmin_zmax(state, rasterizer->clip_halfz,
-                                 &vp->minZ, &vp->maxZ);
-
-         if (rasterizer->depth_clip_near) {
-            vp->minZ = 0.0f;
-         }
-
-         if (rasterizer->depth_clip_far) {
-            vp->maxZ = 1.0f;
-         }
-
-         vpm->m00[i] = state->scale[0];
-         vpm->m11[i] = state->scale[1];
-         vpm->m22[i] = state->scale[2];
-         vpm->m30[i] = state->translate[0];
-         vpm->m31[i] = state->translate[1];
-         vpm->m32[i] = state->translate[2];
-
-         /* Now that the matrix is calculated, clip the view coords to screen
-          * size.  OpenGL allows for -ve x,y in the viewport. */
-         if (vp->x < 0.0f) {
-            vp->width += vp->x;
-            vp->x = 0.0f;
-         }
-         if (vp->y < 0.0f) {
-            vp->height += vp->y;
-            vp->y = 0.0f;
-         }
-         vp->width = std::min(vp->width, (float) fb->width - vp->x);
-         vp->height = std::min(vp->height, (float) fb->height - vp->y);
-
-         vp++;
-         state++;
-      }
-      ctx->api.pfnSwrSetViewports(ctx->swrContext, KNOB_NUM_VIEWPORTS_SCISSORS,
-                                  &ctx->derived.vp[0], &ctx->derived.vpm);
-   }
-
-   /* When called from swr_clear (p_draw_info = null), render targets,
-    * rasterState and viewports (dependent on render targets) are the only
-    * necessary validation.  Defer remaining validation by setting
-    * post_update_dirty_flags and clear all dirty flags.  BackendState is
-    * still unconditionally validated below */
-   if (!p_draw_info) {
-      post_update_dirty_flags = ctx->dirty & ~(SWR_NEW_FRAMEBUFFER |
-                                               SWR_NEW_RASTERIZER |
-                                               SWR_NEW_VIEWPORT);
-      ctx->dirty = 0;
-   }
-
-   /* Scissor */
-   if (ctx->dirty & SWR_NEW_SCISSOR) {
-      ctx->api.pfnSwrSetScissorRects(ctx->swrContext, KNOB_NUM_VIEWPORTS_SCISSORS, ctx->swr_scissors);
-   }
-
-   /* Set vertex & index buffers */
-   if (ctx->dirty & SWR_NEW_VERTEX) {
-      const struct pipe_draw_info &info = *p_draw_info;
-
-      /* vertex buffers */
-      SWR_VERTEX_BUFFER_STATE swrVertexBuffers[PIPE_MAX_ATTRIBS];
-      for (UINT i = 0; i < ctx->num_vertex_buffers; i++) {
-         uint32_t size = 0, pitch = 0, elems = 0, partial_inbounds = 0;
-         uint32_t min_vertex_index = 0;
-         const uint8_t *p_data;
-         struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[i];
-
-         pitch = vb->stride;
-         if (vb->is_user_buffer) {
-            /* Client buffer
-             * client memory is one-time use, re-trigger SWR_NEW_VERTEX to
-             * revalidate on each draw */
-            post_update_dirty_flags |= SWR_NEW_VERTEX;
-
-            uint32_t base;
-            swr_user_vbuf_range(&info, ctx->velems, vb, i, &elems, &base, &size, draw->index_bias);
-            partial_inbounds = 0;
-            min_vertex_index = info.min_index + (info.index_size ? draw->index_bias : 0);
-
-            size = AlignUp(size, 4);
-            /* If size of client memory copy is too large, don't copy. The
-             * draw will access user-buffer directly and then block.  This is
-             * faster than queuing many large client draws. */
-            if (size >= screen->client_copy_limit) {
-               post_update_dirty_flags |= SWR_BLOCK_CLIENT_DRAW;
-               p_data = (const uint8_t *) vb->buffer.user;
-            } else {
-               /* Copy only needed vertices to scratch space */
-               const void *ptr = (const uint8_t *) vb->buffer.user + base;
-               ptr = (uint8_t *)swr_copy_to_scratch_space(
-                     ctx, &ctx->scratch->vertex_buffer, ptr, size);
-               p_data = (const uint8_t *)ptr - base;
-            }
-         } else if (vb->buffer.resource) {
-            /* VBO */
-            if (!pitch) {
-               /* If pitch=0 (ie vb->stride), buffer contains a single
-                * constant attribute.  Use the stream_pitch which was
-                * calculated during creation of vertex_elements_state for the
-                * size of the attribute. */
-               size = ctx->velems->stream_pitch[i];
-               elems = 1;
-               partial_inbounds = 0;
-               min_vertex_index = 0;
-            } else {
-               /* size is based on buffer->width0 rather than info.max_index
-                * to prevent having to validate VBO on each draw. */
-               size = vb->buffer.resource->width0;
-               elems = size / pitch;
-               partial_inbounds = size % pitch;
-               min_vertex_index = 0;
-            }
-
-            p_data = swr_resource_data(vb->buffer.resource) + vb->buffer_offset;
-         } else
-            p_data = NULL;
-
-         swrVertexBuffers[i] = {0};
-         swrVertexBuffers[i].index = i;
-         swrVertexBuffers[i].pitch = pitch;
-         swrVertexBuffers[i].xpData = (gfxptr_t) p_data;
-         swrVertexBuffers[i].size = size;
-         swrVertexBuffers[i].minVertex = min_vertex_index;
-         swrVertexBuffers[i].maxVertex = elems;
-         swrVertexBuffers[i].partialInboundsSize = partial_inbounds;
-      }
-
-      ctx->api.pfnSwrSetVertexBuffers(
-         ctx->swrContext, ctx->num_vertex_buffers, swrVertexBuffers);
-
-      /* index buffer, if required (info passed in by swr_draw_vbo) */
-      SWR_FORMAT index_type = R32_UINT; /* Default for non-indexed draws */
-      if (info.index_size) {
-         const uint8_t *p_data;
-         uint32_t size, pitch;
-
-         pitch = info.index_size ? info.index_size : sizeof(uint32_t);
-         index_type = swr_convert_index_type(pitch);
-
-         if (!info.has_user_indices) {
-            /* VBO
-             * size is based on buffer->width0 rather than info.count
-             * to prevent having to validate VBO on each draw */
-            size = info.index.resource->width0;
-            p_data = swr_resource_data(info.index.resource);
-         } else {
-            /* Client buffer
-             * client memory is one-time use, re-trigger SWR_NEW_VERTEX to
-             * revalidate on each draw */
-            post_update_dirty_flags |= SWR_NEW_VERTEX;
-
-            size = draw->count * pitch;
-
-            size = AlignUp(size, 4);
-            /* If size of client memory copy is too large, don't copy. The
-             * draw will access user-buffer directly and then block.  This is
-             * faster than queuing many large client draws. */
-            if (size >= screen->client_copy_limit) {
-               post_update_dirty_flags |= SWR_BLOCK_CLIENT_DRAW;
-               p_data = (const uint8_t *) info.index.user +
-                        draw->start * info.index_size;
-            } else {
-               /* Copy indices to scratch space */
-               const void *ptr = (char*)info.index.user +
-                                 draw->start * info.index_size;
-               ptr = swr_copy_to_scratch_space(
-                     ctx, &ctx->scratch->index_buffer, ptr, size);
-               p_data = (const uint8_t *)ptr;
-            }
-         }
-
-         SWR_INDEX_BUFFER_STATE swrIndexBuffer;
-         swrIndexBuffer.format = swr_convert_index_type(info.index_size);
-         swrIndexBuffer.xpIndices = (gfxptr_t) p_data;
-         swrIndexBuffer.size = size;
-
-         ctx->api.pfnSwrSetIndexBuffer(ctx->swrContext, &swrIndexBuffer);
-      }
-
-      struct swr_vertex_element_state *velems = ctx->velems;
-      if (velems && velems->fsState.indexType != index_type) {
-         velems->fsFunc = NULL;
-         velems->fsState.indexType = index_type;
-      }
-   }
-
-   /* GeometryShader */
-   if (ctx->dirty & (SWR_NEW_GS |
-                     SWR_NEW_VS |
-                     SWR_NEW_TCS |
-                     SWR_NEW_TES |
-                     SWR_NEW_SAMPLER |
-                     SWR_NEW_SAMPLER_VIEW)) {
-      if (ctx->gs) {
-         swr_jit_gs_key key;
-         swr_generate_gs_key(key, ctx, ctx->gs);
-         auto search = ctx->gs->map.find(key);
-         PFN_GS_FUNC func;
-         if (search != ctx->gs->map.end()) {
-            func = search->second->shader;
-         } else {
-            func = swr_compile_gs(ctx, key);
-         }
-         ctx->api.pfnSwrSetGsFunc(ctx->swrContext, func);
-
-         /* JIT sampler state */
-         if (ctx->dirty & SWR_NEW_SAMPLER) {
-            swr_update_sampler_state(ctx,
-                                     PIPE_SHADER_GEOMETRY,
-                                     key.nr_samplers,
-                                     ctx->swrDC.samplersGS);
-         }
-
-         /* JIT sampler view state */
-         if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) {
-            swr_update_texture_state(ctx,
-                                     PIPE_SHADER_GEOMETRY,
-                                     key.nr_sampler_views,
-                                     ctx->swrDC.texturesGS);
-         }
-
-         ctx->api.pfnSwrSetGsState(ctx->swrContext, &ctx->gs->gsState);
-      } else {
-         SWR_GS_STATE state = { 0 };
-         ctx->api.pfnSwrSetGsState(ctx->swrContext, &state);
-         ctx->api.pfnSwrSetGsFunc(ctx->swrContext, NULL);
-      }
-   }
-
-   // We may need to restore tessellation state
-   // This restored state may be however overwritten
-   // during shader compilation
-   if (ctx->dirty & SWR_NEW_TS) {
-      if (ctx->tes != nullptr) {
-         ctx->tsState = ctx->tes->ts_state;
-         ctx->api.pfnSwrSetTsState(ctx->swrContext, &ctx->tsState);
-      } else {
-         SWR_TS_STATE state = { 0 };
-         ctx->api.pfnSwrSetTsState(ctx->swrContext, &state);
-      }
-   }
-
-   // Tessellation Evaluation Shader
-   // Compile TES first, because TCS is optional
-   if (ctx->dirty & (SWR_NEW_GS |
-                     SWR_NEW_VS |
-                     SWR_NEW_TCS |
-                     SWR_NEW_TES |
-                     SWR_NEW_SAMPLER |
-                     SWR_NEW_SAMPLER_VIEW)) {
-      if (ctx->tes) {
-         swr_jit_tes_key key;
-         swr_generate_tes_key(key, ctx, ctx->tes);
-
-         auto search = ctx->tes->map.find(key);
-         PFN_TES_FUNC func;
-         if (search != ctx->tes->map.end()) {
-            func = search->second->shader;
-         } else {
-            func = swr_compile_tes(ctx, key);
-         }
-
-         ctx->api.pfnSwrSetDsFunc(ctx->swrContext, func);
-
-         /* JIT sampler state */
-         if (ctx->dirty & SWR_NEW_SAMPLER) {
-            swr_update_sampler_state(ctx,
-                                     PIPE_SHADER_TESS_EVAL,
-                                     key.nr_samplers,
-                                     ctx->swrDC.samplersTES);
-         }
-
-         /* JIT sampler view state */
-         if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) {
-            swr_update_texture_state(ctx,
-                                     PIPE_SHADER_TESS_EVAL,
-                                     key.nr_sampler_views,
-                                     ctx->swrDC.texturesTES);
-         }
-
-         // Update tessellation state in case it's been updated
-         ctx->api.pfnSwrSetTsState(ctx->swrContext, &ctx->tsState);
-      } else {
-         ctx->api.pfnSwrSetDsFunc(ctx->swrContext, NULL);
-      }
-   }
-
-   /* Tessellation Control Shader */
-   if (ctx->dirty & (SWR_NEW_GS |
-                     SWR_NEW_VS |
-                     SWR_NEW_TCS |
-                     SWR_NEW_TES |
-                     SWR_NEW_SAMPLER |
-                     SWR_NEW_SAMPLER_VIEW)) {
-      if (ctx->tcs) {
-         ctx->tcs->vertices_per_patch = ctx->patch_vertices;
-
-         swr_jit_tcs_key key;
-         swr_generate_tcs_key(key, ctx, ctx->tcs);
-
-         auto search = ctx->tcs->map.find(key);
-         PFN_TCS_FUNC func;
-         if (search != ctx->tcs->map.end()) {
-            func = search->second->shader;
-         } else {
-            func = swr_compile_tcs(ctx, key);
-         }
-
-         ctx->api.pfnSwrSetHsFunc(ctx->swrContext, func);
-
-         /* JIT sampler state */
-         if (ctx->dirty & SWR_NEW_SAMPLER) {
-            swr_update_sampler_state(ctx,
-                                     PIPE_SHADER_TESS_CTRL,
-                                     key.nr_samplers,
-                                     ctx->swrDC.samplersTCS);
-         }
-
-         /* JIT sampler view state */
-         if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) {
-            swr_update_texture_state(ctx,
-                                     PIPE_SHADER_TESS_CTRL,
-                                     key.nr_sampler_views,
-                                     ctx->swrDC.texturesTCS);
-         }
-
-         // Update tessellation state in case it's been updated
-         ctx->api.pfnSwrSetTsState(ctx->swrContext, &ctx->tsState);
-      } else {
-         ctx->api.pfnSwrSetHsFunc(ctx->swrContext, NULL);
-      }
-   }
-
-   /* VertexShader */
-   if (ctx->dirty
-       & (SWR_NEW_VS | SWR_NEW_RASTERIZER | // for clip planes
-          SWR_NEW_SAMPLER | SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) {
-      swr_jit_vs_key key;
-      swr_generate_vs_key(key, ctx, ctx->vs);
-      auto search = ctx->vs->map.find(key);
-      PFN_VERTEX_FUNC func;
-      if (search != ctx->vs->map.end()) {
-         func = search->second->shader;
-      } else {
-         func = swr_compile_vs(ctx, key);
-      }
-      ctx->api.pfnSwrSetVertexFunc(ctx->swrContext, func);
-
-      /* JIT sampler state */
-      if (ctx->dirty & SWR_NEW_SAMPLER) {
-         swr_update_sampler_state(
-            ctx, PIPE_SHADER_VERTEX, key.nr_samplers, ctx->swrDC.samplersVS);
-      }
-
-      /* JIT sampler view state */
-      if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) {
-         swr_update_texture_state(ctx,
-                                  PIPE_SHADER_VERTEX,
-                                  key.nr_sampler_views,
-                                  ctx->swrDC.texturesVS);
-      }
-   }
-
-   /* work around the fact that poly stipple also affects lines */
-   /* and points, since we rasterize them as triangles, too */
-   /* Has to be before fragment shader, since it sets SWR_NEW_FS */
-   if (p_draw_info) {
-      bool new_prim_is_poly =
-         (u_reduced_prim(p_draw_info->mode) == PIPE_PRIM_TRIANGLES) &&
-         (ctx->derived.rastState.fillMode == SWR_FILLMODE_SOLID);
-      if (new_prim_is_poly != ctx->poly_stipple.prim_is_poly) {
-         ctx->dirty |= SWR_NEW_FS;
-         ctx->poly_stipple.prim_is_poly = new_prim_is_poly;
-      }
-   }
-
-   /* FragmentShader */
-   if (ctx->dirty & (SWR_NEW_FS |
-                     SWR_NEW_VS |
-                     SWR_NEW_GS |
-                     SWR_NEW_TES |
-                     SWR_NEW_TCS |
-                     SWR_NEW_RASTERIZER |
-                     SWR_NEW_SAMPLER |
-                     SWR_NEW_SAMPLER_VIEW |
-                     SWR_NEW_FRAMEBUFFER)) {
-      swr_jit_fs_key key;
-      swr_generate_fs_key(key, ctx, ctx->fs);
-      auto search = ctx->fs->map.find(key);
-      PFN_PIXEL_KERNEL func;
-      if (search != ctx->fs->map.end()) {
-         func = search->second->shader;
-      } else {
-         func = swr_compile_fs(ctx, key);
-      }
-      SWR_PS_STATE psState = {0};
-      psState.pfnPixelShader = func;
-      psState.killsPixel = ctx->fs->info.base.uses_kill;
-      psState.inputCoverage = SWR_INPUT_COVERAGE_NORMAL;
-      psState.writesODepth = ctx->fs->info.base.writes_z;
-      psState.usesSourceDepth = ctx->fs->info.base.reads_z;
-      psState.shadingRate = SWR_SHADING_RATE_PIXEL;
-      psState.renderTargetMask = (1 << ctx->framebuffer.nr_cbufs) - 1;
-      psState.posOffset = SWR_PS_POSITION_SAMPLE_NONE;
-      uint32_t barycentricsMask = 0;
-#if 0
-      // when we switch to mesa-master
-      if (ctx->fs->info.base.uses_persp_center ||
-          ctx->fs->info.base.uses_linear_center)
-         barycentricsMask |= SWR_BARYCENTRIC_PER_PIXEL_MASK;
-      if (ctx->fs->info.base.uses_persp_centroid ||
-          ctx->fs->info.base.uses_linear_centroid)
-         barycentricsMask |= SWR_BARYCENTRIC_CENTROID_MASK;
-      if (ctx->fs->info.base.uses_persp_sample ||
-          ctx->fs->info.base.uses_linear_sample)
-         barycentricsMask |= SWR_BARYCENTRIC_PER_SAMPLE_MASK;
-#else
-      for (unsigned i = 0; i < ctx->fs->info.base.num_inputs; i++) {
-         switch (ctx->fs->info.base.input_interpolate_loc[i]) {
-         case TGSI_INTERPOLATE_LOC_CENTER:
-            barycentricsMask |= SWR_BARYCENTRIC_PER_PIXEL_MASK;
-            break;
-         case TGSI_INTERPOLATE_LOC_CENTROID:
-            barycentricsMask |= SWR_BARYCENTRIC_CENTROID_MASK;
-            break;
-         case TGSI_INTERPOLATE_LOC_SAMPLE:
-            barycentricsMask |= SWR_BARYCENTRIC_PER_SAMPLE_MASK;
-            break;
-         }
-      }
-#endif
-      psState.barycentricsMask = barycentricsMask;
-      psState.usesUAV = false; // XXX
-      psState.forceEarlyZ = false;
-      ctx->api.pfnSwrSetPixelShaderState(ctx->swrContext, &psState);
-
-      /* JIT sampler state */
-      if (ctx->dirty & (SWR_NEW_SAMPLER |
-                        SWR_NEW_FS)) {
-         swr_update_sampler_state(ctx,
-                                  PIPE_SHADER_FRAGMENT,
-                                  key.nr_samplers,
-                                  ctx->swrDC.samplersFS);
-      }
-
-      /* JIT sampler view state */
-      if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW |
-                        SWR_NEW_FRAMEBUFFER |
-                        SWR_NEW_FS)) {
-         swr_update_texture_state(ctx,
-                                  PIPE_SHADER_FRAGMENT,
-                                  key.nr_sampler_views,
-                                  ctx->swrDC.texturesFS);
-      }
-   }
-
-
-   /* VertexShader Constants */
-   if (ctx->dirty & SWR_NEW_VSCONSTANTS) {
-      swr_update_constants(ctx, PIPE_SHADER_VERTEX);
-   }
-
-   /* FragmentShader Constants */
-   if (ctx->dirty & SWR_NEW_FSCONSTANTS) {
-      swr_update_constants(ctx, PIPE_SHADER_FRAGMENT);
-   }
-
-   /* GeometryShader Constants */
-   if (ctx->dirty & SWR_NEW_GSCONSTANTS) {
-      swr_update_constants(ctx, PIPE_SHADER_GEOMETRY);
-   }
-
-   /* Tessellation Control Shader Constants */
-   if (ctx->dirty & SWR_NEW_TCSCONSTANTS) {
-      swr_update_constants(ctx, PIPE_SHADER_TESS_CTRL);
-   }
-
-   /* Tessellation Evaluation Shader Constants */
-   if (ctx->dirty & SWR_NEW_TESCONSTANTS) {
-      swr_update_constants(ctx, PIPE_SHADER_TESS_EVAL);
-   }
-
-   /* Depth/stencil state */
-   if (ctx->dirty & (SWR_NEW_DEPTH_STENCIL_ALPHA | SWR_NEW_FRAMEBUFFER)) {
-      struct pipe_depth_stencil_alpha_state *depth = ctx->depth_stencil;
-      struct pipe_stencil_state *stencil = depth->stencil;
-      SWR_DEPTH_STENCIL_STATE depthStencilState = {{0}};
-      SWR_DEPTH_BOUNDS_STATE depthBoundsState = {0};
-
-      /* XXX, incomplete.  Need to flesh out stencil & alpha test state
-      struct pipe_stencil_state *front_stencil =
-      ctx->depth_stencil.stencil[0];
-      struct pipe_stencil_state *back_stencil = ctx->depth_stencil.stencil[1];
-      */
-      if (stencil[0].enabled) {
-         depthStencilState.stencilWriteEnable = 1;
-         depthStencilState.stencilTestEnable = 1;
-         depthStencilState.stencilTestFunc =
-            swr_convert_depth_func(stencil[0].func);
-
-         depthStencilState.stencilPassDepthPassOp =
-            swr_convert_stencil_op(stencil[0].zpass_op);
-         depthStencilState.stencilPassDepthFailOp =
-            swr_convert_stencil_op(stencil[0].zfail_op);
-         depthStencilState.stencilFailOp =
-            swr_convert_stencil_op(stencil[0].fail_op);
-         depthStencilState.stencilWriteMask = stencil[0].writemask;
-         depthStencilState.stencilTestMask = stencil[0].valuemask;
-         depthStencilState.stencilRefValue = ctx->stencil_ref.ref_value[0];
-      }
-      if (stencil[1].enabled) {
-         depthStencilState.doubleSidedStencilTestEnable = 1;
-
-         depthStencilState.backfaceStencilTestFunc =
-            swr_convert_depth_func(stencil[1].func);
-
-         depthStencilState.backfaceStencilPassDepthPassOp =
-            swr_convert_stencil_op(stencil[1].zpass_op);
-         depthStencilState.backfaceStencilPassDepthFailOp =
-            swr_convert_stencil_op(stencil[1].zfail_op);
-         depthStencilState.backfaceStencilFailOp =
-            swr_convert_stencil_op(stencil[1].fail_op);
-         depthStencilState.backfaceStencilWriteMask = stencil[1].writemask;
-         depthStencilState.backfaceStencilTestMask = stencil[1].valuemask;
-
-         depthStencilState.backfaceStencilRefValue =
-            ctx->stencil_ref.ref_value[1];
-      }
-
-      depthStencilState.depthTestEnable = depth->depth_enabled;
-      depthStencilState.depthTestFunc = swr_convert_depth_func(depth->depth_func);
-      depthStencilState.depthWriteEnable = depth->depth_writemask;
-      ctx->api.pfnSwrSetDepthStencilState(ctx->swrContext, &depthStencilState);
-
-      depthBoundsState.depthBoundsTestEnable = depth->depth_bounds_test;
-      depthBoundsState.depthBoundsTestMinValue = depth->depth_bounds_min;
-      depthBoundsState.depthBoundsTestMaxValue = depth->depth_bounds_max;
-      ctx->api.pfnSwrSetDepthBoundsState(ctx->swrContext, &depthBoundsState);
-   }
-
-   /* Blend State */
-   if (ctx->dirty & (SWR_NEW_BLEND |
-                     SWR_NEW_RASTERIZER |
-                     SWR_NEW_FRAMEBUFFER |
-                     SWR_NEW_DEPTH_STENCIL_ALPHA)) {
-      struct pipe_framebuffer_state *fb = &ctx->framebuffer;
-
-      SWR_BLEND_STATE blendState;
-      memcpy(&blendState, &ctx->blend->blendState, sizeof(blendState));
-      blendState.constantColor[0] = ctx->blend_color.color[0];
-      blendState.constantColor[1] = ctx->blend_color.color[1];
-      blendState.constantColor[2] = ctx->blend_color.color[2];
-      blendState.constantColor[3] = ctx->blend_color.color[3];
-      blendState.alphaTestReference =
-         *((uint32_t*)&ctx->depth_stencil->alpha_ref_value);
-
-      blendState.sampleMask = ctx->sample_mask;
-      blendState.sampleCount = GetSampleCount(fb->samples);
-
-      /* If there are no color buffers bound, disable writes on RT0
-       * and skip loop */
-      if (fb->nr_cbufs == 0) {
-         blendState.renderTarget[0].writeDisableRed = 1;
-         blendState.renderTarget[0].writeDisableGreen = 1;
-         blendState.renderTarget[0].writeDisableBlue = 1;
-         blendState.renderTarget[0].writeDisableAlpha = 1;
-         ctx->api.pfnSwrSetBlendFunc(ctx->swrContext, 0, NULL);
-      }
-      else
-         for (int target = 0;
-               target < std::min(SWR_NUM_RENDERTARGETS,
-                                 PIPE_MAX_COLOR_BUFS);
-               target++) {
-            if (!fb->cbufs[target])
-               continue;
-
-            struct swr_resource *colorBuffer =
-               swr_resource(fb->cbufs[target]->texture);
-
-            BLEND_COMPILE_STATE compileState;
-            memset(&compileState, 0, sizeof(compileState));
-            compileState.format = colorBuffer->swr.format;
-            memcpy(&compileState.blendState,
-                   &ctx->blend->compileState[target],
-                   sizeof(compileState.blendState));
-
-            const SWR_FORMAT_INFO& info = GetFormatInfo(compileState.format);
-            if (compileState.blendState.logicOpEnable &&
-                ((info.type[0] == SWR_TYPE_FLOAT) || info.isSRGB)) {
-               compileState.blendState.logicOpEnable = false;
-            }
-
-            if (info.type[0] == SWR_TYPE_SINT || info.type[0] == SWR_TYPE_UINT)
-               compileState.blendState.blendEnable = false;
-
-            if (compileState.blendState.blendEnable == false &&
-                compileState.blendState.logicOpEnable == false &&
-                ctx->depth_stencil->alpha_enabled == 0) {
-               ctx->api.pfnSwrSetBlendFunc(ctx->swrContext, target, NULL);
-               continue;
-            }
-
-            compileState.desc.alphaTestEnable =
-               ctx->depth_stencil->alpha_enabled;
-            compileState.desc.independentAlphaBlendEnable =
-               (compileState.blendState.sourceBlendFactor !=
-                compileState.blendState.sourceAlphaBlendFactor) ||
-               (compileState.blendState.destBlendFactor !=
-                compileState.blendState.destAlphaBlendFactor) ||
-               (compileState.blendState.colorBlendFunc !=
-                compileState.blendState.alphaBlendFunc);
-            compileState.desc.alphaToCoverageEnable =
-               ctx->blend->pipe.alpha_to_coverage;
-            compileState.desc.sampleMaskEnable = (blendState.sampleMask != 0);
-            compileState.desc.numSamples = fb->samples;
-
-            compileState.alphaTestFunction =
-               swr_convert_depth_func(ctx->depth_stencil->alpha_func);
-            compileState.alphaTestFormat = ALPHA_TEST_FLOAT32; // xxx
-
-            compileState.Canonicalize();
-
-            PFN_BLEND_JIT_FUNC func = NULL;
-            auto search = ctx->blendJIT->find(compileState);
-            if (search != ctx->blendJIT->end()) {
-               func = search->second;
-            } else {
-               HANDLE hJitMgr = screen->hJitMgr;
-               func = JitCompileBlend(hJitMgr, compileState);
-               debug_printf("BLEND shader %p\n", func);
-               assert(func && "Error: BlendShader = NULL");
-
-               ctx->blendJIT->insert(std::make_pair(compileState, func));
-            }
-            ctx->api.pfnSwrSetBlendFunc(ctx->swrContext, target, func);
-         }
-
-      ctx->api.pfnSwrSetBlendState(ctx->swrContext, &blendState);
-   }
-
-   if (ctx->dirty & SWR_NEW_STIPPLE) {
-      swr_update_poly_stipple(ctx);
-   }
-
-   if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_TCS | SWR_NEW_TES | SWR_NEW_SO | SWR_NEW_RASTERIZER)) {
-      ctx->vs->soState.rasterizerDisable =
-         ctx->rasterizer->rasterizer_discard;
-      ctx->api.pfnSwrSetSoState(ctx->swrContext, &ctx->vs->soState);
-
-      pipe_stream_output_info *stream_output = &ctx->vs->pipe.stream_output;
-
-      for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) {
-         SWR_STREAMOUT_BUFFER buffer = {0};
-         if (ctx->so_targets[i]) {
-             buffer.enable = true;
-             buffer.pBuffer =
-                (gfxptr_t)(swr_resource_data(ctx->so_targets[i]->buffer) +
-                             ctx->so_targets[i]->buffer_offset);
-             buffer.bufferSize = ctx->so_targets[i]->buffer_size >> 2;
-             buffer.pitch = stream_output->stride[i];
-             buffer.streamOffset = 0;
-	 }
-
-         ctx->api.pfnSwrSetSoBuffers(ctx->swrContext, &buffer, i);
-      }
-   }
-
-
-   if (ctx->dirty & (SWR_NEW_CLIP | SWR_NEW_RASTERIZER | SWR_NEW_VS)) {
-      // shader exporting clip distances overrides all user clip planes
-      if (ctx->rasterizer->clip_plane_enable &&
-          !swr_get_last_fe(ctx)->num_written_clipdistance)
-      {
-         swr_draw_context *pDC = &ctx->swrDC;
-         memcpy(pDC->userClipPlanes,
-                ctx->clip.ucp,
-                sizeof(pDC->userClipPlanes));
-      }
-   }
-
-   // set up backend state
-   SWR_BACKEND_STATE backendState = {0};
-   if (ctx->gs) {
-      backendState.numAttributes = ctx->gs->info.base.num_outputs - 1;
-   } else
-   if (ctx->tes) {
-      backendState.numAttributes = ctx->tes->info.base.num_outputs - 1;
-      // no case for TCS, because if TCS is active, TES must be active
-      // as well - pipeline stages after tessellation does not support patches
-   }  else {
-      backendState.numAttributes = ctx->vs->info.base.num_outputs - 1;
-      if (ctx->fs->info.base.uses_primid) {
-         backendState.numAttributes++;
-         backendState.swizzleEnable = true;
-         for (unsigned i = 0; i < sizeof(backendState.numComponents); i++) {
-            backendState.swizzleMap[i].sourceAttrib = i;
-         }
-         backendState.swizzleMap[ctx->vs->info.base.num_outputs - 1].constantSource =
-            SWR_CONSTANT_SOURCE_PRIM_ID;
-         backendState.swizzleMap[ctx->vs->info.base.num_outputs - 1].componentOverrideMask = 1;
-      }
-   }
-   if (ctx->rasterizer->sprite_coord_enable)
-      backendState.numAttributes++;
-
-   backendState.numAttributes = std::min((size_t)backendState.numAttributes,
-                                         sizeof(backendState.numComponents));
-   for (unsigned i = 0; i < backendState.numAttributes; i++)
-      backendState.numComponents[i] = 4;
-   backendState.constantInterpolationMask = ctx->fs->constantMask |
-      (ctx->rasterizer->flatshade ? ctx->fs->flatConstantMask : 0);
-   backendState.pointSpriteTexCoordMask = ctx->fs->pointSpriteMask;
-
-   struct tgsi_shader_info *pLastFE = swr_get_last_fe(ctx);
-
-   backendState.readRenderTargetArrayIndex = pLastFE->writes_layer;
-   backendState.readViewportArrayIndex = pLastFE->writes_viewport_index;
-   backendState.vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize
-
-   backendState.clipDistanceMask =
-      pLastFE->num_written_clipdistance ?
-      pLastFE->clipdist_writemask & ctx->rasterizer->clip_plane_enable :
-      ctx->rasterizer->clip_plane_enable;
-
-   backendState.cullDistanceMask =
-      pLastFE->culldist_writemask << pLastFE->num_written_clipdistance;
-
-   // Assume old layout of SGV, POSITION, CLIPCULL, ATTRIB
-   backendState.vertexClipCullOffset = backendState.vertexAttribOffset - 2;
-
-   ctx->api.pfnSwrSetBackendState(ctx->swrContext, &backendState);
-
-   /* Ensure that any in-progress attachment change StoreTiles finish */
-   if (swr_is_fence_pending(screen->flush_fence))
-      swr_fence_finish(pipe->screen, NULL, screen->flush_fence, 0);
-
-   /* Finally, update the in-use status of all resources involved in draw */
-   swr_update_resource_status(pipe, p_draw_info);
-
-   ctx->dirty = post_update_dirty_flags;
-}
-
-
-static struct pipe_stream_output_target *
-swr_create_so_target(struct pipe_context *pipe,
-                     struct pipe_resource *buffer,
-                     unsigned buffer_offset,
-                     unsigned buffer_size)
-{
-   struct pipe_stream_output_target *target;
-
-   target = CALLOC_STRUCT(pipe_stream_output_target);
-   if (!target)
-      return NULL;
-
-   target->context = pipe;
-   target->reference.count = 1;
-   pipe_resource_reference(&target->buffer, buffer);
-   target->buffer_offset = buffer_offset;
-   target->buffer_size = buffer_size;
-   return target;
-}
-
-static void
-swr_destroy_so_target(struct pipe_context *pipe,
-                      struct pipe_stream_output_target *target)
-{
-   pipe_resource_reference(&target->buffer, NULL);
-   FREE(target);
-}
-
-static void
-swr_set_so_targets(struct pipe_context *pipe,
-                   unsigned num_targets,
-                   struct pipe_stream_output_target **targets,
-                   const unsigned *offsets)
-{
-   struct swr_context *swr = swr_context(pipe);
-   uint32_t i;
-
-   assert(num_targets <= MAX_SO_STREAMS);
-
-   for (i = 0; i < num_targets; i++) {
-      pipe_so_target_reference(
-         (struct pipe_stream_output_target **)&swr->so_targets[i],
-         targets[i]);
-   }
-
-   for (/* fall-through */; i < swr->num_so_targets; i++) {
-      pipe_so_target_reference(
-         (struct pipe_stream_output_target **)&swr->so_targets[i], NULL);
-   }
-
-   swr->num_so_targets = num_targets;
-   swr->swrDC.soPrims = &swr->so_primCounter;
-
-   swr->dirty |= SWR_NEW_SO;
-}
-
-static void
-swr_set_patch_vertices(struct pipe_context *pipe, uint8_t patch_vertices)
-{
-   struct swr_context *swr = swr_context(pipe);
-
-   swr->patch_vertices = patch_vertices;
-}
-
-
-void
-swr_state_init(struct pipe_context *pipe)
-{
-   pipe->create_blend_state = swr_create_blend_state;
-   pipe->bind_blend_state = swr_bind_blend_state;
-   pipe->delete_blend_state = swr_delete_blend_state;
-
-   pipe->create_depth_stencil_alpha_state = swr_create_depth_stencil_state;
-   pipe->bind_depth_stencil_alpha_state = swr_bind_depth_stencil_state;
-   pipe->delete_depth_stencil_alpha_state = swr_delete_depth_stencil_state;
-
-   pipe->create_rasterizer_state = swr_create_rasterizer_state;
-   pipe->bind_rasterizer_state = swr_bind_rasterizer_state;
-   pipe->delete_rasterizer_state = swr_delete_rasterizer_state;
-
-   pipe->create_sampler_state = swr_create_sampler_state;
-   pipe->bind_sampler_states = swr_bind_sampler_states;
-   pipe->delete_sampler_state = swr_delete_sampler_state;
-
-   pipe->create_sampler_view = swr_create_sampler_view;
-   pipe->set_sampler_views = swr_set_sampler_views;
-   pipe->sampler_view_destroy = swr_sampler_view_destroy;
-
-   pipe->create_vs_state = swr_create_vs_state;
-   pipe->bind_vs_state = swr_bind_vs_state;
-   pipe->delete_vs_state = swr_delete_vs_state;
-
-   pipe->create_fs_state = swr_create_fs_state;
-   pipe->bind_fs_state = swr_bind_fs_state;
-   pipe->delete_fs_state = swr_delete_fs_state;
-
-   pipe->create_gs_state = swr_create_gs_state;
-   pipe->bind_gs_state = swr_bind_gs_state;
-   pipe->delete_gs_state = swr_delete_gs_state;
-
-   pipe->create_tcs_state = swr_create_tcs_state;
-   pipe->bind_tcs_state = swr_bind_tcs_state;
-   pipe->delete_tcs_state = swr_delete_tcs_state;
-
-   pipe->create_tes_state = swr_create_tes_state;
-   pipe->bind_tes_state = swr_bind_tes_state;
-   pipe->delete_tes_state = swr_delete_tes_state;
-
-   pipe->set_constant_buffer = swr_set_constant_buffer;
-
-   pipe->create_vertex_elements_state = swr_create_vertex_elements_state;
-   pipe->bind_vertex_elements_state = swr_bind_vertex_elements_state;
-   pipe->delete_vertex_elements_state = swr_delete_vertex_elements_state;
-
-   pipe->set_vertex_buffers = swr_set_vertex_buffers;
-
-   pipe->set_polygon_stipple = swr_set_polygon_stipple;
-   pipe->set_clip_state = swr_set_clip_state;
-   pipe->set_scissor_states = swr_set_scissor_states;
-   pipe->set_viewport_states = swr_set_viewport_states;
-
-   pipe->set_framebuffer_state = swr_set_framebuffer_state;
-
-   pipe->set_blend_color = swr_set_blend_color;
-   pipe->set_stencil_ref = swr_set_stencil_ref;
-
-   pipe->set_sample_mask = swr_set_sample_mask;
-   pipe->get_sample_position = swr_get_sample_position;
-
-   pipe->create_stream_output_target = swr_create_so_target;
-   pipe->stream_output_target_destroy = swr_destroy_so_target;
-   pipe->set_stream_output_targets = swr_set_so_targets;
-
-   pipe->set_patch_vertices = swr_set_patch_vertices;
-}
diff --git a/src/gallium/drivers/swr/swr_state.h b/src/gallium/drivers/swr/swr_state.h
deleted file mode 100644
index 75a70de0b1a..00000000000
--- a/src/gallium/drivers/swr/swr_state.h
+++ /dev/null
@@ -1,426 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2015 Intel Corporation.   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- ***************************************************************************/
-
-#ifndef SWR_STATE_H
-#define SWR_STATE_H
-
-#include "pipe/p_defines.h"
-#include "tgsi/tgsi_scan.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_dump.h"
-#include "gallivm/lp_bld_init.h"
-#include "gallivm/lp_bld_tgsi.h"
-#include "util/crc32.h"
-#include "api.h"
-#include "swr_tex_sample.h"
-#include "swr_shader.h"
-#include <unordered_map>
-#include <memory>
-
-template <typename T>
-struct ShaderVariant {
-   struct gallivm_state *gallivm;
-   T shader;
-
-   ShaderVariant(struct gallivm_state *gs, T code) : gallivm(gs), shader(code) {}
-   ~ShaderVariant() { gallivm_destroy(gallivm); }
-};
-
-using PFN_TCS_FUNC = PFN_HS_FUNC;
-using PFN_TES_FUNC = PFN_DS_FUNC;
-
-typedef ShaderVariant<PFN_VERTEX_FUNC> VariantVS;
-typedef ShaderVariant<PFN_PIXEL_KERNEL> VariantFS;
-typedef ShaderVariant<PFN_GS_FUNC> VariantGS;
-typedef ShaderVariant<PFN_TCS_FUNC> VariantTCS;
-typedef ShaderVariant<PFN_TES_FUNC> VariantTES;
-
-/* skeleton */
-struct swr_vertex_shader {
-   struct pipe_shader_state pipe;
-   struct lp_tgsi_info info;
-   std::unordered_map<swr_jit_vs_key, std::unique_ptr<VariantVS>> map;
-   SWR_STREAMOUT_STATE soState;
-   PFN_SO_FUNC soFunc[PIPE_PRIM_MAX] {0};
-};
-
-struct swr_fragment_shader {
-   struct pipe_shader_state pipe;
-   struct lp_tgsi_info info;
-   uint32_t constantMask;
-   uint32_t flatConstantMask;
-   uint32_t pointSpriteMask;
-   std::unordered_map<swr_jit_fs_key, std::unique_ptr<VariantFS>> map;
-};
-
-struct swr_geometry_shader {
-   struct pipe_shader_state pipe;
-   struct lp_tgsi_info info;
-   SWR_GS_STATE gsState;
-
-   std::unordered_map<swr_jit_gs_key, std::unique_ptr<VariantGS>> map;
-};
-
-struct swr_tess_control_shader {
-   struct pipe_shader_state pipe;
-   struct lp_tgsi_info info;
-   uint32_t vertices_per_patch;
-
-   std::unordered_map<swr_jit_tcs_key, std::unique_ptr<VariantTCS>> map;
-};
-
-struct swr_tess_evaluation_shader {
-   struct pipe_shader_state pipe;
-   struct lp_tgsi_info info;
-   SWR_TS_STATE ts_state;
-
-   std::unordered_map<swr_jit_tes_key, std::unique_ptr<VariantTES>> map;
-};
-
-
-/* Vertex element state */
-struct swr_vertex_element_state {
-   FETCH_COMPILE_STATE fsState;
-   PFN_FETCH_FUNC fsFunc {NULL};
-   uint32_t stream_pitch[PIPE_MAX_ATTRIBS] {0};
-   uint32_t min_instance_div[PIPE_MAX_ATTRIBS] {0};
-   uint32_t instanced_bufs {0};
-   std::unordered_map<swr_jit_fetch_key, PFN_FETCH_FUNC> map;
-};
-
-struct swr_blend_state {
-   struct pipe_blend_state pipe;
-   SWR_BLEND_STATE blendState;
-   RENDER_TARGET_BLEND_COMPILE_STATE compileState[PIPE_MAX_COLOR_BUFS];
-};
-
-struct swr_poly_stipple {
-   struct pipe_poly_stipple pipe;
-   bool prim_is_poly;
-};
-
-/*
- * Derived SWR API DrawState
- * For convenience of making simple changes without re-deriving state.
- */
-struct swr_derived_state {
-   SWR_RASTSTATE rastState;
-   SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS];
-   SWR_VIEWPORT_MATRICES vpm;
-};
-
-void swr_update_derived(struct pipe_context *,
-                        const struct pipe_draw_info * = nullptr,
-                        const struct pipe_draw_start_count_bias *draw = nullptr);
-
-/*
- * Conversion functions: Convert mesa state defines to SWR.
- */
-
-static INLINE SWR_LOGIC_OP
-swr_convert_logic_op(const UINT op)
-{
-   switch (op) {
-   case PIPE_LOGICOP_CLEAR:
-      return LOGICOP_CLEAR;
-   case PIPE_LOGICOP_NOR:
-      return LOGICOP_NOR;
-   case PIPE_LOGICOP_AND_INVERTED:
-      return LOGICOP_AND_INVERTED;
-   case PIPE_LOGICOP_COPY_INVERTED:
-      return LOGICOP_COPY_INVERTED;
-   case PIPE_LOGICOP_AND_REVERSE:
-      return LOGICOP_AND_REVERSE;
-   case PIPE_LOGICOP_INVERT:
-      return LOGICOP_INVERT;
-   case PIPE_LOGICOP_XOR:
-      return LOGICOP_XOR;
-   case PIPE_LOGICOP_NAND:
-      return LOGICOP_NAND;
-   case PIPE_LOGICOP_AND:
-      return LOGICOP_AND;
-   case PIPE_LOGICOP_EQUIV:
-      return LOGICOP_EQUIV;
-   case PIPE_LOGICOP_NOOP:
-      return LOGICOP_NOOP;
-   case PIPE_LOGICOP_OR_INVERTED:
-      return LOGICOP_OR_INVERTED;
-   case PIPE_LOGICOP_COPY:
-      return LOGICOP_COPY;
-   case PIPE_LOGICOP_OR_REVERSE:
-      return LOGICOP_OR_REVERSE;
-   case PIPE_LOGICOP_OR:
-      return LOGICOP_OR;
-   case PIPE_LOGICOP_SET:
-      return LOGICOP_SET;
-   default:
-      assert(0 && "Unsupported logic op");
-      return LOGICOP_NOOP;
-   }
-}
-
-static INLINE SWR_STENCILOP
-swr_convert_stencil_op(const UINT op)
-{
-   switch (op) {
-   case PIPE_STENCIL_OP_KEEP:
-      return STENCILOP_KEEP;
-   case PIPE_STENCIL_OP_ZERO:
-      return STENCILOP_ZERO;
-   case PIPE_STENCIL_OP_REPLACE:
-      return STENCILOP_REPLACE;
-   case PIPE_STENCIL_OP_INCR:
-      return STENCILOP_INCRSAT;
-   case PIPE_STENCIL_OP_DECR:
-      return STENCILOP_DECRSAT;
-   case PIPE_STENCIL_OP_INCR_WRAP:
-      return STENCILOP_INCR;
-   case PIPE_STENCIL_OP_DECR_WRAP:
-      return STENCILOP_DECR;
-   case PIPE_STENCIL_OP_INVERT:
-      return STENCILOP_INVERT;
-   default:
-      assert(0 && "Unsupported stencil op");
-      return STENCILOP_KEEP;
-   }
-}
-
-static INLINE SWR_FORMAT
-swr_convert_index_type(const UINT index_size)
-{
-   switch (index_size) {
-   case sizeof(unsigned char):
-      return R8_UINT;
-   case sizeof(unsigned short):
-      return R16_UINT;
-   case sizeof(unsigned int):
-      return R32_UINT;
-   default:
-      assert(0 && "Unsupported index type");
-      return R32_UINT;
-   }
-}
-
-
-static INLINE SWR_ZFUNCTION
-swr_convert_depth_func(const UINT pipe_func)
-{
-   switch (pipe_func) {
-   case PIPE_FUNC_NEVER:
-      return ZFUNC_NEVER;
-   case PIPE_FUNC_LESS:
-      return ZFUNC_LT;
-   case PIPE_FUNC_EQUAL:
-      return ZFUNC_EQ;
-   case PIPE_FUNC_LEQUAL:
-      return ZFUNC_LE;
-   case PIPE_FUNC_GREATER:
-      return ZFUNC_GT;
-   case PIPE_FUNC_NOTEQUAL:
-      return ZFUNC_NE;
-   case PIPE_FUNC_GEQUAL:
-      return ZFUNC_GE;
-   case PIPE_FUNC_ALWAYS:
-      return ZFUNC_ALWAYS;
-   default:
-      assert(0 && "Unsupported depth func");
-      return ZFUNC_ALWAYS;
-   }
-}
-
-
-static INLINE SWR_CULLMODE
-swr_convert_cull_mode(const UINT cull_face)
-{
-   switch (cull_face) {
-   case PIPE_FACE_NONE:
-      return SWR_CULLMODE_NONE;
-   case PIPE_FACE_FRONT:
-      return SWR_CULLMODE_FRONT;
-   case PIPE_FACE_BACK:
-      return SWR_CULLMODE_BACK;
-   case PIPE_FACE_FRONT_AND_BACK:
-      return SWR_CULLMODE_BOTH;
-   default:
-      assert(0 && "Invalid cull mode");
-      return SWR_CULLMODE_NONE;
-   }
-}
-
-static INLINE SWR_BLEND_OP
-swr_convert_blend_func(const UINT blend_func)
-{
-   switch (blend_func) {
-   case PIPE_BLEND_ADD:
-      return BLENDOP_ADD;
-   case PIPE_BLEND_SUBTRACT:
-      return BLENDOP_SUBTRACT;
-   case PIPE_BLEND_REVERSE_SUBTRACT:
-      return BLENDOP_REVSUBTRACT;
-   case PIPE_BLEND_MIN:
-      return BLENDOP_MIN;
-   case PIPE_BLEND_MAX:
-      return BLENDOP_MAX;
-   default:
-      assert(0 && "Invalid blend func");
-      return BLENDOP_ADD;
-   }
-}
-
-static INLINE SWR_BLEND_FACTOR
-swr_convert_blend_factor(const UINT blend_factor)
-{
-   switch (blend_factor) {
-   case PIPE_BLENDFACTOR_ONE:
-      return BLENDFACTOR_ONE;
-   case PIPE_BLENDFACTOR_SRC_COLOR:
-      return BLENDFACTOR_SRC_COLOR;
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
-      return BLENDFACTOR_SRC_ALPHA;
-   case PIPE_BLENDFACTOR_DST_ALPHA:
-      return BLENDFACTOR_DST_ALPHA;
-   case PIPE_BLENDFACTOR_DST_COLOR:
-      return BLENDFACTOR_DST_COLOR;
-   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-      return BLENDFACTOR_SRC_ALPHA_SATURATE;
-   case PIPE_BLENDFACTOR_CONST_COLOR:
-      return BLENDFACTOR_CONST_COLOR;
-   case PIPE_BLENDFACTOR_CONST_ALPHA:
-      return BLENDFACTOR_CONST_ALPHA;
-   case PIPE_BLENDFACTOR_SRC1_COLOR:
-      return BLENDFACTOR_SRC1_COLOR;
-   case PIPE_BLENDFACTOR_SRC1_ALPHA:
-      return BLENDFACTOR_SRC1_ALPHA;
-   case PIPE_BLENDFACTOR_ZERO:
-      return BLENDFACTOR_ZERO;
-   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-      return BLENDFACTOR_INV_SRC_COLOR;
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      return BLENDFACTOR_INV_SRC_ALPHA;
-   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-      return BLENDFACTOR_INV_DST_ALPHA;
-   case PIPE_BLENDFACTOR_INV_DST_COLOR:
-      return BLENDFACTOR_INV_DST_COLOR;
-   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-      return BLENDFACTOR_INV_CONST_COLOR;
-   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      return BLENDFACTOR_INV_CONST_ALPHA;
-   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-      return BLENDFACTOR_INV_SRC1_COLOR;
-   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-      return BLENDFACTOR_INV_SRC1_ALPHA;
-   default:
-      assert(0 && "Invalid blend factor");
-      return BLENDFACTOR_ONE;
-   }
-}
-
-static INLINE enum SWR_SURFACE_TYPE
-swr_convert_target_type(const enum pipe_texture_target target)
-{
-   switch (target) {
-   case PIPE_BUFFER:
-      return SURFACE_BUFFER;
-   case PIPE_TEXTURE_1D:
-   case PIPE_TEXTURE_1D_ARRAY:
-      return SURFACE_1D;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_2D_ARRAY:
-   case PIPE_TEXTURE_RECT:
-      return SURFACE_2D;
-   case PIPE_TEXTURE_3D:
-      return SURFACE_3D;
-   case PIPE_TEXTURE_CUBE:
-   case PIPE_TEXTURE_CUBE_ARRAY:
-      return SURFACE_CUBE;
-   default:
-      assert(0);
-      return SURFACE_NULL;
-   }
-}
-
-/*
- * Convert mesa PIPE_PRIM_X to SWR enum PRIMITIVE_TOPOLOGY
- */
-static INLINE enum PRIMITIVE_TOPOLOGY
-swr_convert_prim_topology(const unsigned mode, const unsigned tcs_verts)
-{
-   switch (mode) {
-   case PIPE_PRIM_POINTS:
-      return TOP_POINT_LIST;
-   case PIPE_PRIM_LINES:
-      return TOP_LINE_LIST;
-   case PIPE_PRIM_LINE_LOOP:
-      return TOP_LINE_LOOP;
-   case PIPE_PRIM_LINE_STRIP:
-      return TOP_LINE_STRIP;
-   case PIPE_PRIM_TRIANGLES:
-      return TOP_TRIANGLE_LIST;
-   case PIPE_PRIM_TRIANGLE_STRIP:
-      return TOP_TRIANGLE_STRIP;
-   case PIPE_PRIM_TRIANGLE_FAN:
-      return TOP_TRIANGLE_FAN;
-   case PIPE_PRIM_QUADS:
-      return TOP_QUAD_LIST;
-   case PIPE_PRIM_QUAD_STRIP:
-      return TOP_QUAD_STRIP;
-   case PIPE_PRIM_POLYGON:
-      return TOP_TRIANGLE_FAN; /* XXX TOP_POLYGON; */
-   case PIPE_PRIM_LINES_ADJACENCY:
-      return TOP_LINE_LIST_ADJ;
-   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
-      return TOP_LISTSTRIP_ADJ;
-   case PIPE_PRIM_TRIANGLES_ADJACENCY:
-      return TOP_TRI_LIST_ADJ;
-   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
-      return TOP_TRI_STRIP_ADJ;
-   case PIPE_PRIM_PATCHES:
-      // rasterizer has a separate type for each possible number of patch vertices
-      return (PRIMITIVE_TOPOLOGY)((unsigned)TOP_PATCHLIST_BASE + tcs_verts);
-   default:
-      assert(0 && "Unknown topology");
-      return TOP_UNKNOWN;
-   }
-};
-
-/*
- * convert mesa PIPE_POLYGON_MODE_X to SWR enum SWR_FILLMODE
- */
-static INLINE enum SWR_FILLMODE
-swr_convert_fill_mode(const unsigned mode)
-{
-   switch(mode) {
-   case PIPE_POLYGON_MODE_FILL:
-      return SWR_FILLMODE_SOLID;
-   case PIPE_POLYGON_MODE_LINE:
-      return SWR_FILLMODE_WIREFRAME;
-   case PIPE_POLYGON_MODE_POINT:
-      return SWR_FILLMODE_POINT;
-   default:
-      assert(0 && "Unknown fillmode");
-      return SWR_FILLMODE_SOLID; // at least do something sensible
-   }
-}
-
-
-#endif
diff --git a/src/gallium/drivers/swr/swr_tex_sample.cpp b/src/gallium/drivers/swr/swr_tex_sample.cpp
deleted file mode 100644
index 1cf00b29249..00000000000
--- a/src/gallium/drivers/swr/swr_tex_sample.cpp
+++ /dev/null
@@ -1,376 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * Largely a copy of llvmpipe's lp_tex_sample.c
- */
-
-/**
- * Texture sampling code generation
- *
- * This file is nothing more than ugly glue between three largely independent
- * entities:
- * - TGSI -> LLVM translation (i.e., lp_build_tgsi_soa)
- * - texture sampling code generation (i.e., lp_build_sample_soa)
- * - SWR driver
- *
- * All interesting code is in the functions mentioned above. There is really
- * nothing to see here.
- *
- * @author Jose Fonseca <jfonseca@vmware.com>
- */
-
-#include "state.h"
-#include "JitManager.h"
-#include "gen_state_llvm.h"
-
-#include "pipe/p_defines.h"
-#include "pipe/p_shader_tokens.h"
-#include "gallivm/lp_bld_debug.h"
-#include "gallivm/lp_bld_const.h"
-#include "gallivm/lp_bld_type.h"
-#include "gallivm/lp_bld_sample.h"
-#include "gallivm/lp_bld_tgsi.h"
-#include "util/u_memory.h"
-
-#include "swr_tex_sample.h"
-#include "gen_surf_state_llvm.h"
-#include "gen_swr_context_llvm.h"
-
-using namespace SwrJit;
-
-/**
- * This provides the bridge between the sampler state store in
- * lp_jit_context and lp_jit_texture and the sampler code
- * generator. It provides the texture layout information required by
- * the texture sampler code generator in terms of the state stored in
- * lp_jit_context and lp_jit_texture in runtime.
- */
-struct swr_sampler_dynamic_state {
-   struct lp_sampler_dynamic_state base;
-
-   const struct swr_sampler_static_state *static_state;
-
-   enum pipe_shader_type shader_type;
-};
-
-
-/**
- * This is the bridge between our sampler and the TGSI translator.
- */
-struct swr_sampler_soa {
-   struct lp_build_sampler_soa base;
-
-   struct swr_sampler_dynamic_state dynamic_state;
-};
-
-
-/**
- * Fetch the specified member of the lp_jit_texture structure.
- * \param emit_load  if TRUE, emit the LLVM load instruction to actually
- *                   fetch the field's value.  Otherwise, just emit the
- *                   GEP code to address the field.
- *
- * @sa http://llvm.org/docs/GetElementPtr.html
- */
-static LLVMValueRef
-swr_texture_member(const struct lp_sampler_dynamic_state *base,
-                   struct gallivm_state *gallivm,
-                   LLVMValueRef context_ptr,
-                   unsigned texture_unit,
-                   unsigned member_index,
-                   const char *member_name,
-                   boolean emit_load)
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef indices[4];
-   LLVMValueRef ptr;
-   LLVMValueRef res;
-
-   assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS);
-
-   /* context[0] */
-   indices[0] = lp_build_const_int32(gallivm, 0);
-   /* context[0].textures */
-   auto dynamic = (const struct swr_sampler_dynamic_state *)base;
-   switch (dynamic->shader_type) {
-   case PIPE_SHADER_FRAGMENT:
-      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesFS);
-      break;
-   case PIPE_SHADER_VERTEX:
-      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesVS);
-      break;
-   case PIPE_SHADER_GEOMETRY:
-      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesGS);
-      break;
-   case PIPE_SHADER_TESS_CTRL:
-      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesTCS);
-      break;
-   case PIPE_SHADER_TESS_EVAL:
-      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesTES);
-      break;
-   default:
-      assert(0 && "unsupported shader type");
-      break;
-   }
-   /* context[0].textures[unit] */
-   indices[2] = lp_build_const_int32(gallivm, texture_unit);
-   /* context[0].textures[unit].member */
-   indices[3] = lp_build_const_int32(gallivm, member_index);
-
-   ptr = LLVMBuildGEP(builder, context_ptr, indices, ARRAY_SIZE(indices), "");
-
-   if (emit_load)
-      res = LLVMBuildLoad(builder, ptr, "");
-   else
-      res = ptr;
-
-   lp_build_name(res, "context.texture%u.%s", texture_unit, member_name);
-
-   return res;
-}
-
-
-/**
- * Helper macro to instantiate the functions that generate the code to
- * fetch the members of lp_jit_texture to fulfill the sampler code
- * generator requests.
- *
- * This complexity is the price we have to pay to keep the texture
- * sampler code generator a reusable module without dependencies to
- * swr internals.
- */
-#define SWR_TEXTURE_MEMBER(_name, _emit_load)                                \
-   static LLVMValueRef swr_texture_##_name(                                  \
-      const struct lp_sampler_dynamic_state *base,                           \
-      struct gallivm_state *gallivm,                                         \
-      LLVMValueRef context_ptr,                                              \
-      unsigned texture_unit,                                                 \
-      LLVMValueRef texture_unit_offset)                                      \
-   {                                                                         \
-      return swr_texture_member(base,                                        \
-                                gallivm,                                     \
-                                context_ptr,                                 \
-                                texture_unit,                                \
-                                swr_jit_texture_##_name,                     \
-                                #_name,                                      \
-                                _emit_load);                                 \
-   }
-
-
-SWR_TEXTURE_MEMBER(width, TRUE)
-SWR_TEXTURE_MEMBER(height, TRUE)
-SWR_TEXTURE_MEMBER(depth, TRUE)
-SWR_TEXTURE_MEMBER(first_level, TRUE)
-SWR_TEXTURE_MEMBER(last_level, TRUE)
-SWR_TEXTURE_MEMBER(base_ptr, TRUE)
-SWR_TEXTURE_MEMBER(num_samples, TRUE)
-SWR_TEXTURE_MEMBER(sample_stride, TRUE)
-SWR_TEXTURE_MEMBER(row_stride, FALSE)
-SWR_TEXTURE_MEMBER(img_stride, FALSE)
-SWR_TEXTURE_MEMBER(mip_offsets, FALSE)
-
-
-/**
- * Fetch the specified member of the lp_jit_sampler structure.
- * \param emit_load  if TRUE, emit the LLVM load instruction to actually
- *                   fetch the field's value.  Otherwise, just emit the
- *                   GEP code to address the field.
- *
- * @sa http://llvm.org/docs/GetElementPtr.html
- */
-static LLVMValueRef
-swr_sampler_member(const struct lp_sampler_dynamic_state *base,
-                   struct gallivm_state *gallivm,
-                   LLVMValueRef context_ptr,
-                   unsigned sampler_unit,
-                   unsigned member_index,
-                   const char *member_name,
-                   boolean emit_load)
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef indices[4];
-   LLVMValueRef ptr;
-   LLVMValueRef res;
-
-   assert(sampler_unit < PIPE_MAX_SAMPLERS);
-
-   /* context[0] */
-   indices[0] = lp_build_const_int32(gallivm, 0);
-   /* context[0].samplers */
-   auto dynamic = (const struct swr_sampler_dynamic_state *)base;
-   switch (dynamic->shader_type) {
-   case PIPE_SHADER_FRAGMENT:
-      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersFS);
-      break;
-   case PIPE_SHADER_VERTEX:
-      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersVS);
-      break;
-   case PIPE_SHADER_GEOMETRY:
-      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersGS);
-      break;
-   case PIPE_SHADER_TESS_CTRL:
-      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersTCS);
-      break;
-   case PIPE_SHADER_TESS_EVAL:
-      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersTES);
-      break;
-   default:
-      assert(0 && "unsupported shader type");
-      break;
-   }
-   /* context[0].samplers[unit] */
-   indices[2] = lp_build_const_int32(gallivm, sampler_unit);
-   /* context[0].samplers[unit].member */
-   indices[3] = lp_build_const_int32(gallivm, member_index);
-
-   ptr = LLVMBuildGEP(builder, context_ptr, indices, ARRAY_SIZE(indices), "");
-
-   if (emit_load)
-      res = LLVMBuildLoad(builder, ptr, "");
-   else
-      res = ptr;
-
-   lp_build_name(res, "context.sampler%u.%s", sampler_unit, member_name);
-
-   return res;
-}
-
-
-#define SWR_SAMPLER_MEMBER(_name, _emit_load)                                \
-   static LLVMValueRef swr_sampler_##_name(                                  \
-      const struct lp_sampler_dynamic_state *base,                           \
-      struct gallivm_state *gallivm,                                         \
-      LLVMValueRef context_ptr,                                              \
-      unsigned sampler_unit)                                                 \
-   {                                                                         \
-      return swr_sampler_member(base,                                        \
-                                gallivm,                                     \
-                                context_ptr,                                 \
-                                sampler_unit,                                \
-                                swr_jit_sampler_##_name,                     \
-                                #_name,                                      \
-                                _emit_load);                                 \
-   }
-
-
-SWR_SAMPLER_MEMBER(min_lod, TRUE)
-SWR_SAMPLER_MEMBER(max_lod, TRUE)
-SWR_SAMPLER_MEMBER(lod_bias, TRUE)
-SWR_SAMPLER_MEMBER(border_color, FALSE)
-
-
-static void
-swr_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
-{
-   FREE(sampler);
-}
-
-
-/**
- * Fetch filtered values from texture.
- * The 'texel' parameter returns four vectors corresponding to R, G, B, A.
- */
-static void
-swr_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
-                                 struct gallivm_state *gallivm,
-                                 const struct lp_sampler_params *params)
-{
-   struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base;
-   unsigned texture_index = params->texture_index;
-   unsigned sampler_index = params->sampler_index;
-
-   assert(sampler_index < PIPE_MAX_SAMPLERS);
-   assert(texture_index < PIPE_MAX_SHADER_SAMPLER_VIEWS);
-
-#if 0
-      lp_build_sample_nop(gallivm, params->type, params->coords, params->texel);
-#else
-   lp_build_sample_soa(
-      &sampler->dynamic_state.static_state[texture_index].texture_state,
-      &sampler->dynamic_state.static_state[sampler_index].sampler_state,
-      &sampler->dynamic_state.base,
-      gallivm,
-      params);
-#endif
-}
-
-/**
- * Fetch the texture size.
- */
-static void
-swr_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
-                                struct gallivm_state *gallivm,
-                                const struct lp_sampler_size_query_params *params)
-{
-   struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base;
-
-   assert(params->texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS);
-
-   lp_build_size_query_soa(
-      gallivm,
-      &sampler->dynamic_state.static_state[params->texture_unit].texture_state,
-      &sampler->dynamic_state.base,
-      params);
-}
-
-
-struct lp_build_sampler_soa *
-swr_sampler_soa_create(const struct swr_sampler_static_state *static_state,
-                       enum pipe_shader_type shader_type)
-{
-   struct swr_sampler_soa *sampler;
-
-   sampler = CALLOC_STRUCT(swr_sampler_soa);
-   if (!sampler)
-      return NULL;
-
-   sampler->base.destroy = swr_sampler_soa_destroy;
-   sampler->base.emit_tex_sample = swr_sampler_soa_emit_fetch_texel;
-   sampler->base.emit_size_query = swr_sampler_soa_emit_size_query;
-   sampler->dynamic_state.base.width = swr_texture_width;
-   sampler->dynamic_state.base.height = swr_texture_height;
-   sampler->dynamic_state.base.depth = swr_texture_depth;
-   sampler->dynamic_state.base.first_level = swr_texture_first_level;
-   sampler->dynamic_state.base.last_level = swr_texture_last_level;
-   sampler->dynamic_state.base.base_ptr = swr_texture_base_ptr;
-   sampler->dynamic_state.base.row_stride = swr_texture_row_stride;
-   sampler->dynamic_state.base.img_stride = swr_texture_img_stride;
-   sampler->dynamic_state.base.mip_offsets = swr_texture_mip_offsets;
-   sampler->dynamic_state.base.num_samples = swr_texture_num_samples;
-   sampler->dynamic_state.base.sample_stride = swr_texture_sample_stride;
-   sampler->dynamic_state.base.min_lod = swr_sampler_min_lod;
-   sampler->dynamic_state.base.max_lod = swr_sampler_max_lod;
-   sampler->dynamic_state.base.lod_bias = swr_sampler_lod_bias;
-   sampler->dynamic_state.base.border_color = swr_sampler_border_color;
-
-   sampler->dynamic_state.static_state = static_state;
-
-   sampler->dynamic_state.shader_type = shader_type;
-
-   return &sampler->base;
-}
diff --git a/src/gallium/drivers/swr/swr_tex_sample.h b/src/gallium/drivers/swr/swr_tex_sample.h
deleted file mode 100644
index 715ca3c3e19..00000000000
--- a/src/gallium/drivers/swr/swr_tex_sample.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#pragma once
-
-#include "gallivm/lp_bld.h"
-
-struct swr_sampler_static_state {
-   /*
-    * These attributes are effectively interleaved for more sane key handling.
-    * However, there might be lots of null space if the amount of samplers and
-    * textures isn't the same.
-    */
-   struct lp_static_sampler_state sampler_state;
-   struct lp_static_texture_state texture_state;
-};
-
-/**
- * Pure-LLVM texture sampling code generator.
- *
- */
-struct lp_build_sampler_soa *
-swr_sampler_soa_create(const struct swr_sampler_static_state *key,
-                       enum pipe_shader_type shader_type);
author	Jan Zielinski <jan.zielinski@intel.com>	2021-06-09 13:19:44 +0200
committer	Marge Bot <emma+marge@anholt.net>	2021-12-06 23:37:50 +0000
commit	855793c6c6bd372ea96681ecbd3f318ad71da223 (patch)
tree	cbd8efc0c9df58d3bdc2ba774cf46dcdcad21162
parent	d22d328859e4a67e6ff738fbd22eaf1d5a09376a (diff)