diff options
Diffstat (limited to 'src/gallium/drivers/swr/rasterizer/core/backend_impl.h')
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/backend_impl.h | 1300 |
1 files changed, 0 insertions, 1300 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h deleted file mode 100644 index 868419c3e4f..00000000000 --- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h +++ /dev/null @@ -1,1300 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * @file backend.h - * - * @brief Backend handles rasterization, pixel shading and output merger - * operations. - * - ******************************************************************************/ -#pragma once - -#include "tilemgr.h" -#include "state.h" -#include "context.h" - - -void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]); -void InitBackendSampleFuncTable( - PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]); - -static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, - SWR_PS_CONTEXT& psContext); - - -enum SWR_BACKEND_FUNCS -{ - SWR_BACKEND_SINGLE_SAMPLE, - SWR_BACKEND_MSAA_PIXEL_RATE, - SWR_BACKEND_MSAA_SAMPLE_RATE, - SWR_BACKEND_FUNCS_MAX, -}; - -#if KNOB_SIMD_WIDTH == 8 -static const __m256 vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5}; -static const __m256 vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5}; -static const __m256 vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; -static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; -#define MASK 0xff -#endif - -static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, - float* pUserClipBuffer, - simdscalar const& vI, - simdscalar const& vJ) -{ - simdscalar vClipMask = _simd_setzero_ps(); - uint32_t numClipDistance = _mm_popcnt_u32(clipMask); - - for (uint32_t i = 0; i < numClipDistance; ++i) - { - // pull triangle clip distance values from clip buffer - simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++); - simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++); - simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++); - - // interpolate - simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ); - - // clip if interpolated clip distance is < 0 || NAN - simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ); - - vClipMask = _simd_or_ps(vClipMask, vCull); - } - - return _simd_movemask_ps(vClipMask); -} - -INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) -{ - static const uint32_t RasterTileColorOffsets[16]{ - 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * - 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * - 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * - 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * - 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * - 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * - 15, - }; - assert(sampleNum < 16); - return RasterTileColorOffsets[sampleNum]; -} - -INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) -{ - static const uint32_t RasterTileDepthOffsets[16]{ - 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * - 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * - 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * - 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * - 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * - 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * - 15, - }; - assert(sampleNum < 16); - return RasterTileDepthOffsets[sampleNum]; -} - -INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) -{ - static const uint32_t RasterTileStencilOffsets[16]{ - 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * - 15, - }; - assert(sampleNum < 16); - return RasterTileStencilOffsets[sampleNum]; -} - -template <typename T, uint32_t InputCoverage> -struct generateInputCoverage -{ - INLINE generateInputCoverage(const uint64_t* const coverageMask, - uint32_t (&inputMask)[KNOB_SIMD_WIDTH], - const uint32_t sampleMask) - { - // will need to update for avx512 - assert(KNOB_SIMD_WIDTH == 8); - - simdscalari mask[2]; - simdscalari sampleCoverage[2]; - - if (T::bIsCenterPattern) - { - // center coverage is the same for all samples; just broadcast to the sample slots - uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); - if (T::MultisampleT::numSamples == 1) - { - sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); - } - else if (T::MultisampleT::numSamples == 2) - { - sampleCoverage[0] = - _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); - } - else if (T::MultisampleT::numSamples == 4) - { - sampleCoverage[0] = _simd_set_epi32( - 0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); - } - else if (T::MultisampleT::numSamples == 8) - { - sampleCoverage[0] = _simd_set1_epi32(centerCoverage); - } - else if (T::MultisampleT::numSamples == 16) - { - sampleCoverage[0] = _simd_set1_epi32(centerCoverage); - sampleCoverage[1] = _simd_set1_epi32(centerCoverage); - } - } - else - { - simdscalari src = _simd_set1_epi32(0); - simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; - - if (T::MultisampleT::numSamples == 1) - { - mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); - } - else if (T::MultisampleT::numSamples == 2) - { - mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - } - else if (T::MultisampleT::numSamples == 4) - { - mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); - } - else if (T::MultisampleT::numSamples == 8) - { - mask[0] = _simd_set1_epi32(-1); - } - else if (T::MultisampleT::numSamples == 16) - { - mask[0] = _simd_set1_epi32(-1); - mask[1] = _simd_set1_epi32(-1); - index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); - } - - // gather coverage for samples 0-7 - sampleCoverage[0] = - _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), - (const float*)coverageMask, - index0, - _mm256_castsi256_ps(mask[0]), - 8)); - if (T::MultisampleT::numSamples > 8) - { - // gather coverage for samples 8-15 - sampleCoverage[1] = - _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), - (const float*)coverageMask, - index1, - _mm256_castsi256_ps(mask[1]), - 8)); - } - } - - mask[0] = _mm256_set_epi8(-1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - 0xC, - 0x8, - 0x4, - 0x0, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - 0xC, - 0x8, - 0x4, - 0x0); - // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane - simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); - - simdscalari packedCoverage1; - if (T::MultisampleT::numSamples > 8) - { - // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit - // lane - packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); - } - -#if (KNOB_ARCH == KNOB_ARCH_AVX) - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); - simdscalar shufRes = _mm256_shuffle_ps( - _mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - packedCoverage0 = _mm256_castps_si256( - _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); - - simdscalari packedSampleCoverage; - if (T::MultisampleT::numSamples > 8) - { - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); - shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), - _mm256_castsi256_ps(hiToLow), - _MM_SHUFFLE(1, 1, 0, 1)); - shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); - packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps( - _mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); - packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps( - _mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); - } - else - { - packedSampleCoverage = packedCoverage0; - } -#else - simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); - - simdscalari packedSampleCoverage; - if (T::MultisampleT::numSamples > 8) - { - permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); - - // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane - packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); - } - else - { - packedSampleCoverage = packedCoverage0; - } -#endif - - for (int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) - { - // convert packed sample coverage masks into single coverage masks for all samples for - // each pixel in the 4x2 - inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); - - if (!T::bForcedSampleCount) - { - // input coverage has to be anded with sample mask if MSAA isn't forced on - inputMask[i] &= sampleMask; - } - - // shift to the next pixel in the 4x2 - packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); - } - } - - INLINE generateInputCoverage(const uint64_t* const coverageMask, - simdscalar& inputCoverage, - const uint32_t sampleMask) - { - uint32_t inputMask[KNOB_SIMD_WIDTH]; - generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask); - inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], - inputMask[6], - inputMask[5], - inputMask[4], - inputMask[3], - inputMask[2], - inputMask[1], - inputMask[0])); - } -}; - -template <typename T> -struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE> -{ - INLINE generateInputCoverage(const uint64_t* const coverageMask, - simdscalar& inputCoverage, - const uint32_t sampleMask) - { - // will need to update for avx512 - assert(KNOB_SIMD_WIDTH == 8); - simdscalari vec = _simd_set1_epi32(coverageMask[0]); - const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - vec = _simd_and_si(vec, bit); - vec = _simd_cmplt_epi32(_simd_setzero_si(), vec); - vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec); - inputCoverage = _simd_castsi_ps(vec); - } - - INLINE generateInputCoverage(const uint64_t* const coverageMask, - uint32_t (&inputMask)[KNOB_SIMD_WIDTH], - const uint32_t sampleMask) - { - uint32_t simdCoverage = (coverageMask[0] & MASK); - static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1; - for (int i = 0; i < KNOB_SIMD_WIDTH; i++) - { - // set all samples to covered if conservative coverage mask is set for that pixel - inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0; - } - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Centroid behaves exactly as follows : -// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center -// (even if the sample pattern does not happen to -// have a sample location there). -// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample -// index, where sample coverage is after ANDing the -// coverage with the SampleMask Rasterizer State. -// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to -// fill out 2x2 pixel stamps, the attribute is -// evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the -// pixel, then the first sample covered by the SampleMask Rasterizer State is the evaluation -// point.Otherwise (full SampleMask), the pixel center is the evaluation point. -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template <typename T> -INLINE void CalcCentroidPos(SWR_PS_CONTEXT& psContext, - const SWR_MULTISAMPLE_POS& samplePos, - const uint64_t* const coverageMask, - const uint32_t sampleMask, - simdscalar const& vXSamplePosUL, - simdscalar const& vYSamplePosUL) -{ - uint32_t inputMask[KNOB_SIMD_WIDTH]; - generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask); - - // Case (2) - partially covered pixel - - // scan for first covered sample per pixel in the 4x2 span - unsigned long sampleNum[KNOB_SIMD_WIDTH]; - (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0); - (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0); - (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0); - (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0); - (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0); - (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0); - (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0); - (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0); - - // look up and set the sample offsets from UL pixel corner for first covered sample - simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]), - samplePos.X(sampleNum[6]), - samplePos.X(sampleNum[5]), - samplePos.X(sampleNum[4]), - samplePos.X(sampleNum[3]), - samplePos.X(sampleNum[2]), - samplePos.X(sampleNum[1]), - samplePos.X(sampleNum[0])); - - simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]), - samplePos.Y(sampleNum[6]), - samplePos.Y(sampleNum[5]), - samplePos.Y(sampleNum[4]), - samplePos.Y(sampleNum[3]), - samplePos.Y(sampleNum[2]), - samplePos.Y(sampleNum[1]), - samplePos.Y(sampleNum[0])); - // add sample offset to UL pixel corner - vXSample = _simd_add_ps(vXSamplePosUL, vXSample); - vYSample = _simd_add_ps(vYSamplePosUL, vYSample); - - // Case (1) and case (3b) - All samples covered or not covered with full SampleMask - static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask(); - simdscalari vInputCoveragei = _simd_set_epi32(inputMask[7], - inputMask[6], - inputMask[5], - inputMask[4], - inputMask[3], - inputMask[2], - inputMask[1], - inputMask[0]); - simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask); - - static const simdscalari vZero = _simd_setzero_si(); - const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask); - simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero); - simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask); - simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask); - - simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b); - - // set the centroid position based on results from above - psContext.vX.centroid = - _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter)); - psContext.vY.centroid = - _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter)); - - // Case (3a) No samples covered and partial sample mask - simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask); - // sample mask should never be all 0's for this case, but handle it anyways - unsigned long firstCoveredSampleMaskSample = 0; - (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) - : (firstCoveredSampleMaskSample = 0); - - simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples); - - vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample)); - vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample)); - - // blend in case 3a pixel locations - psContext.vX.centroid = - _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a)); - psContext.vY.centroid = - _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a)); -} - -INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, - SWR_PS_CONTEXT& psContext, - const simdscalar& vXSamplePosUL, - const simdscalar& vYSamplePosUL) -{ - // evaluate I,J - psContext.vI.centroid = - vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid); - psContext.vJ.centroid = - vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid); - psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet); - psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet); - - // interpolate 1/w - psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, - coeffs.vBOneOverW, - coeffs.vCOneOverW, - psContext.vI.centroid, - psContext.vJ.centroid); -} - -INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const& z, float minz, float maxz) -{ - const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz)); - const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz)); - - return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask)); -} - -template <typename T> -INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount) -{ - // RT has to be single sample if we're in forcedMSAA mode - if (T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X)) - { - return 1; - } - // unless we're forced to single sample, in which case we run the OM at the sample count of the - // RT - else if (T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X)) - { - return GetNumSamples(blendSampleCount); - } - // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count - else - { - return T::MultisampleT::numSamples; - } -} - -inline void SetupBarycentricCoeffs(BarycentricCoeffs* coeffs, const SWR_TRIANGLE_DESC& work) -{ - // broadcast scalars - - coeffs->vIa = _simd_broadcast_ss(&work.I[0]); - coeffs->vIb = _simd_broadcast_ss(&work.I[1]); - coeffs->vIc = _simd_broadcast_ss(&work.I[2]); - - coeffs->vJa = _simd_broadcast_ss(&work.J[0]); - coeffs->vJb = _simd_broadcast_ss(&work.J[1]); - coeffs->vJc = _simd_broadcast_ss(&work.J[2]); - - coeffs->vZa = _simd_broadcast_ss(&work.Z[0]); - coeffs->vZb = _simd_broadcast_ss(&work.Z[1]); - coeffs->vZc = _simd_broadcast_ss(&work.Z[2]); - - coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet); - - coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); - coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); - coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); -} - -inline void SetupRenderBuffers(uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS], - uint8_t** pDepthBuffer, - uint8_t** pStencilBuffer, - uint32_t colorHotTileMask, - RenderOutputBuffers& renderBuffers) -{ - unsigned long index; - while (_BitScanForward(&index, colorHotTileMask)) - { - assert(index < SWR_NUM_RENDERTARGETS); - colorHotTileMask &= ~(1 << index); - pColorBuffer[index] = renderBuffers.pColor[index]; - } - - if (pDepthBuffer) - { - *pDepthBuffer = renderBuffers.pDepth; - } - - if (pStencilBuffer) - { - *pStencilBuffer = renderBuffers.pStencil; - ; - } -} - -INLINE void SetRenderHotTilesDirty(DRAW_CONTEXT* pDC, RenderOutputBuffers& renderBuffers) -{ - const API_STATE& state = GetApiState(pDC); - - unsigned long rtSlot = 0; - uint32_t colorHottileEnableMask = state.colorHottileEnable; - while (_BitScanForward(&rtSlot, colorHottileEnableMask)) - { - colorHottileEnableMask &= ~(1 << rtSlot); - renderBuffers.pColorHotTile[rtSlot]->state = HOTTILE_DIRTY; - } -} - -template <typename T> -void SetupPixelShaderContext(SWR_PS_CONTEXT* psContext, - const SWR_MULTISAMPLE_POS& samplePos, - SWR_TRIANGLE_DESC& work) -{ - psContext->pAttribs = work.pAttribs; - psContext->pPerspAttribs = work.pPerspAttribs; - psContext->frontFace = work.triFlags.frontFacing; - psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex; - psContext->viewportIndex = work.triFlags.viewportIndex; - - // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull - // attribs - psContext->I = work.I; - psContext->J = work.J; - - psContext->recipDet = work.recipDet; - psContext->pRecipW = work.pRecipW; - psContext->pSamplePosX = - samplePos.X(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosX); - psContext->pSamplePosY = - samplePos.Y(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosY); - psContext->rasterizerSampleCount = T::MultisampleT::numSamples; - psContext->sampleIndex = 0; -} - -template <typename T, bool IsSingleSample> -void CalcCentroid(SWR_PS_CONTEXT* psContext, - const SWR_MULTISAMPLE_POS& samplePos, - const BarycentricCoeffs& coeffs, - const uint64_t* const coverageMask, - uint32_t sampleMask) -{ - if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid - // positions are still different - { - // for 1x case, centroid is pixel center - psContext->vX.centroid = psContext->vX.center; - psContext->vY.centroid = psContext->vY.center; - psContext->vI.centroid = psContext->vI.center; - psContext->vJ.centroid = psContext->vJ.center; - psContext->vOneOverW.centroid = psContext->vOneOverW.center; - } - else - { - if (T::bCentroidPos) - { - ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid - if (T::bIsCenterPattern) - { - psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f)); - psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f)); - } - else - { - // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate - // coverage 2X'.. - CalcCentroidPos<T>(*psContext, - samplePos, - coverageMask, - sampleMask, - psContext->vX.UL, - psContext->vY.UL); - } - - CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL); - } - else - { - psContext->vX.centroid = psContext->vX.sample; - psContext->vY.centroid = psContext->vY.sample; - } - } -} - -template <typename T> -struct PixelRateZTestLoop -{ - PixelRateZTestLoop(DRAW_CONTEXT* DC, - uint32_t _workerId, - const SWR_TRIANGLE_DESC& Work, - const BarycentricCoeffs& Coeffs, - const API_STATE& apiState, - uint8_t*& depthBuffer, - uint8_t*& stencilBuffer, - const uint8_t ClipDistanceMask) : - pDC(DC), - workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState), - samplePos(state.rastState.samplePositions), clipDistanceMask(ClipDistanceMask), - pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){}; - - INLINE - uint32_t operator()(simdscalar& activeLanes, - SWR_PS_CONTEXT& psContext, - const CORE_BUCKETS BEDepthBucket, - uint32_t currentSimdIn8x8 = 0) - { - - uint32_t statCount = 0; - simdscalar anyDepthSamplePassed = _simd_setzero_ps(); - for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) - { - const uint8_t* pCoverageMask = (uint8_t*)&work.coverageMask[sample]; - vCoverageMask[sample] = - _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK)); - - if (!_simd_movemask_ps(vCoverageMask[sample])) - { - vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = - _simd_setzero_ps(); - continue; - } - - // offset depth/stencil buffers current sample - uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); - uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); - - if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) - { - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, - "Unsupported depth hot tile format"); - - const simdscalar z = _simd_load_ps(reinterpret_cast<const float*>(pDepthSample)); - - const float minz = state.depthBoundsState.depthBoundsTestMinValue; - const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; - - vCoverageMask[sample] = - _simd_and_ps(vCoverageMask[sample], - _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz))); - } - - RDTSC_BEGIN(psContext.pBucketManager, BEBarycentric, pDC->drawId); - - // calculate per sample positions - psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample)); - psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample)); - - // calc I & J per sample - CalcSampleBarycentrics(coeffs, psContext); - - if (psState.writesODepth) - { - { - // broadcast and test oDepth(psContext.vZ) written from the PS for each sample - vZ[sample] = psContext.vZ; - } - } - else - { - vZ[sample] = vplaneps( - coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); - vZ[sample] = state.pfnQuantizeDepth(vZ[sample]); - } - - RDTSC_END(psContext.pBucketManager, BEBarycentric, 0); - - ///@todo: perspective correct vs non-perspective correct clipping? - // if clip distances are enabled, we need to interpolate for each sample - if (clipDistanceMask) - { - uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, - work.pUserClipBuffer, - psContext.vI.sample, - psContext.vJ.sample); - - vCoverageMask[sample] = - _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask)); - } - - // ZTest for this sample - ///@todo Need to uncomment out this bucket. - // RDTSC_BEGIN(psContext.pBucketManager, BEDepthBucket, pDC->drawId); - depthPassMask[sample] = vCoverageMask[sample]; - stencilPassMask[sample] = vCoverageMask[sample]; - depthPassMask[sample] = DepthStencilTest(&state, - work.triFlags.frontFacing, - work.triFlags.viewportIndex, - vZ[sample], - pDepthSample, - vCoverageMask[sample], - pStencilSample, - &stencilPassMask[sample]); - // RDTSC_END(psContext.pBucketManager, BEDepthBucket, 0); - - // early-exit if no pixels passed depth or earlyZ is forced on - if (psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample])) - { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], - &state.depthStencilState, - work.triFlags.frontFacing, - vZ[sample], - pDepthSample, - depthPassMask[sample], - vCoverageMask[sample], - pStencilSample, - stencilPassMask[sample]); - - if (!_simd_movemask_ps(depthPassMask[sample])) - { - continue; - } - } - anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]); - uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]); - statCount += _mm_popcnt_u32(statMask); - } - - activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes); - // return number of samples that passed depth and coverage - return statCount; - } - - // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite - simdscalar vZ[T::MultisampleT::numCoverageSamples]; - simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples]; - simdscalar depthPassMask[T::MultisampleT::numCoverageSamples]; - simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples]; - -private: - // functor inputs - DRAW_CONTEXT* pDC; - uint32_t workerId; - - const SWR_TRIANGLE_DESC& work; - const BarycentricCoeffs& coeffs; - const API_STATE& state; - const SWR_PS_STATE& psState; - const SWR_MULTISAMPLE_POS& samplePos; - const uint8_t clipDistanceMask; - uint8_t*& pDepthBuffer; - uint8_t*& pStencilBuffer; -}; - -INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT& psContext) -{ - // evaluate I,J - psContext.vI.center = - vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center); - psContext.vJ.center = - vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center); - psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet); - psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet); - - // interpolate 1/w - psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, - coeffs.vBOneOverW, - coeffs.vCOneOverW, - psContext.vI.center, - psContext.vJ.center); -} - -static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, - SWR_PS_CONTEXT& psContext) -{ - // evaluate I,J - psContext.vI.sample = - vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample); - psContext.vJ.sample = - vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample); - psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet); - psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet); - - // interpolate 1/w - psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, - coeffs.vBOneOverW, - coeffs.vCOneOverW, - psContext.vI.sample, - psContext.vJ.sample); -} - -// Merge Output to 8x2 SIMD16 Tile Format -INLINE void OutputMerger8x2(DRAW_CONTEXT* pDC, - SWR_PS_CONTEXT& psContext, - uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], - uint32_t sample, - const SWR_BLEND_STATE* pBlendState, - const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], - simdscalar& coverageMask, - simdscalar const& depthPassMask, - uint32_t renderTargetMask, - bool useAlternateOffset, - uint32_t workerId) -{ - // type safety guaranteed from template instantiation in BEChooser<>::GetFunc - uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); - - if (useAlternateOffset) - { - rasterTileColorOffset += sizeof(simdscalar); - } - - simdvector blendSrc; - simdvector blendOut; - - unsigned long rt; - while (_BitScanForward(&rt, renderTargetMask)) - { - renderTargetMask &= ~(1 << rt); - - const SWR_RENDER_TARGET_BLEND_STATE* pRTBlend = &pBlendState->renderTarget[rt]; - - simdscalar* pColorSample; - bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || - !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue; - if (hotTileEnable) - { - pColorSample = reinterpret_cast<simdscalar*>(pColorBase[rt] + rasterTileColorOffset); - blendSrc[0] = pColorSample[0]; - blendSrc[1] = pColorSample[2]; - blendSrc[2] = pColorSample[4]; - blendSrc[3] = pColorSample[6]; - } - else - { - pColorSample = nullptr; - } - - SWR_BLEND_CONTEXT blendContext = {0}; - { - // pfnBlendFunc may not update all channels. Initialize with PS output. - /// TODO: move this into the blend JIT. - blendOut = psContext.shaded[rt]; - - blendContext.pBlendState = pBlendState; - blendContext.src = &psContext.shaded[rt]; - blendContext.src1 = &psContext.shaded[1]; - blendContext.src0alpha = reinterpret_cast<simdvector*>(&psContext.shaded[0].w); - blendContext.sampleNum = sample; - blendContext.pDst = &blendSrc; - blendContext.result = &blendOut; - blendContext.oMask = &psContext.oMask; - blendContext.pMask = reinterpret_cast<simdscalari*>(&coverageMask); - - // Blend outputs and update coverage mask for alpha test - if (pfnBlendFunc[rt] != nullptr) - { - pfnBlendFunc[rt](&blendContext); - } - } - - // Track alpha events - AR_EVENT( - AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended)); - - // final write mask - simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask)); - - ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. - static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, - "Unsupported hot tile format"); - - // store with color mask - if (!pRTBlend->writeDisableRed) - { - _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[0]), outputMask, blendOut.x); - } - if (!pRTBlend->writeDisableGreen) - { - _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[2]), outputMask, blendOut.y); - } - if (!pRTBlend->writeDisableBlue) - { - _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[4]), outputMask, blendOut.z); - } - if (!pRTBlend->writeDisableAlpha) - { - _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[6]), outputMask, blendOut.w); - } - } -} - -template <typename T> -void BackendPixelRate(DRAW_CONTEXT* pDC, - uint32_t workerId, - uint32_t x, - uint32_t y, - SWR_TRIANGLE_DESC& work, - RenderOutputBuffers& renderBuffers) -{ - ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the - /// backend - - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelRateBackend, pDC->drawId); - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId); - - const API_STATE& state = GetApiState(pDC); - - BarycentricCoeffs coeffs; - SetupBarycentricCoeffs(&coeffs, work); - - SWR_CONTEXT* pContext = pDC->pContext; - void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; - - SWR_PS_CONTEXT psContext; - const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; - SetupPixelShaderContext<T>(&psContext, samplePos, work); - - uint8_t *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(psContext.pColorBuffer, - &pDepthBuffer, - &pStencilBuffer, - state.colorHottileEnable, - renderBuffers); - - bool isTileDirty = false; - - RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0); - - PixelRateZTestLoop<T> PixelRateZTest(pDC, - workerId, - work, - coeffs, - state, - pDepthBuffer, - pStencilBuffer, - state.backendState.clipDistanceMask); - - psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); - psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); - - const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); - - for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) - { - psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); - psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); - - const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); - - for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) - { - const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); - - - simdscalar activeLanes; - if (!(work.anyCoveredSamples & MASK)) - { - goto Endtile; - }; - activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK); - - if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) - { - const uint64_t* pCoverageMask = - (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) - ? &work.innerCoverageMask - : &work.coverageMask[0]; - - generateInputCoverage<T, T::InputCoverage>( - pCoverageMask, psContext.inputMask, state.blendState.sampleMask); - } - - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId); - - CalcPixelBarycentrics(coeffs, psContext); - - CalcCentroid<T, false>( - &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); - - RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0); - - if (T::bForcedSampleCount) - { - // candidate pixels (that passed coverage) will cause shader invocation if any bits - // in the samplemask are set - const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32( - _simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si())); - activeLanes = _simd_and_ps(activeLanes, vSampleMask); - } - - // Early-Z? - if (T::bCanEarlyZ && !T::bForcedSampleCount) - { - uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest); - UPDATE_STAT_BE(DepthPassCount, depthPassCount); - AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); - } - - // if we have no covered samples that passed depth at this point, go to next tile - if (!_simd_movemask_ps(activeLanes)) - { - goto Endtile; - }; - - if (state.psState.usesSourceDepth) - { - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId); - // interpolate and quantize z - psContext.vZ = vplaneps( - coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); - psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); - RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0); - } - - // pixels that are currently active - psContext.activeMask = _simd_castps_si(activeLanes); - psContext.oMask = T::MultisampleT::FullSampleMask(); - - // execute pixel shader - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId); - state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext); - RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0); - - // update stats - UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes))); - AR_EVENT(PSStats((HANDLE)&psContext.stats)); - - // update active lanes to remove any discarded or oMask'd pixels - activeLanes = _simd_castsi_ps(_simd_and_si( - psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si()))); - if (!_simd_movemask_ps(activeLanes)) - { - goto Endtile; - }; - - isTileDirty = true; - - // late-Z - if (!T::bCanEarlyZ && !T::bForcedSampleCount) - { - uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest); - UPDATE_STAT_BE(DepthPassCount, depthPassCount); - AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); - } - - // if we have no covered samples that passed depth at this point, skip OM and go to next - // tile - if (!_simd_movemask_ps(activeLanes)) - { - goto Endtile; - }; - - // output merger - // loop over all samples, broadcasting the results of the PS to all passing pixels - for (uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); - sample++) - { - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId); - // center pattern does a single coverage/depth/stencil test, standard pattern tests - // all samples - uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample; - simdscalar coverageMask, depthMask; - if (T::bForcedSampleCount) - { - coverageMask = depthMask = activeLanes; - } - else - { - coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum]; - depthMask = PixelRateZTest.depthPassMask[coverageSampleNum]; - if (!_simd_movemask_ps(depthMask)) - { - // stencil should already have been written in early/lateZ tests - RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0); - continue; - } - } - - // broadcast the results of the PS to all passing pixels - - OutputMerger8x2(pDC, - psContext, - psContext.pColorBuffer, - sample, - &state.blendState, - state.pfnBlendFunc, - coverageMask, - depthMask, - state.psState.renderTargetMask, - useAlternateOffset, - workerId); - - - if (!state.psState.forceEarlyZ && !T::bForcedSampleCount) - { - uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); - uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); - - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], - &state.depthStencilState, - work.triFlags.frontFacing, - PixelRateZTest.vZ[coverageSampleNum], - pDepthSample, - depthMask, - coverageMask, - pStencilSample, - PixelRateZTest.stencilPassMask[coverageSampleNum]); - } - RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0); - } - Endtile: - RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId); - - for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) - { - work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - - if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) - { - work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - - if (useAlternateOffset) - { - unsigned long rt; - uint32_t rtMask = state.colorHottileEnable; - while (_BitScanForward(&rt, rtMask)) - { - rtMask &= ~(1 << rt); - psContext.pColorBuffer[rt] += - (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; - } - } - - pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; - pStencilBuffer += - (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; - - RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0); - - psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); - psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); - } - - psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); - psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); - } - - if (isTileDirty) - { - SetRenderHotTilesDirty(pDC, renderBuffers); - } - - RDTSC_END(pDC->pContext->pBucketMgr, BEPixelRateBackend, 0); -} - -template <uint32_t sampleCountT = SWR_MULTISAMPLE_1X, - uint32_t isCenter = 0, - uint32_t coverage = 0, - uint32_t centroid = 0, - uint32_t forced = 0, - uint32_t canEarlyZ = 0 - > -struct SwrBackendTraits -{ - static const bool bIsCenterPattern = (isCenter == 1); - static const uint32_t InputCoverage = coverage; - static const bool bCentroidPos = (centroid == 1); - static const bool bForcedSampleCount = (forced == 1); - static const bool bCanEarlyZ = (canEarlyZ == 1); - typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT; -}; |