summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorTim Rowley <timothy.o.rowley@intel.com>2017-02-15 13:45:16 -0800
committerTim Rowley <timothy.o.rowley@intel.com>2017-03-20 18:04:53 -0500
commitd2759c1eb3b77e9d86c52f2f8e6471a8f339228d (patch)
tree38793cbf1e9ca417ab5ac5a2b424363994c07641 /src
parent2c820d22cf703692b5de66289a0c81d7555aa9d6 (diff)
swr: [rasterizer core/scripts] Autogen backend initialization function(s)
Autogen functions that instantiate different BackendPixelRate templates. Functions get split into separate files after reaching a user-defined threshold (currently 512 per file) to speed up compilation. This change will enable the addition of more template flags in the pixel back end. Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
Diffstat (limited to 'src')
-rw-r--r--src/gallium/drivers/swr/.gitignore1
-rw-r--r--src/gallium/drivers/swr/Makefile.am22
-rw-r--r--src/gallium/drivers/swr/SConscript13
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/backend.cpp226
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/backend.h199
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/gen_backends.py125
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/templates/backend_template.cpp38
7 files changed, 398 insertions, 226 deletions
diff --git a/src/gallium/drivers/swr/.gitignore b/src/gallium/drivers/swr/.gitignore
index 172f3bfbcb7..b6c5faa110a 100644
--- a/src/gallium/drivers/swr/.gitignore
+++ b/src/gallium/drivers/swr/.gitignore
@@ -10,3 +10,4 @@ rasterizer/jitter/builder_x86.h
rasterizer/jitter/state_llvm.h
rasterizer/scripts/gen_knobs.cpp
rasterizer/scripts/gen_knobs.h
+rasterizer/core/BackendPixelRate0.cpp
diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am
index b22ded0a191..c67eadc05df 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -61,7 +61,8 @@ BUILT_SOURCES = \
rasterizer/archrast/gen_ar_event.h \
rasterizer/archrast/gen_ar_event.cpp \
rasterizer/archrast/gen_ar_eventhandler.h \
- rasterizer/archrast/gen_ar_eventhandlerfile.h
+ rasterizer/archrast/gen_ar_eventhandlerfile.h \
+ rasterizer/core/BackendPixelRate0.cpp
MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)
@@ -156,6 +157,21 @@ rasterizer/archrast/gen_ar_eventhandlerfile.h: rasterizer/scripts/gen_archrast.p
--output rasterizer/archrast/gen_ar_eventhandlerfile.h \
--gen_eventhandlerfile_h
+# 5 SWR_MULTISAMPLE_TYPE_COUNT
+# 2 SWR_MSAA_SAMPLE_PATTERN_COUNT
+# 3 SWR_INPUT_COVERAGE_COUNT
+# 2 centroid
+# 2 forcedSampleCount
+# 2 canEarlyZ
+rasterizer/core/BackendPixelRate0.cpp: rasterizer/scripts/gen_backends.py rasterizer/scripts/templates/backend_template.cpp
+ $(MKDIR_GEN)
+ $(PYTHON_GEN) \
+ $(srcdir)/rasterizer/scripts/gen_backends.py \
+ --outdir rasterizer/core \
+ --dim 5 2 3 2 2 2 \
+ --split 0 \
+ --cpp
+
COMMON_LIBADD = \
$(top_builddir)/src/gallium/auxiliary/libgallium.la \
$(top_builddir)/src/mesa/libmesagallium.la \
@@ -250,6 +266,7 @@ EXTRA_DIST = \
rasterizer/jitter/scripts/gen_llvm_ir_macros.py \
rasterizer/jitter/scripts/gen_llvm_types.py \
rasterizer/scripts/gen_archrast.py \
+ rasterizer/scripts/gen_backends.py \
rasterizer/scripts/gen_knobs.py \
rasterizer/scripts/knob_defs.py \
rasterizer/scripts/mako/ast.py \
@@ -273,4 +290,5 @@ EXTRA_DIST = \
rasterizer/scripts/templates/ar_event_h.template \
rasterizer/scripts/templates/ar_event_cpp.template \
rasterizer/scripts/templates/ar_eventhandler_h.template \
- rasterizer/scripts/templates/ar_eventhandlerfile_h.template
+ rasterizer/scripts/templates/ar_eventhandlerfile_h.template \
+ rasterizer/scripts/templates/backend_template.cpp
diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript
index c0677afc97f..dafeb9229d7 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -132,12 +132,25 @@ env.CodeGenerate(
command = python_cmd + ' $SCRIPT --proto $SOURCE --output $TARGET --gen_eventhandlerfile_h'
)
+# 5 SWR_MULTISAMPLE_TYPE_COUNT
+# 2 SWR_MSAA_SAMPLE_PATTERN_COUNT
+# 3 SWR_INPUT_COVERAGE_COUNT
+# 2 centroid
+# 2 forcedSampleCount
+# 2 canEarlyZ
+env.CodeGenerate(
+ target = 'rasterizer/core/BackendPixelRate0.cpp',
+ script = swrroot + 'rasterizer/scripts/gen_backends.py',
+ command = python_cmd + ' $SCRIPT --output rasterizer/core --dim 5 2 3 2 2 2 --split 0 --cpp'
+)
+
# Auto-generated .cpp files (that need to generate object files)
built_sources = [
'rasterizer/scripts/gen_knobs.cpp',
'rasterizer/jitter/builder_gen.cpp',
'rasterizer/jitter/builder_x86.cpp',
'rasterizer/archrast/gen_ar_event.cpp',
+ 'rasterizer/core/BackendPixelRate0.cpp',
]
source = built_sources
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 3f63b28e71c..032a2206d62 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -30,7 +30,6 @@
#include <smmintrin.h>
#include "backend.h"
-#include "depthstencil.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
@@ -862,203 +861,6 @@ Endtile:
AR_END(BESampleRateBackend, 0);
}
-
-template<typename T>
-void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
-{
- SWR_CONTEXT *pContext = pDC->pContext;
-
- AR_BEGIN(BEPixelRateBackend, pDC->drawId);
- AR_BEGIN(BESetup, pDC->drawId);
-
- const API_STATE &state = GetApiState(pDC);
-
- BarycentricCoeffs coeffs;
- SetupBarycentricCoeffs(&coeffs, work);
-
- uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
- SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
- SWR_PS_CONTEXT psContext;
- SetupPixelShaderContext<T>(&psContext, work);
-
- AR_END(BESetup, 0);
-
- PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
-
- psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
- psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
- const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
- for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
- {
- psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
- psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
- const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
- for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
- {
-#if USE_8x2_TILE_BACKEND
- const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-
-#endif
- simdscalar activeLanes;
- if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
- activeLanes = vMask(work.anyCoveredSamples & MASK);
-
- if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
- {
- const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
-
- generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
- }
-
- AR_BEGIN(BEBarycentric, pDC->drawId);
-
- CalcPixelBarycentrics(coeffs, psContext);
-
- CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
-
- AR_END(BEBarycentric, 0);
-
- if(T::bForcedSampleCount)
- {
- // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
- const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
- activeLanes = _simd_and_ps(activeLanes, vSampleMask);
- }
-
- // Early-Z?
- if(T::bCanEarlyZ && !T::bForcedSampleCount)
- {
- uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
- UPDATE_STAT_BE(DepthPassCount, depthPassCount);
- AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
- }
-
- // if we have no covered samples that passed depth at this point, go to next tile
- if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
- if(state.psState.usesSourceDepth)
- {
- AR_BEGIN(BEBarycentric, pDC->drawId);
- // interpolate and quantize z
- psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
- psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
- AR_END(BEBarycentric, 0);
- }
-
- // pixels that are currently active
- psContext.activeMask = _simd_castps_si(activeLanes);
- psContext.oMask = T::MultisampleT::FullSampleMask();
-
- // execute pixel shader
- AR_BEGIN(BEPixelShader, pDC->drawId);
- state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
- UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
- AR_END(BEPixelShader, 0);
-
- // update active lanes to remove any discarded or oMask'd pixels
- activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
- if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
- // late-Z
- if(!T::bCanEarlyZ && !T::bForcedSampleCount)
- {
- uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
- UPDATE_STAT_BE(DepthPassCount, depthPassCount);
- AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
- }
-
- // if we have no covered samples that passed depth at this point, skip OM and go to next tile
- if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
- // output merger
- // loop over all samples, broadcasting the results of the PS to all passing pixels
- for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
- {
- AR_BEGIN(BEOutputMerger, pDC->drawId);
- // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
- uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
- simdscalar coverageMask, depthMask;
- if(T::bForcedSampleCount)
- {
- coverageMask = depthMask = activeLanes;
- }
- else
- {
- coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
- depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
- if(!_simd_movemask_ps(depthMask))
- {
- // stencil should already have been written in early/lateZ tests
- AR_END(BEOutputMerger, 0);
- continue;
- }
- }
-
- // broadcast the results of the PS to all passing pixels
-#if USE_8x2_TILE_BACKEND
- OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else
- OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
-#endif
-
- if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
- {
- uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
- uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
- DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
- pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
- }
- AR_END(BEOutputMerger, 0);
- }
-Endtile:
- AR_BEGIN(BEEndTile, pDC->drawId);
-
- for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
- {
- work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
- }
-
- if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
- {
- work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
- }
- work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-
-#if USE_8x2_TILE_BACKEND
- if (useAlternateOffset)
- {
- for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
- {
- pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
- }
- }
-#else
- for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
- {
- pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
- }
- pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
- pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-#endif
-
- AR_END(BEEndTile, 0);
-
- psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
- psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
- }
-
- psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
- psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
- }
-
- AR_END(BEPixelRateBackend, 0);
-}
// optimized backend flow with NULL PS
template<uint32_t sampleCountT>
void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
@@ -1302,31 +1104,6 @@ void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COU
}
}
-void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2])
-{
- for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
- {
- for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++)
- {
- for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
- {
- for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
- {
- for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
- {
- for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
- {
- table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
- BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (SWR_INPUT_COVERAGE)inputCoverage,
- (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
- }
- }
- }
- }
- }
- }
-}
-
void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
{
for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
@@ -1346,10 +1123,11 @@ void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_C
}
}
+void InitBackendPixelRate0();
void InitBackendFuncTables()
{
InitBackendSingleFuncTable(gBackendSingleSample);
- InitBackendPixelFuncTable(gBackendPixelRateTable);
+ InitBackendPixelRate0();
InitBackendSampleFuncTable(gBackendSampleRateTable);
gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 80ee1defdad..c3585cc930c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -31,6 +31,7 @@
#include "common/os.h"
#include "core/context.h"
#include "core/multisample.h"
+#include "depthstencil.h"
#include "rdtsc_core.h"
void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
@@ -835,6 +836,204 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW
}
#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Pixel-rate backend. Walks the tile starting at (x, y) one SIMD
+///        tile at a time; for each SIMD tile it optionally runs early-Z,
+///        invokes the pixel shader once, optionally runs late-Z, and then
+///        loops over the output-merger samples, broadcasting the shader
+///        result to every passing sample.
+/// @tparam T - backend traits type. This function reads T::InputCoverage,
+///        T::bForcedSampleCount, T::bCanEarlyZ, T::bIsStandardPattern and
+///        T::MultisampleT from it.
+/// @param pDC - draw context; source of the API state, draw id and private state.
+/// @param workerId - forwarded to the PixelRateZTestLoop helper.
+/// @param x - first x coordinate covered; the loop runs to x + KNOB_TILE_X_DIM.
+/// @param y - first y coordinate covered; the loop runs to y + KNOB_TILE_Y_DIM.
+/// @param work - triangle descriptor. Its coverage masks are modified: they are
+///        shifted right by one SIMD tile's worth of bits after each SIMD tile.
+/// @param renderBuffers - passed to SetupRenderBuffers to obtain the color,
+///        depth and stencil buffer pointers.
+template<typename T>
+void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+ SWR_CONTEXT *pContext = pDC->pContext;
+
+ AR_BEGIN(BEPixelRateBackend, pDC->drawId);
+ AR_BEGIN(BESetup, pDC->drawId);
+
+ const API_STATE &state = GetApiState(pDC);
+
+ BarycentricCoeffs coeffs;
+ SetupBarycentricCoeffs(&coeffs, work);
+
+ uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
+ SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
+
+ SWR_PS_CONTEXT psContext;
+ SetupPixelShaderContext<T>(&psContext, work);
+
+ AR_END(BESetup, 0);
+
+ PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
+
+ psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+ psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+
+ const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
+
+ for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+ {
+ psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+ const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
+
+ for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+ {
+#if USE_8x2_TILE_BACKEND
+ const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
+
+#endif
+ simdscalar activeLanes;
+ // no bits set under MASK for this SIMD tile: skip the work and go straight to the end-of-tile bookkeeping
+ if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
+ activeLanes = vMask(work.anyCoveredSamples & MASK);
+
+ if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+ {
+ const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+ generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+ }
+
+ AR_BEGIN(BEBarycentric, pDC->drawId);
+
+ CalcPixelBarycentrics(coeffs, psContext);
+
+ CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
+
+ AR_END(BEBarycentric, 0);
+
+ if(T::bForcedSampleCount)
+ {
+ // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
+ const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
+ activeLanes = _simd_and_ps(activeLanes, vSampleMask);
+ }
+
+ // Early-Z?
+ if(T::bCanEarlyZ && !T::bForcedSampleCount)
+ {
+ uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
+ UPDATE_STAT_BE(DepthPassCount, depthPassCount);
+ AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
+ }
+
+ // if we have no covered samples that passed depth at this point, go to next tile
+ if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+ if(state.psState.usesSourceDepth)
+ {
+ AR_BEGIN(BEBarycentric, pDC->drawId);
+ // interpolate and quantize z
+ psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+ psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+ AR_END(BEBarycentric, 0);
+ }
+
+ // pixels that are currently active
+ psContext.activeMask = _simd_castps_si(activeLanes);
+ psContext.oMask = T::MultisampleT::FullSampleMask();
+
+ // execute pixel shader
+ AR_BEGIN(BEPixelShader, pDC->drawId);
+ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+ UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
+ AR_END(BEPixelShader, 0);
+
+ // update active lanes to remove any discarded or oMask'd pixels
+ activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
+ if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+ // late-Z
+ if(!T::bCanEarlyZ && !T::bForcedSampleCount)
+ {
+ uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
+ UPDATE_STAT_BE(DepthPassCount, depthPassCount);
+ AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
+ }
+
+ // if we have no covered samples that passed depth at this point, skip OM and go to next tile
+ if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+ // output merger
+ // loop over all samples, broadcasting the results of the PS to all passing pixels
+ for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
+ {
+ AR_BEGIN(BEOutputMerger, pDC->drawId);
+ // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
+ uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
+ simdscalar coverageMask, depthMask;
+ if(T::bForcedSampleCount)
+ {
+ coverageMask = depthMask = activeLanes;
+ }
+ else
+ {
+ coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
+ depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
+ if(!_simd_movemask_ps(depthMask))
+ {
+ // stencil should already have been written in early/lateZ tests
+ AR_END(BEOutputMerger, 0);
+ continue;
+ }
+ }
+
+ // broadcast the results of the PS to all passing pixels
+#if USE_8x2_TILE_BACKEND
+ OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
+#else
+ OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
+#endif
+
+ if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
+ {
+ uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+ uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
+ pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
+ }
+ AR_END(BEOutputMerger, 0);
+ }
+Endtile:
+ // end-of-SIMD-tile bookkeeping: shift this tile's bits out of the coverage masks and advance the buffer pointers
+ AR_BEGIN(BEEndTile, pDC->drawId);
+
+ for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+ {
+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+ }
+
+ if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+ {
+ work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+ }
+ work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+
+#if USE_8x2_TILE_BACKEND
+ if (useAlternateOffset)
+ {
+ for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+ {
+ pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+ }
+ }
+#else
+ for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+ {
+ pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+ }
+ pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+ pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+#endif
+
+ AR_END(BEEndTile, 0);
+
+ psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
+ psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
+ }
+
+ psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
+ psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
+ }
+
+ AR_END(BEPixelRateBackend, 0);
+}
+
template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SWR_MSAA_STANDARD_PATTERN,
uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0>
struct SwrBackendTraits
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_backends.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_backends.py
new file mode 100644
index 00000000000..cbbc3780a68
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_backends.py
@@ -0,0 +1,125 @@
+# Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+# Python source
+# Compatible with Python2.X and Python3.X
+
+from __future__ import print_function
+import itertools
+import math
+import argparse
+import os
+import sys
+from mako.template import Template
+from mako.exceptions import RichTraceback
+
+def write_template_to_string(template_filename, **kwargs):
+ try:
+ template = Template(filename=os.path.abspath(template_filename))
+ # Split + Join fixes line-endings for whatever platform you are using
+ return '\n'.join(template.render(**kwargs).splitlines())
+ except:
+ traceback = RichTraceback()
+ for (filename, lineno, function, line) in traceback.traceback:
+ print("File %s, line %s, in %s" % (filename, lineno, function))
+ print(line, "\n")
+ print("%s: %s" % (str(traceback.error.__class__.__name__), traceback.error))
+
+def write_template_to_file(template_filename, output_filename, **kwargs):
+ output_dirname = os.path.dirname(output_filename)
+ if not os.path.exists(output_dirname):
+ os.makedirs(output_dirname)
+ with open(output_filename, "w") as outfile:
+ print(write_template_to_string(template_filename, **kwargs), file=outfile)
+
+
+def main(args=sys.argv[1:]):
+ thisDir = os.path.dirname(os.path.realpath(__file__))
+ parser = argparse.ArgumentParser("Generate files and initialization functions for all permutuations of BackendPixelRate.")
+ parser.add_argument('--dim', help="gBackendPixelRateTable array dimensions", nargs='+', type=int, required=True)
+ parser.add_argument('--outdir', help="output directory", nargs='?', type=str, default=thisDir)
+ parser.add_argument('--split', help="how many lines of initialization per file [0=no split]", nargs='?', type=int, default='512')
+ parser.add_argument('--cpp', help="Generate cpp file(s)", action='store_true', default=False)
+ parser.add_argument('--cmake', help="Generate cmake file", action='store_true', default=False)
+
+
+ args = parser.parse_args(args);
+
+ output_list = []
+ for x in args.dim:
+ output_list.append(list(range(x)))
+
+ # generate all permutations possible for template paremeter inputs
+ output_combinations = list(itertools.product(*output_list))
+ output_list = []
+
+ # for each permutation
+ for x in range(len(output_combinations)):
+ # separate each template peram into its own list member
+ new_list = [output_combinations[x][i] for i in range(len(output_combinations[x]))]
+ tempStr = 'gBackendPixelRateTable'
+ #print each list member as an index in the multidimensional array
+ for i in new_list:
+ tempStr += '[' + str(i) + ']'
+ #map each entry in the permuation as its own string member, store as the template instantiation string
+ tempStr += " = BackendPixelRate<SwrBackendTraits<" + ','.join(map(str, output_combinations[x])) + '>>;'
+ #append the line of c++ code in the list of output lines
+ output_list.append(tempStr)
+
+ # how many files should we split the global template initialization into?
+ if (args.split == 0):
+ numFiles = 1
+ else:
+ numFiles = (len(output_list) + args.split - 1) // args.split
+ linesPerFile = (len(output_list) + numFiles - 1) // numFiles
+ chunkedList = [output_list[x:x+linesPerFile] for x in range(0, len(output_list), linesPerFile)]
+
+ # generate .cpp files
+ if args.cpp:
+ baseCppName = os.path.join(args.outdir, 'BackendPixelRate%s.cpp')
+ templateCpp = os.path.join(thisDir, 'templates', 'backend_template.cpp')
+
+ for fileNum in range(numFiles):
+ filename = baseCppName % str(fileNum)
+ print('Generating', filename)
+ write_template_to_file(
+ templateCpp,
+ baseCppName % str(fileNum),
+ fileNum=fileNum,
+ funcList=chunkedList[fileNum])
+
+ # generate gen_backend.cmake file
+ if args.cmake:
+ templateCmake = os.path.join(thisDir, 'templates', 'backend_template.cmake')
+ cmakeFile = os.path.join(args.outdir, 'gen_backends.cmake')
+ print('Generating', cmakeFile)
+ write_template_to_file(
+ templateCmake,
+ cmakeFile,
+ numFiles=numFiles,
+ baseCppName=baseCppName.replace('\\','/'))
+
+ print("Generated %d template instantiations in %d files" % (len(output_list), numFiles))
+
+ return 0
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/backend_template.cpp b/src/gallium/drivers/swr/rasterizer/scripts/templates/backend_template.cpp
new file mode 100644
index 00000000000..f015f5f179c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/backend_template.cpp
@@ -0,0 +1,38 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file BackendPixelRate${fileNum}.cpp
+*
+* @brief auto-generated file
+*
+* DO NOT EDIT
+*
+******************************************************************************/
+
+#include "core/backend.h"
+
+// One statement is emitted per entry of funcList. gen_backends.py builds each
+// entry as an assignment of a BackendPixelRate<SwrBackendTraits<...>>
+// instantiation to one gBackendPixelRateTable element.
+void InitBackendPixelRate${fileNum}()
+{
+ %for func in funcList:
+ ${func}
+ %endfor
+}