summary | refs | log | tree | commit | diff
path: root/src/gallium/drivers/swr/rasterizer/core/binner.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/swr/rasterizer/core/binner.cpp')
-rw-r--r-- src/gallium/drivers/swr/rasterizer/core/binner.cpp 283
1 files changed, 148 insertions, 135 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 29d2f1ce42c..34789cf0356 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -88,7 +88,7 @@ INLINE void ProcessAttributes(
inputSlot = backendState.vertexAttribOffset + i;
}
- __m128 attrib[3]; // triangle attribs (always 4 wide)
+ simd4scalar attrib[3]; // triangle attribs (always 4 wide)
float* pAttribStart = pBuffer;
if (HasConstantInterpT::value || IsDegenerate::value)
@@ -128,7 +128,7 @@ INLINE void ProcessAttributes(
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
- _mm_store_ps(pBuffer, attrib[vid]);
+ SIMD128::store_ps(pBuffer, attrib[vid]);
pBuffer += 4;
}
}
@@ -138,7 +138,7 @@ INLINE void ProcessAttributes(
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
- _mm_store_ps(pBuffer, attrib[i]);
+ SIMD128::store_ps(pBuffer, attrib[i]);
pBuffer += 4;
}
}
@@ -149,7 +149,7 @@ INLINE void ProcessAttributes(
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
- _mm_store_ps(pBuffer, attrib[i]);
+ SIMD128::store_ps(pBuffer, attrib[i]);
pBuffer += 4;
}
}
@@ -160,7 +160,7 @@ INLINE void ProcessAttributes(
// effect of the missing vertices in the triangle interpolation.
for (uint32_t v = NumVertsT::value; v < 3; ++v)
{
- _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
+ SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
pBuffer += 4;
}
@@ -279,8 +279,7 @@ struct GatherScissors_simd16<16>
{
static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
simd16scalari &scisXmin, simd16scalari &scisYmin,
- simd16scalari &scisXmax, simd16scalari &scisYmax)
- {
+ simd16scalari &scisXmax, simd16scalari &scisYmax) {
scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
pScissorsInFixedPoint[pViewportIndex[1]].xmin,
pScissorsInFixedPoint[pViewportIndex[2]].xmin,
@@ -390,14 +389,14 @@ void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask,
uint32_t clipAttribSlot = clipSlot == 0 ?
VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
- __m128 primClipDist[3];
+ simd4scalar primClipDist[3];
pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
float vertClipDist[NumVerts];
for (uint32_t e = 0; e < NumVerts; ++e)
{
OSALIGNSIMD(float) aVertClipDist[4];
- _mm_store_ps(aVertClipDist, primClipDist[e]);
+ SIMD128::store_ps(aVertClipDist, primClipDist[e]);
vertClipDist[e] = aVertClipDist[clipComp];
};
@@ -625,13 +624,14 @@ void BinTriangles(
(SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
}
+ simdBBox bbox;
+
if (!triMask)
{
goto endBinTriangles;
}
// Calc bounding box of triangles
- simdBBox bbox;
calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
// determine if triangle falls between pixel centers and discard
@@ -673,28 +673,30 @@ void BinTriangles(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- // Make triangle bbox inclusive
- bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
- bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
+ // Make triangle bbox inclusive
+ bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
+ bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
- bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
+ bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
+ }
if (CT::IsConservativeT::value)
{
@@ -768,7 +770,7 @@ endBinTriangles:
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+ simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
@@ -837,10 +839,10 @@ endBinTriangles:
// store triangle vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
- _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
// store user clip distances
if (rastState.clipDistanceMask)
@@ -870,7 +872,7 @@ endBinTriangles:
#if USE_SIMD16_FRONTEND
template <typename CT>
-void SIMDAPI BinTriangles_simd16(
+void SIMDCALL BinTriangles_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
@@ -1124,29 +1126,31 @@ void SIMDAPI BinTriangles_simd16(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- // Make triangle bbox inclusive
- bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
- bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
+ // Make triangle bbox inclusive
+ bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
+ bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
- bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
- bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
+ bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
+ bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
+ }
if (CT::IsConservativeT::value)
{
@@ -1221,10 +1225,10 @@ endBinTriangles:
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
@@ -1547,24 +1551,26 @@ void BinPostSetupPoints(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+ }
// Cull bloated points completely outside scissor
simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
@@ -1934,24 +1940,26 @@ void BinPostSetupPoints_simd16(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
{
- GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
- {
- scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ }
// Cull bloated points completely outside scissor
simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
@@ -2071,7 +2079,7 @@ void BinPostSetupPoints_simd16(
AR_END(FEBinPoints, 1);
}
-void SIMDAPI BinPoints_simd16(
+void SIMDCALL BinPoints_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
@@ -2168,6 +2176,8 @@ void BinPostSetupLines(
simdscalar& vRecipW0 = recipW[0];
simdscalar& vRecipW1 = recipW[1];
+ simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+
// convert to fixed point
simdscalari vXi[2], vYi[2];
vXi[0] = fpToFixedPointVertical(prim[0].x);
@@ -2214,24 +2224,26 @@ void BinPostSetupLines(
bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+ }
// Cull prims completely outside scissor
{
@@ -2261,7 +2273,6 @@ void BinPostSetupLines(
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
@@ -2310,10 +2321,10 @@ void BinPostSetupLines(
// store line vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
- _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
// store user clip distances
if (rastState.clipDistanceMask)
@@ -2417,25 +2428,27 @@ void BinPostSetupLines_simd16(
bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ }
// Cull prims completely outside scissor
{
@@ -2468,10 +2481,10 @@ void BinPostSetupLines_simd16(
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
@@ -2650,7 +2663,7 @@ void BinLines(
}
#if USE_SIMD16_FRONTEND
-void SIMDAPI BinLines_simd16(
+void SIMDCALL BinLines_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,