diff options
Diffstat (limited to 'src/gallium/drivers/swr/rasterizer/core/binner.cpp')
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/binner.cpp | 283 |
1 files changed, 148 insertions, 135 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 29d2f1ce42c..34789cf0356 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -88,7 +88,7 @@ INLINE void ProcessAttributes( inputSlot = backendState.vertexAttribOffset + i; } - __m128 attrib[3]; // triangle attribs (always 4 wide) + simd4scalar attrib[3]; // triangle attribs (always 4 wide) float* pAttribStart = pBuffer; if (HasConstantInterpT::value || IsDegenerate::value) @@ -128,7 +128,7 @@ INLINE void ProcessAttributes( for (uint32_t i = 0; i < NumVertsT::value; ++i) { - _mm_store_ps(pBuffer, attrib[vid]); + SIMD128::store_ps(pBuffer, attrib[vid]); pBuffer += 4; } } @@ -138,7 +138,7 @@ INLINE void ProcessAttributes( for (uint32_t i = 0; i < NumVertsT::value; ++i) { - _mm_store_ps(pBuffer, attrib[i]); + SIMD128::store_ps(pBuffer, attrib[i]); pBuffer += 4; } } @@ -149,7 +149,7 @@ INLINE void ProcessAttributes( for (uint32_t i = 0; i < NumVertsT::value; ++i) { - _mm_store_ps(pBuffer, attrib[i]); + SIMD128::store_ps(pBuffer, attrib[i]); pBuffer += 4; } } @@ -160,7 +160,7 @@ INLINE void ProcessAttributes( // effect of the missing vertices in the triangle interpolation. for (uint32_t v = NumVertsT::value; v < 3; ++v) { - _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]); + SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]); pBuffer += 4; } @@ -279,8 +279,7 @@ struct GatherScissors_simd16<16> { static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, simd16scalari &scisXmin, simd16scalari &scisYmin, - simd16scalari &scisXmax, simd16scalari &scisYmax) - { + simd16scalari &scisXmax, simd16scalari &scisYmax) { scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin, pScissorsInFixedPoint[pViewportIndex[1]].xmin, pScissorsInFixedPoint[pViewportIndex[2]].xmin, @@ -390,14 +389,14 @@ void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, uint32_t clipAttribSlot = clipSlot == 0 ? VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT; - __m128 primClipDist[3]; + simd4scalar primClipDist[3]; pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist); float vertClipDist[NumVerts]; for (uint32_t e = 0; e < NumVerts; ++e) { OSALIGNSIMD(float) aVertClipDist[4]; - _mm_store_ps(aVertClipDist, primClipDist[e]); + SIMD128::store_ps(aVertClipDist, primClipDist[e]); vertClipDist[e] = aVertClipDist[clipComp]; }; @@ -625,13 +624,14 @@ void BinTriangles( (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false)); } + simdBBox bbox; + if (!triMask) { goto endBinTriangles; } // Calc bounding box of triangles - simdBBox bbox; calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox); // determine if triangle falls between pixel centers and discard @@ -673,28 +673,30 @@ void BinTriangles( // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. - simdscalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.backendState.readViewportArrayIndex) - { - GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); - } - else // broadcast fast path for non-VPAI case. { - scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); - } + simdscalari scisXmin, scisYmin, scisXmax, scisYmax; + if (state.backendState.readViewportArrayIndex) + { + GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. + { + scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } - // Make triangle bbox inclusive - bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)); - bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)); + // Make triangle bbox inclusive + bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)); + bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)); - bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax); - bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax); + bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax); + bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax); + } if (CT::IsConservativeT::value) { @@ -768,7 +770,7 @@ endBinTriangles: // transpose verts needed for backend /// @todo modify BE to take non-transformed verts - __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; + simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x); vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y); vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z); @@ -837,10 +839,10 @@ endBinTriangles: // store triangle vertex data desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); - _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]); - _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]); - _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]); - _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]); + SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]); + SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]); + SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]); + SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]); // store user clip distances if (rastState.clipDistanceMask) @@ -870,7 +872,7 @@ endBinTriangles: #if USE_SIMD16_FRONTEND template <typename CT> -void SIMDAPI BinTriangles_simd16( +void SIMDCALL BinTriangles_simd16( DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, @@ -1124,29 +1126,31 @@ void SIMDAPI BinTriangles_simd16( // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. - simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; - - if (state.backendState.readViewportArrayIndex) - { - GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); - } - else // broadcast fast path for non-VPAI case. { - scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax); - } + simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; + + if (state.backendState.readViewportArrayIndex) + { + GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. + { + scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } - // Make triangle bbox inclusive - bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)); - bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)); + // Make triangle bbox inclusive + bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)); + bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)); - bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax); - bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax); + bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax); + bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax); + } if (CT::IsConservativeT::value) { @@ -1221,10 +1225,10 @@ endBinTriangles: // transpose verts needed for backend /// @todo modify BE to take non-transformed verts - __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH - __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH - __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH - __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0)); vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0)); @@ -1547,24 +1551,26 @@ void BinPostSetupPoints( // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. - simdscalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.backendState.readViewportArrayIndex) - { - GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); - } - else // broadcast fast path for non-VPAI case. { - scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); - } + simdscalari scisXmin, scisYmin, scisXmax, scisYmax; + if (state.backendState.readViewportArrayIndex) + { + GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. + { + scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } - bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); - bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); + bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); + bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); + } // Cull bloated points completely outside scissor simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); @@ -1934,24 +1940,26 @@ void BinPostSetupPoints_simd16( // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. - simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.backendState.readViewportArrayIndex) { - GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); - } - else // broadcast fast path for non-VPAI case. - { - scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax); - } + simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; + if (state.backendState.readViewportArrayIndex) + { + GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. + { + scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } - bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax); - bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax); + bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax); + bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax); + } // Cull bloated points completely outside scissor simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax); @@ -2071,7 +2079,7 @@ void BinPostSetupPoints_simd16( AR_END(FEBinPoints, 1); } -void SIMDAPI BinPoints_simd16( +void SIMDCALL BinPoints_simd16( DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, @@ -2168,6 +2176,8 @@ void BinPostSetupLines( simdscalar& vRecipW0 = recipW[0]; simdscalar& vRecipW1 = recipW[1]; + simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; + // convert to fixed point simdscalari vXi[2], vYi[2]; vXi[0] = fpToFixedPointVertical(prim[0].x); @@ -2214,24 +2224,26 @@ void BinPostSetupLines( bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. - simdscalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.backendState.readViewportArrayIndex) - { - GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); - } - else // broadcast fast path for non-VPAI case. { - scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); - } + simdscalari scisXmin, scisYmin, scisXmax, scisYmax; + if (state.backendState.readViewportArrayIndex) + { + GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. + { + scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } - bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); - bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); + bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); + bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); + } // Cull prims completely outside scissor { @@ -2261,7 +2273,6 @@ void BinPostSetupLines( // transpose verts needed for backend /// @todo modify BE to take non-transformed verts - __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused); vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused); vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused); @@ -2310,10 +2321,10 @@ void BinPostSetupLines( // store line vertex data desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); - _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); - _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); - _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); - _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); + SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); + SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); + SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); + SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); // store user clip distances if (rastState.clipDistanceMask) @@ -2417,25 +2428,27 @@ void BinPostSetupLines_simd16( bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. - simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; - - if (state.backendState.readViewportArrayIndex) - { - GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); - } - else // broadcast fast path for non-VPAI case. { - scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax); - } + simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; + + if (state.backendState.readViewportArrayIndex) + { + GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. + { + scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } - bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax); - bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax); + bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax); + bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax); + } // Cull prims completely outside scissor { @@ -2468,10 +2481,10 @@ void BinPostSetupLines_simd16( // transpose verts needed for backend /// @todo modify BE to take non-transformed verts - __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH - __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH - __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH - __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused); vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused); @@ -2650,7 +2663,7 @@ void BinLines( } #if USE_SIMD16_FRONTEND -void SIMDAPI BinLines_simd16( +void SIMDCALL BinLines_simd16( DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, |