summaryrefslogtreecommitdiff
path: root/src/gallium/drivers/swr/rasterizer/core/pa.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/swr/rasterizer/core/pa.h')
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/pa.h1676
1 files changed, 0 insertions, 1676 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
deleted file mode 100644
index adfc1414bae..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ /dev/null
@@ -1,1676 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * @file pa.h
- *
- * @brief Definitions for primitive assembly.
- * N primitives are assembled at a time, where N is the SIMD width.
- * A state machine, that is specific for a given topology, drives the
- * assembly of vertices into triangles.
- *
- ******************************************************************************/
-#pragma once
-
-#include "frontend.h"
-
-struct PA_STATE
-{
-#if USE_SIMD16_FRONTEND
- enum
- {
- SIMD_WIDTH = KNOB_SIMD16_WIDTH,
- SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
- SIMD_WIDTH_LOG2 = 4
- };
-
- typedef simd16mask SIMDMASK;
-
- typedef simd16scalar SIMDSCALAR;
- typedef simd16vector SIMDVECTOR;
- typedef simd16vertex SIMDVERTEX;
-
- typedef simd16scalari SIMDSCALARI;
-
-#else
- enum
- {
- SIMD_WIDTH = KNOB_SIMD_WIDTH,
- SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
- SIMD_WIDTH_LOG2 = 3
- };
-
- typedef simdmask SIMDMASK;
-
- typedef simdscalar SIMDSCALAR;
- typedef simdvector SIMDVECTOR;
- typedef simdvertex SIMDVERTEX;
-
- typedef simdscalari SIMDSCALARI;
-
-#endif
- DRAW_CONTEXT* pDC{nullptr}; // draw context
- uint8_t* pStreamBase{nullptr}; // vertex stream
- uint32_t streamSizeInVerts{0}; // total size of the input stream in verts
- uint32_t vertexStride{0}; // stride of a vertex in simdvector units
-
- // The topology the binner will use. In some cases the FE changes the topology from the api
- // state.
- PRIMITIVE_TOPOLOGY binTopology{TOP_UNKNOWN};
-
-#if ENABLE_AVX512_SIMD16
- bool useAlternateOffset{false};
-#endif
-
- bool viewportArrayActive{false};
- bool rtArrayActive{false};
- uint32_t numVertsPerPrim{0};
-
- PA_STATE() {}
- PA_STATE(DRAW_CONTEXT* in_pDC,
- uint8_t* in_pStreamBase,
- uint32_t in_streamSizeInVerts,
- uint32_t in_vertexStride,
- uint32_t in_numVertsPerPrim) :
- pDC(in_pDC),
- pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts),
- vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim)
- {
- }
-
- virtual bool HasWork() = 0;
- virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
-#if ENABLE_AVX512_SIMD16
- virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
-#endif
- virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
-#if ENABLE_AVX512_SIMD16
- virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0;
-#endif
- virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
- virtual bool NextPrim() = 0;
- virtual SIMDVERTEX& GetNextVsOutput() = 0;
- virtual bool GetNextStreamOutput() = 0;
- virtual SIMDMASK& GetNextVsIndices() = 0;
- virtual uint32_t NumPrims() = 0;
- virtual void Reset() = 0;
- virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
-};
-
-// The Optimized PA is a state machine that assembles triangles from vertex shader simd
-// output. Here is the sequence
-// 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
-// 2. Execute PA function to assemble and bin triangles.
-// a. The PA function is a set of functions that collectively make up the
-// state machine for a given topology.
-// 1. We use a state index to track which PA function to call.
-// b. Often the PA function needs to 2 simd vertices in order to assemble the next triangle.
-// 1. We call this the current and previous simd vertex.
-// 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
-// order to assemble the second triangle, for a triangle list, we'll need the
-// last vertex from the previous simd and the first 2 vertices from the current
-// simd.
-// 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
-//
-// This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
-// cuts
-struct PA_STATE_OPT : public PA_STATE
-{
- uint32_t numPrims{0}; // Total number of primitives for draw.
- uint32_t numPrimsComplete{0}; // Total number of complete primitives.
-
- uint32_t numSimdPrims{0}; // Number of prims in current simd.
-
- uint32_t cur{0}; // index to current VS output.
- uint32_t prev{0}; // index to prev VS output. Not really needed in the state.
- const uint32_t first{0}; // index to first VS output. Used for tri fan and line loop.
-
- uint32_t counter{0}; // state counter
- bool reset{false}; // reset state
-
- uint32_t primIDIncr{0}; // how much to increment for each vector (typically vector / {1, 2})
- SIMDSCALARI primID;
-
- typedef bool (*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
-#if ENABLE_AVX512_SIMD16
- typedef bool (*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
-#endif
- typedef void (*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa,
- uint32_t slot,
- uint32_t primIndex,
- simd4scalar verts[]);
-
- PFN_PA_FUNC pfnPaFunc{nullptr}; // PA state machine function for assembling 4 triangles.
-#if ENABLE_AVX512_SIMD16
- PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{nullptr};
-#endif
- PFN_PA_SINGLE_FUNC pfnPaSingleFunc{
- nullptr}; // PA state machine function for assembling single triangle.
- PFN_PA_FUNC pfnPaFuncReset{nullptr}; // initial state to set on reset
-#if ENABLE_AVX512_SIMD16
- PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{nullptr};
-#endif
-
- // state used to advance the PA when Next is called
- PFN_PA_FUNC pfnPaNextFunc{nullptr};
-#if ENABLE_AVX512_SIMD16
- PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{nullptr};
-#endif
- uint32_t nextNumSimdPrims{0};
- uint32_t nextNumPrimsIncrement{0};
- bool nextReset{false};
- bool isStreaming{false};
-
- SIMDMASK junkIndices{0}; // temporary index store for unused virtual function
-
- PA_STATE_OPT() {}
- PA_STATE_OPT(DRAW_CONTEXT* pDC,
- uint32_t numPrims,
- uint8_t* pStream,
- uint32_t streamSizeInVerts,
- uint32_t vertexStride,
- bool in_isStreaming,
- uint32_t numVertsPerPrim,
- PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
-
- bool HasWork() { return (this->numPrimsComplete < this->numPrims) ? true : false; }
-
- simdvector& GetSimdVector(uint32_t index, uint32_t slot)
- {
- SWR_ASSERT(slot < vertexStride);
- uint32_t offset = index * vertexStride + slot;
- simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
- return vertexSlot;
- }
-
-#if ENABLE_AVX512_SIMD16
- simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
- {
- SWR_ASSERT(slot < vertexStride);
- uint32_t offset = index * vertexStride + slot;
- simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
- return vertexSlot;
- }
-
-#endif
- // Assembles 4 triangles. Each simdvector is a single vertex from 4
- // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
- bool Assemble(uint32_t slot, simdvector verts[]) { return this->pfnPaFunc(*this, slot, verts); }
-
-#if ENABLE_AVX512_SIMD16
- bool Assemble(uint32_t slot, simd16vector verts[])
- {
- return this->pfnPaFunc_simd16(*this, slot, verts);
- }
-
-#endif
- // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
- void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
- {
- return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
- }
-
- bool NextPrim()
- {
- this->pfnPaFunc = this->pfnPaNextFunc;
-#if ENABLE_AVX512_SIMD16
- this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
-#endif
- this->numSimdPrims = this->nextNumSimdPrims;
- this->numPrimsComplete += this->nextNumPrimsIncrement;
- this->reset = this->nextReset;
-
- if (this->isStreaming)
- {
- this->reset = false;
- }
-
- bool morePrims = false;
-
- if (this->numSimdPrims > 0)
- {
- morePrims = true;
- this->numSimdPrims--;
- }
- else
- {
- this->counter = (this->reset) ? 0 : (this->counter + 1);
- this->reset = false;
- }
-
- if (!HasWork())
- {
- morePrims = false; // no more to do
- }
-
- return morePrims;
- }
-
- SIMDVERTEX& GetNextVsOutput()
- {
- const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;
-
- // increment cur and prev indices
- if (counter < numSimdVerts)
- {
- // prev undefined for first state
- prev = cur;
- cur = counter;
- }
- else
- {
- // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in
- // the buffer
- uint32_t temp = prev;
-
- prev = cur;
- cur = temp;
- }
-
- SWR_ASSERT(cur < numSimdVerts);
- SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];
-
- return *(SIMDVERTEX*)pVertex;
- }
-
- SIMDMASK& GetNextVsIndices()
- {
- // unused in optimized PA, pass tmp buffer back
- return junkIndices;
- }
-
- bool GetNextStreamOutput()
- {
- this->prev = this->cur;
- this->cur = this->counter;
-
- return HasWork();
- }
-
- uint32_t NumPrims()
- {
- return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims)
- ? (SIMD_WIDTH -
- (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims))
- : SIMD_WIDTH;
- }
-
- void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
- PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
- uint32_t numSimdPrims = 0,
- uint32_t numPrimsIncrement = 0,
- bool reset = false)
- {
- this->pfnPaNextFunc = pfnPaNextFunc;
- this->nextNumSimdPrims = numSimdPrims;
- this->nextNumPrimsIncrement = numPrimsIncrement;
- this->nextReset = reset;
-
- this->pfnPaSingleFunc = pfnPaNextSingleFunc;
- }
-
-#if ENABLE_AVX512_SIMD16
- void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
- PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
- PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
- uint32_t numSimdPrims = 0,
- uint32_t numPrimsIncrement = 0,
- bool reset = false)
- {
- this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
- this->pfnPaNextFunc = pfnPaNextFunc;
- this->nextNumSimdPrims = numSimdPrims;
- this->nextNumPrimsIncrement = numPrimsIncrement;
- this->nextReset = reset;
-
- this->pfnPaSingleFunc = pfnPaNextSingleFunc;
- }
-
-#endif
- void Reset()
- {
-#if ENABLE_AVX512_SIMD16
- useAlternateOffset = false;
-
-#endif
- this->pfnPaFunc = this->pfnPaFuncReset;
-#if ENABLE_AVX512_SIMD16
- this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
-#endif
- this->numPrimsComplete = 0;
- this->numSimdPrims = 0;
- this->cur = 0;
- this->prev = 0;
- this->counter = 0;
- this->reset = false;
- }
-
- SIMDSCALARI GetPrimID(uint32_t startID)
- {
-#if USE_SIMD16_FRONTEND
- return _simd16_add_epi32(
- this->primID,
- _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
-#else
- return _simd_add_epi32(
- this->primID,
- _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
-#endif
- }
-};
-
-// helper C wrappers to avoid having to rewrite all the PA topology state functions
-INLINE void SetNextPaState(PA_STATE_OPT& pa,
- PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
- PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
- uint32_t numSimdPrims = 0,
- uint32_t numPrimsIncrement = 0,
- bool reset = false)
-{
- return pa.SetNextState(
- pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
-}
-
-#if ENABLE_AVX512_SIMD16
-INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa,
- PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
- PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
- PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
- uint32_t numSimdPrims = 0,
- uint32_t numPrimsIncrement = 0,
- bool reset = false)
-{
- return pa.SetNextState_simd16(pfnPaNextFunc_simd16,
- pfnPaNextFunc,
- pfnPaNextSingleFunc,
- numSimdPrims,
- numPrimsIncrement,
- reset);
-}
-
-#endif
-INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
-{
- return pa.GetSimdVector(index, slot);
-}
-
-#if ENABLE_AVX512_SIMD16
-INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
-{
- return pa.GetSimdVector_simd16(index, slot);
-}
-
-#endif
-// Cut-aware primitive assembler.
-struct PA_STATE_CUT : public PA_STATE
-{
- SIMDMASK* pCutIndices{nullptr}; // cut indices buffer, 1 bit per vertex
- uint32_t numVerts{0}; // number of vertices available in buffer store
- uint32_t numAttribs{0}; // number of attributes
- int32_t numRemainingVerts{0}; // number of verts remaining to be assembled
- uint32_t numVertsToAssemble{0}; // total number of verts to assemble for the draw
-#if ENABLE_AVX512_SIMD16
- OSALIGNSIMD16(uint32_t)
- indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
-#else
- OSALIGNSIMD(uint32_t)
- indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
-#endif
- SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
- uint32_t numPrimsAssembled{0}; // number of primitives that are fully assembled
- uint32_t headVertex{0}; // current unused vertex slot in vertex buffer store
- uint32_t tailVertex{0}; // beginning vertex currently assembling
- uint32_t curVertex{0}; // current unprocessed vertex
- uint32_t startPrimId{0}; // starting prim id
- SIMDSCALARI vPrimId; // vector of prim ID
- bool needOffsets{false}; // need to compute gather offsets for current SIMD
- uint32_t vertsPerPrim{0};
- bool processCutVerts{
- false}; // vertex indices with cuts should be processed as normal, otherwise they
- // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
- // while the GS sends valid verts for every index
-
- simdvector junkVector; // junk simdvector for unimplemented API
-#if ENABLE_AVX512_SIMD16
- simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
-#endif
-
- // Topology state tracking
- uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
- uint32_t curIndex{0};
- bool reverseWinding{false}; // indicates reverse winding for strips
- int32_t adjExtraVert{0}; // extra vert uses for tristrip w/ adj
-
- typedef void (PA_STATE_CUT::*PFN_PA_FUNC)(uint32_t vert, bool finish);
- PFN_PA_FUNC pfnPa{nullptr}; // per-topology function that processes a single vert
-
- PA_STATE_CUT() {}
- PA_STATE_CUT(DRAW_CONTEXT* pDC,
- uint8_t* in_pStream,
- uint32_t in_streamSizeInVerts,
- uint32_t in_vertexStride,
- SIMDMASK* in_pIndices,
- uint32_t in_numVerts,
- uint32_t in_numAttribs,
- PRIMITIVE_TOPOLOGY topo,
- bool in_processCutVerts,
- uint32_t in_numVertsPerPrim) :
- PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
- {
- numVerts = in_streamSizeInVerts;
- numAttribs = in_numAttribs;
- binTopology = topo;
- needOffsets = false;
- processCutVerts = in_processCutVerts;
-
- numVertsToAssemble = numRemainingVerts = in_numVerts;
- numPrimsAssembled = 0;
- headVertex = tailVertex = curVertex = 0;
-
- curIndex = 0;
- pCutIndices = in_pIndices;
- memset(indices, 0, sizeof(indices));
-#if USE_SIMD16_FRONTEND
- vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-#else
- vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-#endif
- reverseWinding = false;
- adjExtraVert = -1;
-
- bool gsEnabled = pDC->pState->state.gsState.gsEnable;
- vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
-
- switch (topo)
- {
- case TOP_TRIANGLE_LIST:
- pfnPa = &PA_STATE_CUT::ProcessVertTriList;
- break;
- case TOP_TRI_LIST_ADJ:
- pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj
- : &PA_STATE_CUT::ProcessVertTriListAdjNoGs;
- break;
- case TOP_TRIANGLE_STRIP:
- pfnPa = &PA_STATE_CUT::ProcessVertTriStrip;
- break;
- case TOP_TRI_STRIP_ADJ:
- if (gsEnabled)
- {
- pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
- }
- else
- {
- pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
- }
- break;
-
- case TOP_POINT_LIST:
- pfnPa = &PA_STATE_CUT::ProcessVertPointList;
- break;
- case TOP_LINE_LIST:
- pfnPa = &PA_STATE_CUT::ProcessVertLineList;
- break;
- case TOP_LINE_LIST_ADJ:
- pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj
- : &PA_STATE_CUT::ProcessVertLineListAdjNoGs;
- break;
- case TOP_LINE_STRIP:
- pfnPa = &PA_STATE_CUT::ProcessVertLineStrip;
- break;
- case TOP_LISTSTRIP_ADJ:
- pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj
- : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs;
- break;
- case TOP_RECT_LIST:
- pfnPa = &PA_STATE_CUT::ProcessVertRectList;
- break;
- default:
- assert(0 && "Unimplemented topology");
- }
- }
-
- SIMDVERTEX& GetNextVsOutput()
- {
- uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
- this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
- this->needOffsets = true;
- SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];
-
- return *(SIMDVERTEX*)pVertex;
- }
-
- SIMDMASK& GetNextVsIndices()
- {
- uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
- SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
- return *pCurCutIndex;
- }
-
- simdvector& GetSimdVector(uint32_t index, uint32_t slot)
- {
- // unused
- SWR_ASSERT(0 && "Not implemented");
- return junkVector;
- }
-
-#if ENABLE_AVX512_SIMD16
- simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
- {
- // unused
- SWR_ASSERT(0 && "Not implemented");
- return junkVector_simd16;
- }
-
-#endif
- bool GetNextStreamOutput()
- {
- this->headVertex += SIMD_WIDTH;
- this->needOffsets = true;
- return HasWork();
- }
-
- SIMDSCALARI GetPrimID(uint32_t startID)
- {
-#if USE_SIMD16_FRONTEND
- return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
-#else
- return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
-#endif
- }
-
- void Reset()
- {
-#if ENABLE_AVX512_SIMD16
- useAlternateOffset = false;
-
-#endif
- this->numRemainingVerts = this->numVertsToAssemble;
- this->numPrimsAssembled = 0;
- this->curIndex = 0;
- this->curVertex = 0;
- this->tailVertex = 0;
- this->headVertex = 0;
- this->reverseWinding = false;
- this->adjExtraVert = -1;
-#if USE_SIMD16_FRONTEND
- this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-#else
- this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-#endif
- }
-
- bool HasWork() { return this->numRemainingVerts > 0 || this->adjExtraVert != -1; }
-
- bool IsVertexStoreFull()
- {
- return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
- }
-
- void RestartTopology()
- {
- this->curIndex = 0;
- this->reverseWinding = false;
- this->adjExtraVert = -1;
- }
-
- bool IsCutIndex(uint32_t vertex)
- {
- uint32_t vertexIndex = vertex / SIMD_WIDTH;
- uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
- return CheckBit(this->pCutIndices[vertexIndex], vertexOffset);
- }
-
- // iterates across the unprocessed verts until we hit the end or we
- // have assembled SIMD prims
- void ProcessVerts()
- {
- while (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0 &&
- this->curVertex != this->headVertex)
- {
- // if cut index, restart topology
- if (IsCutIndex(this->curVertex))
- {
- if (this->processCutVerts)
- {
- (this->*pfnPa)(this->curVertex, false);
- }
- // finish off tri strip w/ adj before restarting topo
- if (this->adjExtraVert != -1)
- {
- (this->*pfnPa)(this->curVertex, true);
- }
- RestartTopology();
- }
- else
- {
- (this->*pfnPa)(this->curVertex, false);
- }
-
- this->curVertex++;
- if (this->curVertex >= this->numVerts)
- {
- this->curVertex = 0;
- }
- this->numRemainingVerts--;
- }
-
- // special case last primitive for tri strip w/ adj
- if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 &&
- this->adjExtraVert != -1)
- {
- (this->*pfnPa)(this->curVertex, true);
- }
- }
-
- void Advance()
- {
- // done with current batch
- // advance tail to the current unsubmitted vertex
- this->tailVertex = this->curVertex;
- this->numPrimsAssembled = 0;
-#if USE_SIMD16_FRONTEND
- this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
-#else
- this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
-#endif
- }
-
- bool NextPrim()
- {
- // if we've assembled enough prims, we can advance to the next set of verts
- if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
- {
- Advance();
- }
- return false;
- }
-
- void ComputeOffsets()
- {
- for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
- {
- uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
- SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];
-
- // step to simdvertex batch
- const uint32_t simdShift = SIMD_WIDTH_LOG2;
-#if USE_SIMD16_FRONTEND
- SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
- this->vOffsets[v] =
- _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
-#else
- SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
- this->vOffsets[v] =
- _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
-#endif
-
- // step to index
- const uint32_t simdMask = SIMD_WIDTH - 1;
-#if USE_SIMD16_FRONTEND
- SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
- this->vOffsets[v] = _simd16_add_epi32(
- this->vOffsets[v],
- _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
-#else
- SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
- this->vOffsets[v] =
- _simd_add_epi32(this->vOffsets[v],
- _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
-#endif
- }
- }
-
- bool Assemble(uint32_t slot, simdvector* verts)
- {
- // process any outstanding verts
- ProcessVerts();
-
- // return false if we don't have enough prims assembled
- if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
- {
- return false;
- }
-
- // cache off gather offsets given the current SIMD set of indices the first time we get an
- // assemble
- if (this->needOffsets)
- {
- ComputeOffsets();
- this->needOffsets = false;
- }
-
- for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
- {
- SIMDSCALARI offsets = this->vOffsets[v];
-
- // step to attribute
-#if USE_SIMD16_FRONTEND
- offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
-#else
- offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
-#endif
-
- float* pBase = (float*)this->pStreamBase;
- for (uint32_t c = 0; c < 4; ++c)
- {
-#if USE_SIMD16_FRONTEND
- simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
-
- // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
- simdscalar t =
- useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
- verts[v].v[c] = t;
-#else
- verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
-#endif
-
- // move base to next component
- pBase += SIMD_WIDTH;
- }
- }
-
- // compute the implied 4th vertex, v3
- if (this->binTopology == TOP_RECT_LIST)
- {
- for (uint32_t c = 0; c < 4; ++c)
- {
- // v1, v3 = v1 + v2 - v0, v2
- // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
- simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
- temp = _simd16_sub_ps(temp, verts[1].v[c]);
- temp = _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
- verts[1].v[c] = _simd16_extract_ps(temp, 0);
- }
- }
-
- return true;
- }
-
-#if ENABLE_AVX512_SIMD16
- bool Assemble(uint32_t slot, simd16vector verts[])
- {
- // process any outstanding verts
- ProcessVerts();
-
- // return false if we don't have enough prims assembled
- if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
- {
- return false;
- }
-
- // cache off gather offsets given the current SIMD set of indices the first time we get an
- // assemble
- if (this->needOffsets)
- {
- ComputeOffsets();
- this->needOffsets = false;
- }
-
- for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
- {
- SIMDSCALARI offsets = this->vOffsets[v];
-
- // step to attribute
-#if USE_SIMD16_FRONTEND
- offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
-#else
- offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
-#endif
-
- float* pBase = (float*)this->pStreamBase;
- for (uint32_t c = 0; c < 4; ++c)
- {
-#if USE_SIMD16_FRONTEND
- verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
-#else
- verts[v].v[c] = _simd16_insert_ps(
- _simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
-#endif
-
- // move base to next component
- pBase += SIMD_WIDTH;
- }
- }
-
- // compute the implied 4th vertex, v3
- if (this->binTopology == TOP_RECT_LIST)
- {
- for (uint32_t c = 0; c < 4; ++c)
- {
- // v1, v3 = v1 + v2 - v0, v2
- // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
- simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
- temp = _simd16_sub_ps(temp, verts[1].v[c]);
- verts[1].v[c] =
- _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
- }
- }
-
- return true;
- }
-
-#endif
- void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
- {
- // move to slot
- for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
- {
- uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
-#if USE_SIMD16_FRONTEND
- uint32_t offset =
- useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
-#else
- uint32_t offset = pOffset[triIndex];
-#endif
- offset += sizeof(SIMDVECTOR) * slot;
- float* pVert = (float*)&tri[v];
- for (uint32_t c = 0; c < 4; ++c)
- {
- float* pComponent = (float*)(this->pStreamBase + offset);
- pVert[c] = *pComponent;
- offset += SIMD_WIDTH * sizeof(float);
- }
- }
-
- // compute the implied 4th vertex, v3
- if ((this->binTopology == TOP_RECT_LIST) && (triIndex % 2 == 1))
- {
- // v1, v3 = v1 + v2 - v0, v2
- // v1 stored in tri[0], v0 stored in tri[1], v2 stored in tri[2]
- float* pVert0 = (float*)&tri[1];
- float* pVert1 = (float*)&tri[0];
- float* pVert2 = (float*)&tri[2];
- float* pVert3 = (float*)&tri[1];
- for (uint32_t c = 0; c < 4; ++c)
- {
- pVert3[c] = pVert1[c] + pVert2[c] - pVert0[c];
- }
- }
- }
-
- uint32_t NumPrims() { return this->numPrimsAssembled; }
-
- // Per-topology functions
- void ProcessVertTriStrip(uint32_t index, bool finish)
- {
- this->vert[this->curIndex] = index;
- this->curIndex++;
- if (this->curIndex == 3)
- {
- // assembled enough verts for prim, add to gather indices
- this->indices[0][this->numPrimsAssembled] = this->vert[0];
- if (reverseWinding)
- {
- this->indices[1][this->numPrimsAssembled] = this->vert[2];
- this->indices[2][this->numPrimsAssembled] = this->vert[1];
- }
- else
- {
- this->indices[1][this->numPrimsAssembled] = this->vert[1];
- this->indices[2][this->numPrimsAssembled] = this->vert[2];
- }
-
- // increment numPrimsAssembled
- this->numPrimsAssembled++;
-
- // set up next prim state
- this->vert[0] = this->vert[1];
- this->vert[1] = this->vert[2];
- this->curIndex = 2;
- this->reverseWinding ^= 1;
- }
- }
-
- template <bool gsEnabled>
- void AssembleTriStripAdj()
- {
- if (!gsEnabled)
- {
- this->vert[1] = this->vert[2];
- this->vert[2] = this->vert[4];
-
- this->indices[0][this->numPrimsAssembled] = this->vert[0];
- this->indices[1][this->numPrimsAssembled] = this->vert[1];
- this->indices[2][this->numPrimsAssembled] = this->vert[2];
-
- this->vert[4] = this->vert[2];
- this->vert[2] = this->vert[1];
- }
- else
- {
- this->indices[0][this->numPrimsAssembled] = this->vert[0];
- this->indices[1][this->numPrimsAssembled] = this->vert[1];
- this->indices[2][this->numPrimsAssembled] = this->vert[2];
- this->indices[3][this->numPrimsAssembled] = this->vert[3];
- this->indices[4][this->numPrimsAssembled] = this->vert[4];
- this->indices[5][this->numPrimsAssembled] = this->vert[5];
- }
- this->numPrimsAssembled++;
- }
-
- template <bool gsEnabled>
- void ProcessVertTriStripAdj(uint32_t index, bool finish)
- {
- // handle last primitive of tristrip
- if (finish && this->adjExtraVert != -1)
- {
- this->vert[3] = this->adjExtraVert;
- AssembleTriStripAdj<gsEnabled>();
- this->adjExtraVert = -1;
- return;
- }
-
- switch (this->curIndex)
- {
- case 0:
- case 1:
- case 2:
- case 4:
- this->vert[this->curIndex] = index;
- this->curIndex++;
- break;
- case 3:
- this->vert[5] = index;
- this->curIndex++;
- break;
- case 5:
- if (this->adjExtraVert == -1)
- {
- this->adjExtraVert = index;
- }
- else
- {
- this->vert[3] = index;
- if (!gsEnabled)
- {
- AssembleTriStripAdj<gsEnabled>();
-
- uint32_t nextTri[6];
- if (this->reverseWinding)
- {
- nextTri[0] = this->vert[4];
- nextTri[1] = this->vert[0];
- nextTri[2] = this->vert[2];
- nextTri[4] = this->vert[3];
- nextTri[5] = this->adjExtraVert;
- }
- else
- {
- nextTri[0] = this->vert[2];
- nextTri[1] = this->adjExtraVert;
- nextTri[2] = this->vert[3];
- nextTri[4] = this->vert[4];
- nextTri[5] = this->vert[0];
- }
- for (uint32_t i = 0; i < 6; ++i)
- {
- this->vert[i] = nextTri[i];
- }
-
- this->adjExtraVert = -1;
- this->reverseWinding ^= 1;
- }
- else
- {
- this->curIndex++;
- }
- }
- break;
- case 6:
- SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
- AssembleTriStripAdj<gsEnabled>();
-
- uint32_t nextTri[6];
- if (this->reverseWinding)
- {
- nextTri[0] = this->vert[4];
- nextTri[1] = this->vert[0];
- nextTri[2] = this->vert[2];
- nextTri[4] = this->vert[3];
- nextTri[5] = this->adjExtraVert;
- }
- else
- {
- nextTri[0] = this->vert[2];
- nextTri[1] = this->adjExtraVert;
- nextTri[2] = this->vert[3];
- nextTri[4] = this->vert[4];
- nextTri[5] = this->vert[0];
- }
- for (uint32_t i = 0; i < 6; ++i)
- {
- this->vert[i] = nextTri[i];
- }
- this->reverseWinding ^= 1;
- this->adjExtraVert = index;
- this->curIndex--;
- break;
- }
- }
-
- void ProcessVertTriList(uint32_t index, bool finish)
- {
- this->vert[this->curIndex] = index;
- this->curIndex++;
- if (this->curIndex == 3)
- {
- // assembled enough verts for prim, add to gather indices
- this->indices[0][this->numPrimsAssembled] = this->vert[0];
- this->indices[1][this->numPrimsAssembled] = this->vert[1];
- this->indices[2][this->numPrimsAssembled] = this->vert[2];
-
- // increment numPrimsAssembled
- this->numPrimsAssembled++;
-
- // set up next prim state
- this->curIndex = 0;
- }
- }
-
- void ProcessVertTriListAdj(uint32_t index, bool finish)
- {
- this->vert[this->curIndex] = index;
- this->curIndex++;
- if (this->curIndex == 6)
- {
- // assembled enough verts for prim, add to gather indices
- this->indices[0][this->numPrimsAssembled] = this->vert[0];
- this->indices[1][this->numPrimsAssembled] = this->vert[1];
- this->indices[2][this->numPrimsAssembled] = this->vert[2];
- this->indices[3][this->numPrimsAssembled] = this->vert[3];
- this->indices[4][this->numPrimsAssembled] = this->vert[4];
- this->indices[5][this->numPrimsAssembled] = this->vert[5];
-
- // increment numPrimsAssembled
- this->numPrimsAssembled++;
-
- // set up next prim state
- this->curIndex = 0;
- }
- }
-
- void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
- {
- this->vert[this->curIndex] = index;
- this->curIndex++;
- if (this->curIndex == 6)
- {
- // assembled enough verts for prim, add to gather indices
- this->indices[0][this->numPrimsAssembled] = this->vert[0];
- this->indices[1][this->numPrimsAssembled] = this->vert[2];
- this->indices[2][this->numPrimsAssembled] = this->vert[4];
-
- // increment numPrimsAssembled
- this->numPrimsAssembled++;
-
- // set up next prim state
- this->curIndex = 0;
- }
- }
-
- void ProcessVertLineList(uint32_t index, bool finish)
- {
- this->vert[this->curIndex] = index;
- this->curIndex++;
- if (this->curIndex == 2)
- {
- this->indices[0][this->numPrimsAssembled] = this->vert[0];
- this->indices[1][this->numPrimsAssembled] = this->vert[1];
-
- this->numPrimsAssembled++;
- this->curIndex = 0;
- }
- }
-
- void ProcessVertLineStrip(uint32_t index, bool finish)
- {
- this->vert[this->curIndex] = index;
- this->curIndex++;
- if (this->curIndex == 2)
- {
- // assembled enough verts for prim, add to gather indices
- this->indices[0][this->numPrimsAssembled] = this->vert[0];
- this->indices[1][this->numPrimsAssembled] = this->vert[1];
-
- // increment numPrimsAssembled
- this->numPrimsAssembled++;
-
- // set up next prim state
- this->vert[0] = this->vert[1];
- this->curIndex = 1;
- }
- }
-
- void ProcessVertLineStripAdj(uint32_t index, bool finish)
- {
- this->vert[this->curIndex] = index;
- this->curIndex++;
- if (this->curIndex == 4)
- {
- // assembled enough verts for prim, add to gather indices
- this->indices[0][this->numPrimsAssembled] = this->vert[0];
- this->indices[1][this->numPrimsAssembled] = this->vert[1];
- this->indices[2][this->numPrimsAssembled] = this->vert[2];
- this->indices[3][this->numPrimsAssembled] = this->vert[3];
-
- // increment numPrimsAssembled
- this->numPrimsAssembled++;
-
- // set up next prim state
- this->vert[0] = this->vert[1];
- this->vert[1] = this->vert[2];
- this->vert[2] = this->vert[3];
- this->curIndex = 3;
- }
- }
-
- void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
- {
- this->vert[this->curIndex] = index;
- this->curIndex++;
- if (this->curIndex == 4)
- {
- // assembled enough verts for prim, add to gather indices
- this->indices[0][this->numPrimsAssembled] = this->vert[1];
- this->indices[1][this->numPrimsAssembled] = this->vert[2];
-
- // increment numPrimsAssembled
- this->numPrimsAssembled++;
-
- // set up next prim state
- this->vert[0] = this->vert[1];
- this->vert[1] = this->vert[2];
- this->vert[2] = this->vert[3];
- this->curIndex = 3;
- }
- }
-
- void ProcessVertLineListAdj(uint32_t index, bool finish)
- {
- this->vert[this->curIndex] = index;
- this->curIndex++;
- if (this->curIndex == 4)
- {
- this->indices[0][this->numPrimsAssembled] = this->vert[0];
- this->indices[1][this->numPrimsAssembled] = this->vert[1];
- this->indices[2][this->numPrimsAssembled] = this->vert[2];
- this->indices[3][this->numPrimsAssembled] = this->vert[3];
-
- this->numPrimsAssembled++;
- this->curIndex = 0;
- }
- }
-
- void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
- {
- this->vert[this->curIndex] = index;
- this->curIndex++;
- if (this->curIndex == 4)
- {
- this->indices[0][this->numPrimsAssembled] = this->vert[1];
- this->indices[1][this->numPrimsAssembled] = this->vert[2];
-
- this->numPrimsAssembled++;
- this->curIndex = 0;
- }
- }
-
- void ProcessVertPointList(uint32_t index, bool finish)
- {
- this->vert[this->curIndex] = index;
- this->curIndex++;
- if (this->curIndex == 1)
- {
- this->indices[0][this->numPrimsAssembled] = this->vert[0];
- this->numPrimsAssembled++;
- this->curIndex = 0;
- }
- }
-
- void ProcessVertRectList(uint32_t index, bool finish)
- {
- this->vert[this->curIndex] = index;
- this->curIndex++;
- if (this->curIndex == 3)
- {
- // assembled enough verts for prim, add to gather indices
- this->indices[0][this->numPrimsAssembled] = this->vert[0];
- this->indices[1][this->numPrimsAssembled] = this->vert[1];
- this->indices[2][this->numPrimsAssembled] = this->vert[2];
-
- // second triangle in the rectangle
- // v1, v3 = v1 + v2 - v0, v2
- this->indices[0][this->numPrimsAssembled + 1] = this->vert[1];
- this->indices[1][this->numPrimsAssembled + 1] = this->vert[0];
- this->indices[2][this->numPrimsAssembled + 1] = this->vert[2];
-
- // increment numPrimsAssembled
- this->numPrimsAssembled += 2;
-
- // set up next prim state
- this->curIndex = 0;
- }
- }
-};
-
-// Primitive Assembly for data output from the DomainShader.
-struct PA_TESS : PA_STATE
-{
- PA_TESS(DRAW_CONTEXT* in_pDC,
- const SIMDSCALAR* in_pVertData,
- uint32_t in_attributeStrideInVectors,
- uint32_t in_vertexStride,
- uint32_t in_numAttributes,
- uint32_t* (&in_ppIndices)[3],
- uint32_t in_numPrims,
- PRIMITIVE_TOPOLOGY in_binTopology,
- uint32_t numVertsPerPrim,
- bool SOA = true) :
-
- PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
- m_pVertexData(in_pVertData), m_attributeStrideInVectors(in_attributeStrideInVectors),
- m_numAttributes(in_numAttributes), m_numPrims(in_numPrims), m_SOA(SOA)
- {
-#if USE_SIMD16_FRONTEND
- m_vPrimId = _simd16_setzero_si();
-#else
- m_vPrimId = _simd_setzero_si();
-#endif
- binTopology = in_binTopology;
- m_ppIndices[0] = in_ppIndices[0];
- m_ppIndices[1] = in_ppIndices[1];
- m_ppIndices[2] = in_ppIndices[2];
-
- switch (binTopology)
- {
- case TOP_POINT_LIST:
- m_numVertsPerPrim = 1;
- break;
-
- case TOP_LINE_LIST:
- m_numVertsPerPrim = 2;
- break;
-
- case TOP_TRIANGLE_LIST:
- m_numVertsPerPrim = 3;
- break;
-
- default:
- SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
- break;
- }
- }
-
- bool HasWork() { return m_numPrims != 0; }
-
- simdvector& GetSimdVector(uint32_t index, uint32_t slot)
- {
- SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
- return junkVector;
- }
-
-#if ENABLE_AVX512_SIMD16
- simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
- {
- SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
- return junkVector_simd16;
- }
-
-#endif
- static SIMDSCALARI GenPrimMask(uint32_t numPrims)
- {
- SWR_ASSERT(numPrims <= SIMD_WIDTH);
-#if USE_SIMD16_FRONTEND
- static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
- return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
-#else
- static const OSALIGNLINE(int32_t)
- maskGen[SIMD_WIDTH * 2] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
-
- return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
-#endif
- }
-
- bool Assemble(uint32_t slot, simdvector verts[])
- {
- SWR_ASSERT(slot < m_numAttributes);
-
- uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
- if (0 == numPrimsToAssemble)
- {
- return false;
- }
-
- SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
-
- const float* pBaseAttrib;
- if (m_SOA)
- {
- pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
- }
- else
- {
- const float* pVertData = (const float*)m_pVertexData;
- pBaseAttrib = pVertData + slot * 4;
- }
-
- for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
- {
-#if USE_SIMD16_FRONTEND
- SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
-#else
- SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
-#endif
-
- const float* pBase = pBaseAttrib;
- for (uint32_t c = 0; c < 4; ++c)
- {
-#if USE_SIMD16_FRONTEND
- simd16scalar temp =
- _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
- pBase,
- indices,
- _simd16_castsi_ps(mask),
- 4 /* gcc doesn't like sizeof(float) */);
-
- verts[i].v[c] =
- useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
-#else
- verts[i].v[c] = _simd_mask_i32gather_ps(_simd_setzero_ps(),
- pBase,
- indices,
- _simd_castsi_ps(mask),
- 4); // gcc doesn't like sizeof(float)
-#endif
- if (m_SOA)
- {
- pBase += m_attributeStrideInVectors * SIMD_WIDTH;
- }
- else
- {
- pBase += sizeof(float);
- }
- }
- }
-
- return true;
- }
-
-#if ENABLE_AVX512_SIMD16
- bool Assemble(uint32_t slot, simd16vector verts[])
- {
- SWR_ASSERT(slot < m_numAttributes);
-
- uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
- if (0 == numPrimsToAssemble)
- {
- return false;
- }
-
- SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
-
- const float* pBaseAttrib;
- if (m_SOA)
- {
- pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
- }
- else
- {
- const float* pVertData = (const float*)m_pVertexData;
- pBaseAttrib = pVertData + slot * 4;
- }
-
- for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
- {
-#if USE_SIMD16_FRONTEND
- SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
- if (!m_SOA)
- {
- indices = _simd16_mullo_epi32(indices, _simd16_set1_epi32(vertexStride / 4));
- }
-#else
- SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
-#endif
-
- const float* pBase = pBaseAttrib;
- for (uint32_t c = 0; c < 4; ++c)
- {
-#if USE_SIMD16_FRONTEND
- verts[i].v[c] = _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
- pBase,
- indices,
- _simd16_castsi_ps(mask),
- 4 /* gcc doesn't like sizeof(float) */);
-#else
- simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(),
- pBase,
- indices,
- _simd_castsi_ps(mask),
- 4 /* gcc doesn't like sizeof(float) */);
- verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
-#endif
- if (m_SOA)
- {
- pBase += m_attributeStrideInVectors * SIMD_WIDTH;
- }
- else
- {
- pBase++;
- }
- }
- }
-
- return true;
- }
-
-#endif
- void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
- {
- SWR_ASSERT(slot < m_numAttributes);
-
-
- SWR_ASSERT(primIndex < PA_TESS::NumPrims());
-
- const float* pVertDataBase;
- if (m_SOA)
- {
- pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
- }
- else
- {
- const float* pVertData = (const float*)m_pVertexData;
- pVertDataBase = pVertData + slot * 4;
- };
- for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
- {
-#if USE_SIMD16_FRONTEND
- uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2]
- : m_ppIndices[i][primIndex];
- if (!m_SOA)
- {
- index *= (vertexStride / 4);
- }
-#else
- uint32_t index = m_ppIndices[i][primIndex];
-#endif
- const float* pVertData = pVertDataBase;
- float* pVert = (float*)&verts[i];
-
- for (uint32_t c = 0; c < 4; ++c)
- {
- pVert[c] = pVertData[index];
- if (m_SOA)
- {
- pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
- }
- else
- {
- pVertData++;
- }
- }
-
- }
- }
-
- bool NextPrim()
- {
- uint32_t numPrims = PA_TESS::NumPrims();
- m_numPrims -= numPrims;
- m_ppIndices[0] += numPrims;
- m_ppIndices[1] += numPrims;
- m_ppIndices[2] += numPrims;
-
- return HasWork();
- }
-
- SIMDVERTEX& GetNextVsOutput()
- {
- SWR_NOT_IMPL;
- return junkVertex;
- }
-
- bool GetNextStreamOutput()
- {
- SWR_NOT_IMPL;
- return false;
- }
-
- SIMDMASK& GetNextVsIndices()
- {
- SWR_NOT_IMPL;
- return junkIndices;
- }
-
- uint32_t NumPrims() { return std::min<uint32_t>(m_numPrims, SIMD_WIDTH); }
-
- void Reset() { SWR_NOT_IMPL; }
-
- SIMDSCALARI GetPrimID(uint32_t startID)
- {
-#if USE_SIMD16_FRONTEND
- return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
-#else
- return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
-#endif
- }
-
-private:
- const SIMDSCALAR* m_pVertexData = nullptr;
- uint32_t m_attributeStrideInVectors = 0;
- uint32_t m_numAttributes = 0;
- uint32_t m_numPrims = 0;
- uint32_t* m_ppIndices[3];
-
- uint32_t m_numVertsPerPrim = 0;
-
- SIMDSCALARI m_vPrimId;
-
- simdvector junkVector; // junk simdvector for unimplemented API
-#if ENABLE_AVX512_SIMD16
- simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
-#endif
- SIMDVERTEX junkVertex; // junk SIMDVERTEX for unimplemented API
- SIMDMASK junkIndices; // temporary index store for unused virtual function
-
- bool m_SOA;
-};
-
-// Primitive Assembler factory class, responsible for creating and initializing the correct
-// assembler based on state.
-template <typename IsIndexedT, typename IsCutIndexEnabledT>
-struct PA_FACTORY
-{
- PA_FACTORY(DRAW_CONTEXT* pDC,
- PRIMITIVE_TOPOLOGY in_topo,
- uint32_t numVerts,
- PA_STATE::SIMDVERTEX* pVertexStore,
- uint32_t vertexStoreSize,
- uint32_t vertexStride,
- uint32_t numVertsPerPrim) :
- topo(in_topo)
- {
-#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
- const API_STATE& state = GetApiState(pDC);
- if ((IsIndexedT::value && IsCutIndexEnabledT::value &&
- (topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || topo == TOP_LINE_LIST ||
- topo == TOP_LINE_STRIP || topo == TOP_TRIANGLE_LIST)) ||
-
- // non-indexed draws with adjacency topologies must use cut-aware PA until we add
- // support for them in the optimized PA
- (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ ||
- topo == TOP_TRI_STRIP_ADJ))
- {
- memset(&indexStore, 0, sizeof(indexStore));
- uint32_t numAttribs = state.feNumAttributes;
-
- new (&this->paCut) PA_STATE_CUT(pDC,
- reinterpret_cast<uint8_t*>(pVertexStore),
- vertexStoreSize * PA_STATE::SIMD_WIDTH,
- vertexStride,
- &this->indexStore[0],
- numVerts,
- numAttribs,
- state.topology,
- false,
- numVertsPerPrim);
- cutPA = true;
- }
- else
-#endif
- {
- uint32_t numPrims = GetNumPrims(in_topo, numVerts);
- new (&this->paOpt) PA_STATE_OPT(pDC,
- numPrims,
- reinterpret_cast<uint8_t*>(pVertexStore),
- vertexStoreSize * PA_STATE::SIMD_WIDTH,
- vertexStride,
- false,
- numVertsPerPrim);
- cutPA = false;
- }
- }
-
- PA_STATE& GetPA()
- {
-#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
- if (cutPA)
- {
- return this->paCut;
- }
- else
-#endif
- {
- return this->paOpt;
- }
- }
-
- PA_STATE_OPT paOpt;
- PA_STATE_CUT paCut;
-
- bool cutPA{false};
-
- PRIMITIVE_TOPOLOGY topo{TOP_UNKNOWN};
-
- PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
-};