summaryrefslogtreecommitdiff
path: root/src/gallium/drivers/swr/rasterizer/core/context.h
blob: 9da7962826ca2fa1b100204f466889d1fbe3a7c2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
/****************************************************************************
* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file context.h
*
* @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
*        The SWR_CONTEXT is our global context and contains the DC ring,
*        thread state, etc.
*
*        The DRAW_CONTEXT contains all state associated with a draw operation.
*
******************************************************************************/
#pragma once

#include <condition_variable>
#include <algorithm>

#include "core/api.h"
#include "core/utils.h"
#include "core/arena.h"
#include "core/fifo.hpp"
#include "core/knobs.h"
#include "common/simdintrin.h"
#include "core/threads.h"
#include "ringbuffer.h"
#include "archrast/archrast.h"

// Fixed-point formats used by the rasterizer.
// Each SCALE is 2^SHIFT and is derived from its SHIFT so the two stay in sync.

// x.8 fixed point: 8 fractional bits, scale factor 256
#define FIXED_POINT_SHIFT 8
#define FIXED_POINT_SCALE (1 << FIXED_POINT_SHIFT)

// x.16 fixed point: 16 fractional bits, scale factor 65536
#define FIXED_POINT16_SHIFT 16
#define FIXED_POINT16_SCALE (1 << FIXED_POINT16_SHIFT)

struct SWR_CONTEXT;
struct DRAW_CONTEXT;

// Per-primitive flags and indices. Embedded by value in both
// SWR_TRIANGLE_DESC and TRIANGLE_WORK_DESC.
struct TRI_FLAGS
{
    // The four bitfields below are sized to occupy exactly 32 bits:
    // 1 + 1 + (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) + reserved == 32.
    uint32_t frontFacing : 1;
    uint32_t yMajor : 1;
    // Width is SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM bits -- presumably one bit per pixel of a SIMD tile.
    uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
    // Padding that rounds the bitfield group up to a full 32-bit word.
    uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
    float pointSize;
    uint32_t primID;
    uint32_t renderTargetArrayIndex;
    uint32_t viewportIndex;
};

//////////////////////////////////////////////////////////////////////////
/// SWR_TRIANGLE_DESC
/// Triangle description handed to the backend (it is the by-reference
/// argument of PFN_BACKEND_FUNC).
/////////////////////////////////////////////////////////////////////////
struct SWR_TRIANGLE_DESC
{
    // Three-element coefficient sets. NOTE(review): these look like the A/B/C
    // plane-equation terms for the I/J barycentrics, depth and 1/w (compare
    // BarycentricCoeffs) -- confirm against the code that fills them in.
    float I[3];
    float J[3];
    float Z[3];
    float OneOverW[3];
    float recipDet;

    // Pointers to per-triangle data owned elsewhere; this struct holds no storage for them.
    float *pRecipW;
    float *pAttribs;
    float *pPerspAttribs;
    float *pSamplePos;
    float *pUserClipBuffer;

    // One 64-bit coverage mask per multisample slot.
    uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
    uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered
    uint64_t anyCoveredSamples;

    TRI_FLAGS triFlags;
};

// Payload of a BE_WORK item that carries a triangle (the `tri` member of BE_WORK::desc).
struct TRIANGLE_WORK_DESC
{
    // Buffers are referenced, not owned; lifetime is managed by whoever builds the work item.
    float *pTriBuffer;
    float *pAttribs;
    float *pUserClipBuffer;
    uint32_t numAttribs;
    TRI_FLAGS triFlags;
};

// Parameters for a clear operation (the `clear` member of FE_WORK::desc and BE_WORK::desc).
struct CLEAR_DESC
{
    SWR_RECT rect;
    uint32_t attachmentMask;        // NOTE(review): presumably one bit per attachment to clear -- confirm
    uint32_t renderTargetArrayIndex;
    float clearRTColor[4];  // RGBA_32F
    float clearDepth;   // [0..1]
    uint8_t clearStencil;
};

// Parameters for a discard/invalidate-tiles operation
// (the `discardInvalidateTiles` member of FE_WORK::desc and BE_WORK::desc).
struct DISCARD_INVALIDATE_TILES_DESC
{
    uint32_t attachmentMask;
    SWR_RECT rect;
    SWR_TILE_STATE newTileState;    // tile state requested for the affected tiles
    bool createNewTiles;
    bool fullTilesOnly;
};

// A callback plus three opaque 64-bit user values.
// Used as the payload of sync work items and as DRAW_CONTEXT::retireCallback.
struct SYNC_DESC
{
    PFN_CALLBACK_FUNC pfnCallbackFunc;
    // NOTE(review): presumably passed through to pfnCallbackFunc unchanged -- confirm at the call site.
    uint64_t userData;
    uint64_t userData2;
    uint64_t userData3;
};

// Parameters for a store-tiles operation (the `storeTiles` member of FE_WORK::desc and BE_WORK::desc).
struct STORE_TILES_DESC
{
    uint32_t attachmentMask;
    SWR_TILE_STATE postStoreTileState;  // tile state to apply once the store has been done
    SWR_RECT rect;
};

// Thread-group counts for a compute dispatch, one per dimension.
struct COMPUTE_DESC
{
    uint32_t threadGroupCountX;
    uint32_t threadGroupCountY;
    uint32_t threadGroupCountZ;
};

// Signature of a backend work function (stored in BE_WORK::pfnWork).
// pDesc is untyped; NOTE(review): presumably it points at the matching member of BE_WORK::desc -- confirm.
typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc);

// Discriminator stored in the `type` member of FE_WORK and BE_WORK.
// Enumerators take the implicit values 0..5 in declaration order.
enum WORK_TYPE
{
    SYNC,
    DRAW,
    CLEAR,
    DISCARDINVALIDATETILES,
    STORETILES,
    SHUTDOWN,
};

// A unit of backend work: a type tag, the function to run, and a payload.
// OSALIGNSIMD is defined elsewhere; NOTE(review): presumably it applies SIMD-width alignment -- confirm.
OSALIGNSIMD(struct) BE_WORK
{
    WORK_TYPE type;
    PFN_WORK_FUNC pfnWork;
    // Payload; the members share storage, so only one is meaningful at a time.
    union
    {
        SYNC_DESC sync;
        TRIANGLE_WORK_DESC tri;
        CLEAR_DESC clear;
        DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
        STORE_TILES_DESC storeTiles;
    } desc;
};

// Parameters of a single draw batch (the `draw` member of FE_WORK::desc).
// The two anonymous unions overlay the indexed and non-indexed variants of a
// field in the same storage; which member is valid depends on the kind of draw.
struct DRAW_WORK
{
    DRAW_CONTEXT*   pDC;
    union
    {
        uint32_t   numIndices;      // DrawIndexed: Number of indices for draw.
        uint32_t   numVerts;        // Draw: Number of verts (triangles, lines, etc)
    };
    union
    {
        const int32_t* pIB;        // DrawIndexed: App supplied indices
        uint32_t   startVertex;    // Draw: Starting vertex in VB to render from.
    };
    int32_t    baseVertex;
    uint32_t   numInstances;        // Number of instances
    uint32_t   startInstance;       // Instance offset
    uint32_t   startPrimID;         // starting primitiveID for this draw batch
    uint32_t   startVertexID;       // starting VertexID for this draw batch (only needed for non-indexed draws)
    SWR_FORMAT type;                // index buffer type
};

// Signature of a frontend work function (stored in FE_WORK::pfnWork).
typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc);

// A unit of frontend work: a type tag, the function to run, and a payload.
// One FE_WORK is embedded in every DRAW_CONTEXT (DRAW_CONTEXT::FeWork).
struct FE_WORK
{
    WORK_TYPE type;
    PFN_FE_WORK_FUNC pfnWork;
    // Payload; the members share storage, so only one is meaningful at a time.
    union
    {
        SYNC_DESC sync;
        DRAW_WORK draw;
        CLEAR_DESC clear;
        DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
        STORE_TILES_DESC storeTiles;
    } desc;
};

// Guardband extents, stored as four parallel arrays with one entry per
// viewport/scissor slot (KNOB_NUM_VIEWPORTS_SCISSORS entries each).
struct GUARDBANDS
{
    float left[KNOB_NUM_VIEWPORTS_SCISSORS];
    float right[KNOB_NUM_VIEWPORTS_SCISSORS];
    float top[KNOB_NUM_VIEWPORTS_SCISSORS];
    float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
};

struct PA_STATE;

// Function signature for pipeline stages that execute after primitive assembly.
// A pointer of this type is stored per draw in DRAW_STATE::pfnProcessPrims.
typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], 
    uint32_t primMask, simdscalari primID, simdscalari viewportIdx);

#if ENABLE_AVX512_SIMD16
// SIMD16 variant of PFN_PROCESS_PRIMS: same parameter list, using the simd16
// vector/scalar types. Only declared when ENABLE_AVX512_SIMD16 is non-zero.
typedef void(*PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
    uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);

#endif
// All pipeline state set through the API for a draw. One API_STATE lives
// inside each DRAW_STATE and is read back through GetApiState().
// OSALIGNLINE is defined elsewhere; NOTE(review): presumably cache-line alignment -- confirm.
OSALIGNLINE(struct) API_STATE
{
    // Vertex Buffers
    SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];

    // Index Buffer
    SWR_INDEX_BUFFER_STATE  indexBuffer;

    // FS - Fetch Shader State
    PFN_FETCH_FUNC          pfnFetchFunc;

    // VS - Vertex Shader State
    PFN_VERTEX_FUNC         pfnVertexFunc;

    // GS - Geometry Shader State
    PFN_GS_FUNC             pfnGsFunc;
    SWR_GS_STATE            gsState;

    // CS - Compute Shader
    PFN_CS_FUNC             pfnCsFunc;
    uint32_t                totalThreadsInGroup;
    uint32_t                totalSpillFillSize;

    // FE - Frontend State
    SWR_FRONTEND_STATE      frontendState;

    // SOS - Streamout Shader State
    PFN_SO_FUNC             pfnSoFunc[MAX_SO_STREAMS];

    // Streamout state
    SWR_STREAMOUT_STATE     soState;
    // `mutable` so these buffers can still be written through a const API_STATE&
    // (GetApiState() only hands out const references).
    mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];

    // Tessellation State
    PFN_HS_FUNC             pfnHsFunc;
    PFN_DS_FUNC             pfnDsFunc;
    SWR_TS_STATE            tsState;

    // Number of attributes used by the frontend (vs, so, gs)
    uint32_t                feNumAttributes;

    PRIMITIVE_TOPOLOGY      topology;
    bool                    forceFront;

    // RS - Rasterizer State
    SWR_RASTSTATE           rastState;
    // floating point multisample offsets
    // (two floats per sample; NOTE(review): presumably x/y pairs -- confirm ordering)
    float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];

    GUARDBANDS               gbState;

    // Per-viewport state, one entry per viewport/scissor slot.
    SWR_VIEWPORT            vp[KNOB_NUM_VIEWPORTS_SCISSORS];
    SWR_VIEWPORT_MATRICES   vpMatrices;

    SWR_RECT                scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
    SWR_RECT                scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
    bool                    scissorsTileAligned;

    // Backend state
    SWR_BACKEND_STATE       backendState;

    SWR_DEPTH_BOUNDS_STATE  depthBoundsState;

    // PS - Pixel shader state
    SWR_PS_STATE            psState;

    SWR_DEPTH_STENCIL_STATE depthStencilState;

    // OM - Output Merger State
    SWR_BLEND_STATE         blendState;
    PFN_BLEND_JIT_FUNC      pfnBlendFunc[SWR_NUM_RENDERTARGETS];

    // Stats and hottile enables, packed into bitfields (12 bits used).
    // enableStatsBE / enableStatsFE gate UPDATE_STAT_BE / UPDATE_STAT_FE.
    struct
    {
        uint32_t enableStatsFE : 1;             // Enable frontend pipeline stats
        uint32_t enableStatsBE : 1;             // Enable backend pipeline stats
        uint32_t colorHottileEnable : 8;        // Bitmask of enabled color hottiles
        uint32_t depthHottileEnable: 1;         // Enable depth buffer hottile
        uint32_t stencilHottileEnable : 1;      // Enable stencil buffer hottile
    };

    PFN_QUANTIZE_DEPTH      pfnQuantizeDepth;
};

class MacroTileMgr;
class DispatchQueue;

// Raw byte pointers to the output surfaces handed to a backend function
// (see PFN_BACKEND_FUNC): one per render target, plus depth and stencil.
struct RenderOutputBuffers
{
    uint8_t* pColor[SWR_NUM_RENDERTARGETS];
    uint8_t* pDepth;
    uint8_t* pStencil;
};

// Plane equation A/B/C coeffs used to evaluate I/J barycentric coords.
// Every member is a SIMD-wide value (simdscalar). Passed by const reference
// to the PFN_CALC_*_BARYCENTRICS functions.
struct BarycentricCoeffs
{
    // I barycentric: a/b/c terms
    simdscalar vIa;
    simdscalar vIb;
    simdscalar vIc;

    // J barycentric: a/b/c terms
    simdscalar vJa;
    simdscalar vJb;
    simdscalar vJc;

    // Z: a/b/c terms
    simdscalar vZa;
    simdscalar vZb;
    simdscalar vZc;

    simdscalar vRecipDet;

    // 1/w: A/B/C terms
    simdscalar vAOneOverW;
    simdscalar vBOneOverW;
    simdscalar vCOneOverW;
};

// pipeline function pointer types
// Parameter names are omitted in these typedefs; see the implementations for
// the meaning of the positional uint32_t / simdscalar arguments.

// Backend entry point; a pointer of this type is stored in BACKEND_FUNCS::pfnBackend.
typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
// Output merger: takes the PS context, the array of render-target pointers,
// the blend state and the per-render-target blend functions.
typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
                                 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar);
// Barycentric evaluation variants (pixel / sample / centroid); each reads
// BarycentricCoeffs and takes the PS context by non-const reference.
typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
                                              const simdscalar, const simdscalar);

// Backend function pointers selected for a draw (held in DRAW_STATE::backendFuncs).
struct BACKEND_FUNCS
{
    PFN_BACKEND_FUNC pfnBackend;
};

// Draw State
//    State shared by one or more draw contexts: the API state, the driver's
//    private state pointer and the pipeline function pointers chosen for the draw.
struct DRAW_STATE
{
    API_STATE state;

    void* pPrivateState;  // It is required that the driver sets this up for each draw.

    // pipeline function pointers, filled in by API thread when setting up the draw
    BACKEND_FUNCS backendFuncs;
    PFN_PROCESS_PRIMS pfnProcessPrims;

    CachingArena* pArena;     // This should only be used by API thread.
};

struct DRAW_DYNAMIC_STATE
{
    void Reset(uint32_t numThreads)
    {
        SWR_STATS* pSavePtr = pStats;
        memset(this, 0, sizeof(*this));
        pStats = pSavePtr;
        memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
    }
    ///@todo Currently assumes only a single FE can do stream output for a draw.
    uint32_t SoWriteOffset[4];
    bool     SoWriteOffsetDirty[4];

    SWR_STATS_FE statsFE;   // Only one FE thread per DC.
    SWR_STATS*   pStats;
};

// Draw Context
//    The api thread sets up a draw context that exists for the life of the draw.
//    This draw context maintains all of the state needed for the draw operation.
//    The layout is size-checked by the static_assert that follows the struct,
//    so adding or reordering members may require re-padding.
struct DRAW_CONTEXT
{
    SWR_CONTEXT*    pContext;
    // Shares storage: which member is valid depends on isCompute.
    union
    {
        MacroTileMgr*   pTileMgr;
        DispatchQueue*  pDispatch;      // Queue for thread groups. (isCompute)
    };
    DRAW_STATE*     pState;             // Read-only state. Core should not update this outside of API thread.
    DRAW_DYNAMIC_STATE dynState;

    CachingArena*   pArena;

    uint32_t        drawId;
    bool            dependentFE;    // Frontend work is dependent on all previous FE
    bool            dependent;      // Backend work is dependent on all previous BE
    bool            isCompute;      // Is this DC a compute context?
    bool            cleanupState;   // True if this is the last draw using an entry in the state ring.
    volatile bool   doneFE;         // Is FE work done for this draw?

    FE_WORK         FeWork;

    volatile OSALIGNLINE(uint32_t)   FeLock;
    volatile int32_t    threadsDone;

    SYNC_DESC       retireCallback; // Call this func when this DC is retired.


};

// sizeof(DRAW_CONTEXT) must be a multiple of 64 bytes.
static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");

/// Returns the (read-only) API state attached to a draw context.
///
/// @param pDC  draw context; both it and its pState are asserted non-null.
/// @return     reference to pDC->pState->state
INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
{
    SWR_ASSERT(pDC != nullptr);

    const DRAW_STATE* const pDrawState = pDC->pState;
    SWR_ASSERT(pDrawState != nullptr);

    return pDrawState->state;
}

/// Returns the driver's private state pointer attached to a draw context.
///
/// @param pDC  draw context; both it and its pState are asserted non-null.
/// @return     pDC->pState->pPrivateState (may itself be whatever the driver stored)
INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
{
    SWR_ASSERT(pDC != nullptr);

    const DRAW_STATE* const pDrawState = pDC->pState;
    SWR_ASSERT(pDrawState != nullptr);

    return pDrawState->pPrivateState;
}

class HotTileMgr;

// Global rasterizer context: owns the draw-context and draw-state rings,
// the thread pool, the driver callbacks and global statistics.
struct SWR_CONTEXT
{
    // Draw Context Ring
    //  Each draw needs its own state in order to support multiple draws in flight across multiple threads.
    //  We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
    //  of draws that can be in flight at any given time.
    //
    //  Description:
    //  1. State - When an application first sets state we'll request a new draw context to use.
    //     a. If there are no available draw contexts then we'll have to wait until one becomes free.
    //     b. If one is available then set pCurDrawContext to point to it and mark it in use.
    //     c. All state calls set state on pCurDrawContext.
    //  2. Draw - Creates and submits a work item that is associated with the current draw context.
    //     a. Set pPrevDrawContext = pCurDrawContext
    //     b. Set pCurDrawContext to NULL.
    //  3. State - When an application sets state after a draw
    //     a. Same as step 1.
    //     b. State is copied from prev draw context to current.
    RingBuffer<DRAW_CONTEXT> dcRing;

    DRAW_CONTEXT *pCurDrawContext;    // This points to DC entry in ring for an unsubmitted draw.
    DRAW_CONTEXT *pPrevDrawContext;   // This points to DC entry for the previous context submitted that we can copy state from.

    MacroTileMgr* pMacroTileManagerArray;
    DispatchQueue* pDispatchQueueArray;

    // Draw State Ring
    //  When draws are very large (lots of primitives) then the API thread will break them up.
    //  These split draws all have identical state. So instead of storing the state directly
    //  in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
    //  to reference a single entry in the DS ring.
    RingBuffer<DRAW_STATE> dsRing;

    uint32_t curStateId;               // Current index to the next available entry in the DS ring.

    uint32_t NumWorkerThreads;
    uint32_t NumFEThreads;
    uint32_t NumBEThreads;

    THREAD_POOL threadPool; // Thread pool associated with this context
    SWR_THREADING_INFO threadInfo;

    // NOTE(review): presumably used together to put idle workers to sleep until work arrives -- confirm in threads code.
    std::condition_variable FifosNotEmpty;
    std::mutex WaitLock;

    uint32_t privateStateSize;

    HotTileMgr *pHotTileMgr;

    // Callback functions, passed in at create context time
    PFN_LOAD_TILE               pfnLoadTile;
    PFN_STORE_TILE              pfnStoreTile;
    PFN_CLEAR_TILE              pfnClearTile;
    PFN_UPDATE_SO_WRITE_OFFSET  pfnUpdateSoWriteOffset;
    PFN_UPDATE_STATS            pfnUpdateStats;
    PFN_UPDATE_STATS_FE         pfnUpdateStatsFE;


    // Global Stats
    SWR_STATS* pStats;

    // Scratch space for workers.
    uint8_t** ppScratch;

    volatile int32_t  drawsOutstandingFE;

    CachingAllocator cachingArenaAllocator;
    uint32_t frameCount;

    uint32_t lastFrameChecked;
    uint64_t lastDrawChecked;
    TileSet singleThreadLockedTiles;

    // ArchRast thread contexts.
    // Indexed by workerId for worker threads and by NumWorkerThreads for the
    // API thread (see AR_WORKER_CTX / AR_API_CTX).
    HANDLE* pArContext;
};

// Pipeline statistics helpers.
//
// UPDATE_STAT_BE(name, count): if backend stats are enabled for the draw, adds
//   `count` to the per-worker counter pDC->dynState.pStats[workerId].name.
// UPDATE_STAT_FE(name, count): if frontend stats are enabled for the draw, adds
//   `count` to the single frontend counter pDC->dynState.statsFE.name.
//
// Both require a `pDC` in scope at the expansion site; UPDATE_STAT_BE also
// requires `workerId`. `count` is evaluated only when the stat is enabled.
//
// Each macro is wrapped in do { } while (0) so it behaves as one statement:
// it must be followed by ';' and is safe as the body of an unbraced if/else.
#define UPDATE_STAT_BE(name, count) \
    do { if (GetApiState(pDC).enableStatsBE) { pDC->dynState.pStats[workerId].name += (count); } } while (0)
#define UPDATE_STAT_FE(name, count) \
    do { if (GetApiState(pDC).enableStatsFE) { pDC->dynState.statsFE.name += (count); } } while (0)

// ArchRast instrumentation framework
//
// Context selectors. Both expand to an expression that needs a variable named
// `pContext` at the expansion site; AR_WORKER_CTX additionally needs `workerId`.
// The API thread uses the pArContext slot at index NumWorkerThreads.
#define AR_WORKER_CTX  pContext->pArContext[workerId]
#define AR_API_CTX     pContext->pArContext[pContext->NumWorkerThreads]

// Implementation of the _AR_* primitives, chosen at compile time:
//   KNOB_ENABLE_AR defined    - forward to ArchRast::Dispatch / ArchRast::FlushDraw.
//   KNOB_ENABLE_RDTSC defined - BEGIN/END map to RDTSC_START / RDTSC_STOP;
//                               EVENT and FLUSH expand to nothing.
//   neither                   - BEGIN only evaluates `(void)ctx`; END, EVENT and
//                               FLUSH expand to nothing.
// The third parameter of _AR_END is named `count` in the ArchRast branch and
// `id` in the others; it is the same positional argument.
#ifdef KNOB_ENABLE_AR
    #define _AR_BEGIN(ctx, type, id)    ArchRast::Dispatch(ctx, ArchRast::Start(ArchRast::type, id))
    #define _AR_END(ctx, type, count)   ArchRast::Dispatch(ctx, ArchRast::End(ArchRast::type, count))
    #define _AR_EVENT(ctx, event)       ArchRast::Dispatch(ctx, ArchRast::event)
    #define _AR_FLUSH(ctx, id)          ArchRast::FlushDraw(ctx, id)
#else
    #ifdef KNOB_ENABLE_RDTSC
        // NOTE: this _AR_BEGIN expands to two statements, so it must not be used
        // as the sole body of an unbraced if/else/loop.
        #define _AR_BEGIN(ctx, type, id) (void)ctx; RDTSC_START(type)
        #define _AR_END(ctx, type, id)   RDTSC_STOP(type, id, 0)
    #else
        #define _AR_BEGIN(ctx, type, id) (void)ctx
        #define _AR_END(ctx, type, id)
    #endif
    #define _AR_EVENT(ctx, event)
    #define _AR_FLUSH(ctx, id)
#endif

// Use these macros for api thread.
// (There is no AR_API_FLUSH; only the worker set defines a flush macro.)
#define AR_API_BEGIN(type, id) _AR_BEGIN(AR_API_CTX, type, id)
#define AR_API_END(type, count) _AR_END(AR_API_CTX, type, count)
#define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)

// Use these macros for worker threads.
#define AR_BEGIN(type, id) _AR_BEGIN(AR_WORKER_CTX, type, id)
#define AR_END(type, count) _AR_END(AR_WORKER_CTX, type, count)
#define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
#define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id)