src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377

/****************************************************************************
* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once

#if !defined(__cplusplus)
#error C++ compilation required
#endif

#include <immintrin.h>
#include <inttypes.h>
#include <stdint.h>

#define SIMD_ARCH_AVX       0
#define SIMD_ARCH_AVX2      1
#define SIMD_ARCH_AVX512    2

#if !defined(SIMD_ARCH)
#define SIMD_ARCH SIMD_ARCH_AVX
#endif

#if defined(_MSC_VER)
#define SIMDCALL __vectorcall
#define SIMDINLINE __forceinline
#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
#else
#define SIMDCALL
#define SIMDINLINE inline
#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
#endif

// For documentation, please see the following include...
// #include "simdlib_interface.hpp"

namespace SIMDImpl
{
    enum class CompareType
    {
        EQ_OQ      = 0x00, // Equal (ordered, nonsignaling)
        LT_OS      = 0x01, // Less-than (ordered, signaling)
        LE_OS      = 0x02, // Less-than-or-equal (ordered, signaling)
        UNORD_Q    = 0x03, // Unordered (nonsignaling)
        NEQ_UQ     = 0x04, // Not-equal (unordered, nonsignaling)
        NLT_US     = 0x05, // Not-less-than (unordered, signaling)
        NLE_US     = 0x06, // Not-less-than-or-equal (unordered, signaling)
        ORD_Q      = 0x07, // Ordered (nonsignaling)
        EQ_UQ      = 0x08, // Equal (unordered, non-signaling)
        NGE_US     = 0x09, // Not-greater-than-or-equal (unordered, signaling)
        NGT_US     = 0x0A, // Not-greater-than (unordered, signaling)
        FALSE_OQ   = 0x0B, // False (ordered, nonsignaling)
        NEQ_OQ     = 0x0C, // Not-equal (ordered, non-signaling)
        GE_OS      = 0x0D, // Greater-than-or-equal (ordered, signaling)
        GT_OS      = 0x0E, // Greater-than (ordered, signaling)
        TRUE_UQ    = 0x0F, // True (unordered, non-signaling)
        EQ_OS      = 0x10, // Equal (ordered, signaling)
        LT_OQ      = 0x11, // Less-than (ordered, nonsignaling)
        LE_OQ      = 0x12, // Less-than-or-equal (ordered, nonsignaling)
        UNORD_S    = 0x13, // Unordered (signaling)
        NEQ_US     = 0x14, // Not-equal (unordered, signaling)
        NLT_UQ     = 0x15, // Not-less-than (unordered, nonsignaling)
        NLE_UQ     = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
        ORD_S      = 0x17, // Ordered (signaling)
        EQ_US      = 0x18, // Equal (unordered, signaling)
        NGE_UQ     = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
        NGT_UQ     = 0x1A, // Not-greater-than (unordered, nonsignaling)
        FALSE_OS   = 0x1B, // False (ordered, signaling)
        NEQ_OS     = 0x1C, // Not-equal (ordered, signaling)
        GE_OQ      = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
        GT_OQ      = 0x1E, // Greater-than (ordered, nonsignaling)
        TRUE_US    = 0x1F, // True (unordered, signaling)
    };

#if SIMD_ARCH >= SIMD_ARCH_AVX512
    enum class CompareTypeInt
    {
        EQ  = _MM_CMPINT_EQ,    // Equal
        LT  = _MM_CMPINT_LT,    // Less than
        LE  = _MM_CMPINT_LE,    // Less than or Equal
        NE  = _MM_CMPINT_NE,    // Not Equal
        GE  = _MM_CMPINT_GE,    // Greater than or Equal
        GT  = _MM_CMPINT_GT,    // Greater than
    };
#endif // SIMD_ARCH >= SIMD_ARCH_AVX512

    enum class ScaleFactor
    {
        SF_1 = 1,   // No scaling
        SF_2 = 2,   // Scale offset by 2
        SF_4 = 4,   // Scale offset by 4
        SF_8 = 8,   // Scale offset by 8
    };

    enum class RoundMode
    {
        TO_NEAREST_INT  = 0x00, // Round to nearest integer == TRUNCATE(value + 0.5)
        TO_NEG_INF      = 0x01, // Round to negative infinity
        TO_POS_INF      = 0x02, // Round to positive infinity
        TO_ZERO         = 0x03, // Round to 0 a.k.a. truncate
        CUR_DIRECTION   = 0x04, // Round in direction set in MXCSR register
        
        RAISE_EXC       = 0x00, // Raise exception on overflow
        NO_EXC          = 0x08, // Suppress exceptions
        
        NINT            = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(RAISE_EXC),
        NINT_NOEXC      = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(NO_EXC),
        FLOOR           = static_cast<int>(TO_NEG_INF)      | static_cast<int>(RAISE_EXC),
        FLOOR_NOEXC     = static_cast<int>(TO_NEG_INF)      | static_cast<int>(NO_EXC),
        CEIL            = static_cast<int>(TO_POS_INF)      | static_cast<int>(RAISE_EXC),
        CEIL_NOEXC      = static_cast<int>(TO_POS_INF)      | static_cast<int>(NO_EXC),
        TRUNC           = static_cast<int>(TO_ZERO)         | static_cast<int>(RAISE_EXC),
        TRUNC_NOEXC     = static_cast<int>(TO_ZERO)         | static_cast<int>(NO_EXC),
        RINT            = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(RAISE_EXC),
        NEARBYINT       = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(NO_EXC),
    };

    struct Traits
    {
        using CompareType = SIMDImpl::CompareType;
        using ScaleFactor = SIMDImpl::ScaleFactor;
        using RoundMode   = SIMDImpl::RoundMode;
    };

    // Attribute, 4-dimensional attribute in SIMD SOA layout
    template<typename Float, typename Integer, typename Double>
    union Vec4
    {
        Float   v[4];
        Integer vi[4];
        Double  vd[4];
        struct
        {
            Float  x;
            Float  y;
            Float  z;
            Float  w;
        };
        SIMDINLINE Float& operator[] (const int i) { return v[i]; }
        SIMDINLINE Float const & operator[] (const int i) const { return v[i]; }
        SIMDINLINE Vec4& operator=(Vec4 const & in)
        {
            v[0] = in.v[0];
            v[1] = in.v[1];
            v[2] = in.v[2];
            v[3] = in.v[3];
            return *this;
        }
    };

    namespace SIMD128Impl
    {
        union Float
        {
            SIMDINLINE Float() = default;
            SIMDINLINE Float(__m128 in) : v(in) {}
            SIMDINLINE Float& operator=(__m128 in) { v = in; return *this; }
            SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
            SIMDINLINE operator __m128() const { return v; }

            SIMDALIGN(__m128, 16) v;
        };

        union Integer
        {
            SIMDINLINE Integer() = default;
            SIMDINLINE Integer(__m128i in) : v(in) {}
            SIMDINLINE Integer& operator=(__m128i in) { v = in; return *this; }
            SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
            SIMDINLINE operator __m128i() const { return v; }
            SIMDALIGN(__m128i, 16) v;
        };

        union Double
        {
            SIMDINLINE Double() = default;
            SIMDINLINE Double(__m128d in) : v(in) {}
            SIMDINLINE Double& operator=(__m128d in) { v = in; return *this; }
            SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
            SIMDINLINE operator __m128d() const { return v; }
            SIMDALIGN(__m128d, 16) v;
        };

        using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
        using Mask = uint8_t;

        static const uint32_t SIMD_WIDTH = 4;
    } // ns SIMD128Impl

    namespace SIMD256Impl
    {
        union Float
        {
            SIMDINLINE Float() = default;
            SIMDINLINE Float(__m256 in) : v(in) {}
            SIMDINLINE Float(SIMD128Impl::Float in_lo, SIMD128Impl::Float in_hi = _mm_setzero_ps())
            {
                v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
            }
            SIMDINLINE Float& operator=(__m256 in) { v = in; return *this; }
            SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
            SIMDINLINE operator __m256() const { return v; }

            SIMDALIGN(__m256, 32) v;
            SIMD128Impl::Float v4[2];
        };

        union Integer
        {
            SIMDINLINE Integer() = default;
            SIMDINLINE Integer(__m256i in) : v(in) {}
            SIMDINLINE Integer(SIMD128Impl::Integer in_lo, SIMD128Impl::Integer in_hi = _mm_setzero_si128())
            {
                v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
            }
            SIMDINLINE Integer& operator=(__m256i in) { v = in; return *this; }
            SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
            SIMDINLINE operator __m256i() const { return v; }

            SIMDALIGN(__m256i, 32) v;
            SIMD128Impl::Integer v4[2];
        };

        union Double
        {
            SIMDINLINE Double() = default;
            SIMDINLINE Double(__m256d in) : v(in) {}
            SIMDINLINE Double(SIMD128Impl::Double in_lo, SIMD128Impl::Double in_hi = _mm_setzero_pd())
            {
                v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
            }
            SIMDINLINE Double& operator=(__m256d in) { v = in; return *this; }
            SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
            SIMDINLINE operator __m256d() const { return v; }

            SIMDALIGN(__m256d, 32) v;
            SIMD128Impl::Double v4[2];
        };

        using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
        using Mask = uint8_t;

        static const uint32_t SIMD_WIDTH = 8;
    } // ns SIMD256Impl

    namespace SIMD512Impl
    {
#if !defined(_MM_K0_REG)
        // Define AVX512 types if not included via immintrin.h.
        // All data members of these types are ONLY to viewed
        // in a debugger.  Do NOT access them via code!
        union __m512
        {
        private:
            float m512_f32[16];
        };
        struct __m512d
        {
        private:
            double m512d_f64[8];
        };

        union __m512i
        {
        private:
            int8_t              m512i_i8[64];
            int16_t             m512i_i16[32];
            int32_t             m512i_i32[16];
            int64_t             m512i_i64[8];
            uint8_t             m512i_u8[64];
            uint16_t            m512i_u16[32];
            uint32_t            m512i_u32[16];
            uint64_t            m512i_u64[8];
        };

        using __mmask16 = uint16_t;
#endif

#if SIMD_ARCH >= SIMD_ARCH_AVX512
#define SIMD_ALIGNMENT_BYTES 64
#else
#define SIMD_ALIGNMENT_BYTES 32
#endif

        union Float
        {
            SIMDINLINE Float() = default;
            SIMDINLINE Float(__m512 in) : v(in) {}
            SIMDINLINE Float(SIMD256Impl::Float in_lo, SIMD256Impl::Float in_hi = _mm256_setzero_ps()) { v8[0] = in_lo; v8[1] = in_hi; }
            SIMDINLINE Float& operator=(__m512 in) { v = in; return *this; }
            SIMDINLINE Float& operator=(Float const & in)
            {
#if SIMD_ARCH >= SIMD_ARCH_AVX512
                v = in.v;
#else
                v8[0] = in.v8[0];
                v8[1] = in.v8[1];
#endif
                return *this;
            }
            SIMDINLINE operator __m512() const { return v; }

            SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
            SIMD256Impl::Float v8[2];
        };

        union Integer
        {
            SIMDINLINE Integer() = default;
            SIMDINLINE Integer(__m512i in) : v(in) {}
            SIMDINLINE Integer(SIMD256Impl::Integer in_lo, SIMD256Impl::Integer in_hi = _mm256_setzero_si256()) { v8[0] = in_lo; v8[1] = in_hi; }
            SIMDINLINE Integer& operator=(__m512i in) { v = in; return *this; }
            SIMDINLINE Integer& operator=(Integer const & in)
            {
#if SIMD_ARCH >= SIMD_ARCH_AVX512
                v = in.v;
#else
                v8[0] = in.v8[0];
                v8[1] = in.v8[1];
#endif
                return *this;
            }

            SIMDINLINE operator __m512i() const { return v; }

            SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
            SIMD256Impl::Integer v8[2];
        };

        union Double
        {
            SIMDINLINE Double() = default;
            SIMDINLINE Double(__m512d in) : v(in) {}
            SIMDINLINE Double(SIMD256Impl::Double in_lo, SIMD256Impl::Double in_hi = _mm256_setzero_pd()) { v8[0] = in_lo; v8[1] = in_hi; }
            SIMDINLINE Double& operator=(__m512d in) { v = in; return *this; }
            SIMDINLINE Double& operator=(Double const & in)
            {
#if SIMD_ARCH >= SIMD_ARCH_AVX512
                v = in.v;
#else
                v8[0] = in.v8[0];
                v8[1] = in.v8[1];
#endif
                return *this;
            }

            SIMDINLINE operator __m512d() const { return v; }

            SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
            SIMD256Impl::Double v8[2];
        };

        typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);
        using Mask = __mmask16;

        static const uint32_t SIMD_WIDTH = 16;

#undef SIMD_ALIGNMENT_BYTES
    } // ns SIMD512Impl
} // ns SIMDImpl