Diffstat (limited to 'src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl')
-rw-r--r--  src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl  842
1 file changed, 842 insertions(+), 0 deletions(-)
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
new file mode 100644
index 00000000000..a45429f4b6b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
@@ -0,0 +1,842 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD16 AVX (1) implementation
+//============================================================================
+
+static const int TARGET_SIMD_WIDTH = 8;
+using SIMD128T = SIMD128Impl::AVXImpl;
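+
+// Emulation strategy: each 512-bit Float/Integer/Double carries its two
+// 256-bit halves in v8[0] and v8[1]; the wrappers below simply forward each
+// operation to SIMD256T once per half.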
+
+#define SIMD_WRAPPER_1(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a) \
+ {\
+ return Float\
+ {\
+ SIMD256T::op(a.v8[0]),\
+ SIMD256T::op(a.v8[1]),\
+ };\
+ }
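+
+// For reference, SIMD_WRAPPER_1(rcp_ps) expands (modulo whitespace) to:
+//
+//     static SIMDINLINE Float SIMDCALL rcp_ps(Float a)
+//     {
+//         return Float{ SIMD256T::rcp_ps(a.v8[0]), SIMD256T::rcp_ps(a.v8[1]) };
+//     }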
+
+#define SIMD_WRAPPER_2(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return Float\
+ {\
+ SIMD256T::op(a.v8[0], b.v8[0]),\
+ SIMD256T::op(a.v8[1], b.v8[1]),\
+ };\
+ }
+
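+// Note for the *_2I wrappers: the 16-lane immediate (one control bit per
+// 32-bit lane for blend-style ops) is split so the low half consumes
+// ImmT[7:0] and the high half ImmT[15:8]; TARGET_SIMD_WIDTH (= 8) doubles
+// as the shift count here.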
+#define SIMD_WRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return Float\
+ {\
+ SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
+ SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
+ };\
+ }
+
+#define SIMD_WRAPPER_2I_1(op) \
+ template<int ImmT>\
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return Float\
+ {\
+ SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
+ SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
+ };\
+ }
+
+#define SIMD_WRAPPER_3(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+ {\
+ return Float\
+ {\
+ SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
+ SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
+ };\
+ }
+
+#define SIMD_IWRAPPER_1(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return Integer\
+ {\
+ SIMD256T::op(a.v8[0]),\
+ SIMD256T::op(a.v8[1]),\
+ };\
+ }
+
+#define SIMD_IWRAPPER_2(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return Integer\
+ {\
+ SIMD256T::op(a.v8[0], b.v8[0]),\
+ SIMD256T::op(a.v8[1], b.v8[1]),\
+ };\
+ }
+
+#define SIMD_IWRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return Integer\
+ {\
+ SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
+ SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
+ };\
+ }
+
+#define SIMD_IWRAPPER_2I_1(op) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return Integer\
+ {\
+ SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
+ SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
+ };\
+ }
+
+#define SIMD_IWRAPPER_2I_2(op) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return Integer\
+ {\
+ SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\
+ SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\
+ };\
+ }
+
+#define SIMD_IWRAPPER_3(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c) \
+ {\
+ return Integer\
+ {\
+ SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
+ SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
+ };\
+ }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps); // return a + b
+SIMD_WRAPPER_2(div_ps); // return a / b
+SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
+SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps); // return a * b
+SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps); // return a - b
+
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+ return Float
+ {
+ SIMD256T::template round_ps<RMT>(a.v8[0]),
+ SIMD256T::template round_ps<RMT>(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
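+//
+// Worked example (illustrative values): if a lane of a holds 0x00010000 and
+// the same lane of b also holds 0x00010000, the full product is
+// 0x100000000; mullo keeps the low 32 bits, so the result lane is 0.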
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
+SIMD_IWRAPPER_2(and_si); // return a & b (int)
+SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
+SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int)
+SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
+SIMD_IWRAPPER_2(or_si); // return a | b (int)
+SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
+SIMD_IWRAPPER_2(xor_si); // return a ^ b (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL slli_epi32(Integer a) // return a << ImmT
+{
+ return Integer
+ {
+ SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
+ SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
+ };
+}
+
+SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL srai_epi32(Integer a) // return a >> ImmT (int32)
+{
+ return Integer
+ {
+ SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
+ SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
+ };
+}
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL srli_epi32(Integer a) // return a >> ImmT (uint32)
+{
+ return Integer
+ {
+ SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
+ SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
+ };
+}
+
+template<int ImmT> // for each 128-bit lane:
+static SIMDINLINE Integer SIMDCALL srli_si(Integer a) // return a >> (ImmT*8) (uint)
+{
+ return Integer
+ {
+ SIMD256T::template srli_si<ImmT>(a.v8[0]),
+ SIMD256T::template srli_si<ImmT>(a.v8[1]),
+ };
+}
+template<int ImmT>
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a) // same as srli_si, but with Float cast to int
+{
+ return Float
+ {
+ SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
+ SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
+ };
+}
+
+SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
+{
+ return Float
+ {
+ SIMD256T::castpd_ps(a.v8[0]),
+ SIMD256T::castpd_ps(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
+{
+ return Integer
+ {
+ SIMD256T::castps_si(a.v8[0]),
+ SIMD256T::castps_si(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
+{
+ return Double
+ {
+ SIMD256T::castsi_pd(a.v8[0]),
+ SIMD256T::castsi_pd(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
+{
+ return Double
+ {
+ SIMD256T::castps_pd(a.v8[0]),
+ SIMD256T::castps_pd(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
+{
+ return Float
+ {
+ SIMD256T::castsi_ps(a.v8[0]),
+ SIMD256T::castsi_ps(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
+{
+ return Float
+ {
+ SIMD256T::cvtepi32_ps(a.v8[0]),
+ SIMD256T::cvtepi32_ps(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer a) // return (int16)a (uint8 --> int16)
+{
+ return Integer
+ {
+ SIMD256T::cvtepu8_epi16(a.v4[0]),
+ SIMD256T::cvtepu8_epi16(a.v4[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer a) // return (int32)a (uint8 --> int32)
+{
+ return Integer
+ {
+ SIMD256T::cvtepu8_epi32(a.v4[0]),
+ SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer a) // return (int32)a (uint16 --> int32)
+{
+ return Integer
+ {
+ SIMD256T::cvtepu16_epi32(a.v4[0]),
+ SIMD256T::cvtepu16_epi32(a.v4[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer a) // return (int64)a (uint16 --> int64)
+{
+ return Integer
+ {
+ SIMD256T::cvtepu16_epi64(a.v4[0]),
+ SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer a) // return (int64)a (uint32 --> int64)
+{
+ return Integer
+ {
+ SIMD256T::cvtepu32_epi64(a.v4[0]),
+ SIMD256T::cvtepu32_epi64(a.v4[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
+{
+ return Integer
+ {
+ SIMD256T::cvtps_epi32(a.v8[0]),
+ SIMD256T::cvtps_epi32(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
+{
+ return Integer
+ {
+ SIMD256T::cvttps_epi32(a.v8[0]),
+ SIMD256T::cvttps_epi32(a.v8[1]),
+ };
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+ return Float
+ {
+ SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
+ SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
+ };
+}
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+
+template<CompareType CmpTypeT>
+static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
+{
+ return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
+}
+
+
+SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
+SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
+
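+// SIMD256T::testz_* yields 1 only when (a & b) is zero across its 256-bit
+// half, so ANDing the two halves' results is nonzero exactly when all 512
+// bits of (a & b) are zero.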
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
+{
+ return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
+ SIMD256T::testz_ps(a.v8[1], b.v8[1]));
+}
+
+static SIMDINLINE int SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+ return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
+ SIMD256T::testz_si(a.v8[1], b.v8[1]));
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
+SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
+SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+ return Integer
+ {
+ SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
+ SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+ return Integer
+ {
+ SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
+ SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
+ };
+}
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
+{
+ float f = *p;
+ return Float
+ {
+ SIMD256T::set1_ps(f),
+ SIMD256T::set1_ps(f),
+ };
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
+{
+ SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+ return a.v8[imm];
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
+{
+ SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+ return a.v8[imm];
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
+{
+ SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+ return a.v8[imm];
+}
+
+template<int imm>
+static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
+{
+ SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+ a.v8[imm] = b;
+ return a;
+}
+
+template<int imm>
+static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
+{
+ SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+ a.v8[imm] = b;
+ return a;
+}
+
+template<int imm>
+static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
+{
+ SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+ a.v8[imm] = b;
+ return a;
+}
+
+SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+{
+ Integer result;
+
+ // Ugly slow implementation
+ uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
+ uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+ uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
+
+ for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+ {
+ pResult[i] = pA[0xF & pSwiz[i]];
+ }
+
+ return result;
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
+{
+ Float result;
+
+ // Ugly slow implementation
+ float const *pA = reinterpret_cast<float const*>(&a);
+ uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+ float *pResult = reinterpret_cast<float *>(&result);
+
+ for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+ {
+ pResult[i] = pA[0xF & pSwiz[i]];
+ }
+
+ return result;
+}
+
+// All of the 512-bit permute2f128_XX intrinsics do the following:
+//
+//     SELECT4(src, control) {
+//         CASE(control[1:0])
+//             0: tmp[127:0] := src[127:0]
+//             1: tmp[127:0] := src[255:128]
+//             2: tmp[127:0] := src[383:256]
+//             3: tmp[127:0] := src[511:384]
+//         ESAC
+//         RETURN tmp[127:0]
+//     }
+//
+//     dst[127:0]   := SELECT4(a[511:0], imm8[1:0])
+//     dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+//     dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+//     dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+//     dst[MAX:512] := 0
+//
+// Since the 256-bit AVX instructions place each 2-bit lane selector in a
+// 4-bit control field (AVX512 packs the selectors into adjacent 2-bit
+// fields), we need to expand the control bits sent to the AVX instructions
+// for emulation.
+//
+template <int shuf>
+static SIMDINLINE Float SIMDCALL permute2f128_ps(Float a, Float b)
+{
+ return Float
+ {
+ SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
+ SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+ };
+}
+
+template <int shuf>
+static SIMDINLINE Double SIMDCALL permute2f128_pd(Double a, Double b)
+{
+ return Double
+ {
+ SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
+ SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+ };
+}
+
+template <int shuf>
+static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer a, Integer b)
+{
+ return Integer
+ {
+ SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
+ SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+ };
+}
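+
+// Worked example (illustrative): shuf = 0x44 requests 128-bit lane 0 of a
+// for dst[127:0] (shuf[1:0] = 0) and lane 1 of a for dst[255:128]
+// (shuf[3:2] = 1). The low-half remap ((0x44 & 0x03) << 0) | ((0x44 & 0x0C) << 2)
+// produces imm8 = 0x10 for the 256-bit permute2f128, which likewise routes
+// the first source's lane 0 to the low half and lane 1 to the high half.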
+
+SIMD_IWRAPPER_2I_1(shuffle_epi32);
+SIMD_IWRAPPER_2I_2(shuffle_epi64);
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_WRAPPER_2I_1(shuffle_pd);
+SIMD_WRAPPER_2I_1(shuffle_ps);
+SIMD_IWRAPPER_2(unpackhi_epi16);
+SIMD_IWRAPPER_2(unpackhi_epi32);
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_WRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IWRAPPER_2(unpacklo_epi32);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+SIMD_WRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+ return Float
+ {
+ SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
+ SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
+ };
+}
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
+{
+ return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
+{
+ return Float
+ {
+ SIMD256T::load_ps(p),
+ SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
+{
+ return Integer
+ {
+ SIMD256T::load_si(&p->v8[0]),
+ SIMD256T::load_si(&p->v8[1]),
+ };
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
+{
+ return Float
+ {
+ SIMD256T::loadu_ps(p),
+ SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
+{
+ return Integer
+ {
+ SIMD256T::loadu_si(&p->v8[0]),
+ SIMD256T::loadu_si(&p->v8[1]),
+ };
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+ return Float
+ {
+ SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
+ SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
+ };
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+ SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
+ SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
+}
+
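+// For the movemask family, each 256-bit half contributes one mask bit per
+// element: 32 bits for epi8 (TARGET_SIMD_WIDTH * 4 bytes), 4 bits for pd
+// (TARGET_SIMD_WIDTH / 2 doubles), and 8 bits for ps, which determines the
+// shift applied to the upper half's mask below.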
+static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
+{
+ uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
+ mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
+
+ return mask;
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+{
+ uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
+ mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
+
+ return mask;
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+{
+ uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
+ mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
+
+ return mask;
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+ return Integer
+ {
+ SIMD256T::set1_epi32(i),
+ SIMD256T::set1_epi32(i)
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+ return Integer
+ {
+ SIMD256T::set1_epi8(i),
+ SIMD256T::set1_epi8(i)
+ };
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
+{
+ return Float
+ {
+ SIMD256T::set1_ps(f),
+ SIMD256T::set1_ps(f)
+ };
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
+{
+ return Float
+ {
+ SIMD256T::setzero_ps(),
+ SIMD256T::setzero_ps()
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
+{
+ return Integer
+ {
+ SIMD256T::setzero_si(),
+ SIMD256T::setzero_si()
+ };
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
+{
+ SIMD256T::store_ps(p, a.v8[0]);
+ SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
+{
+ SIMD256T::store_si(&p->v8[0], a.v8[0]);
+ SIMD256T::store_si(&p->v8[1], a.v8[1]);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
+{
+ SIMD256T::stream_ps(p, a.v8[0]);
+ SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(
+ int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
+ int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+ return Integer
+ {
+ SIMD256T::set_epi32(
+ i7, i6, i5, i4, i3, i2, i1, i0),
+ SIMD256T::set_epi32(
+ i15, i14, i13, i12, i11, i10, i9, i8)
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(
+ int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+ return set_epi32(
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(
+ float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
+ float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+ return Float
+ {
+ SIMD256T::set_ps(
+ i7, i6, i5, i4, i3, i2, i1, i0),
+ SIMD256T::set_ps(
+ i15, i14, i13, i12, i11, i10, i9, i8)
+ };
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(
+ float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+ return set_ps(
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
+{
+ Integer vec = set1_epi32(mask);
+ const Integer bit = set_epi32(
+ 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
+ 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+ vec = and_si(vec, bit);
+ vec = cmplt_epi32(setzero_si(), vec);
+ return castsi_ps(vec);
+}
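+
+// Worked example (illustrative): vmask_ps(0x5) broadcasts the mask, ANDs it
+// with the per-lane bit constants above, then computes 0 < lane, yielding
+// all-ones in lanes 0 and 2 and zeros elsewhere.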
+
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_2I_1
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I_1
+#undef SIMD_IWRAPPER_3