Diffstat (limited to 'src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl')
-rw-r--r--  src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl | 842
1 file changed, 842 insertions(+), 0 deletions(-)
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
new file mode 100644
index 00000000000..a45429f4b6b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
@@ -0,0 +1,842 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

//============================================================================
// SIMD16 AVX (1) implementation
//============================================================================

static const int TARGET_SIMD_WIDTH = 8;
using SIMD128T = SIMD128Impl::AVXImpl;

#define SIMD_WRAPPER_1(op) \
    static SIMDINLINE Float SIMDCALL op(Float a)   \
    {\
        return Float\
        {\
            SIMD256T::op(a.v8[0]),\
            SIMD256T::op(a.v8[1]),\
        };\
    }

#define SIMD_WRAPPER_2(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {\
        return Float\
        {\
            SIMD256T::op(a.v8[0], b.v8[0]),\
            SIMD256T::op(a.v8[1], b.v8[1]),\
        };\
    }

#define SIMD_WRAPPER_2I(op) \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {\
        return Float\
        {\
            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
        };\
    }

#define SIMD_WRAPPER_2I_1(op) \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {\
        return Float\
        {\
            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
        };\
    }

#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
    {\
        return Float\
        {\
            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
        };\
    }

#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
    {\
        return Integer\
        {\
            SIMD256T::op(a.v8[0]),\
            SIMD256T::op(a.v8[1]),\
        };\
    }

#define SIMD_IWRAPPER_2(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return Integer\
        {\
            SIMD256T::op(a.v8[0], b.v8[0]),\
            SIMD256T::op(a.v8[1], b.v8[1]),\
        };\
    }
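// For reference, every wrapper stamps out the same two-halves emulation
// pattern; e.g. SIMD_WRAPPER_2(add_ps) expands (modulo whitespace) to:
//
//     static SIMDINLINE Float SIMDCALL add_ps(Float a, Float b)
//     {
//         return Float
//         {
//             SIMD256T::add_ps(a.v8[0], b.v8[0]),   // low  8 lanes
//             SIMD256T::add_ps(a.v8[1], b.v8[1]),   // high 8 lanes
//         };
//     }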
#define SIMD_IWRAPPER_2I(op) \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return Integer\
        {\
            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
        };\
    }

#define SIMD_IWRAPPER_2I_1(op) \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return Integer\
        {\
            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
        };\
    }

#define SIMD_IWRAPPER_2I_2(op) \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return Integer\
        {\
            SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\
            SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\
        };\
    }

#define SIMD_IWRAPPER_3(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c)   \
    {\
        return Integer\
        {\
            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
        };\
    }
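// The *_2I wrappers assume one immediate bit (or nibble) per lane and split
// the 16-lane immediate across the two 256-bit halves; e.g. a hypothetical
// blend_ps<0xABCD>(a, b) becomes:
//
//     SIMD256T::blend_ps<0xCD>(a.v8[0], b.v8[0])   // bits  7:0 -> low  half
//     SIMD256T::blend_ps<0xAB>(a.v8[1], b.v8[1])   // bits 15:8 -> high half
//
// (TARGET_SIMD_WIDTH == 8, so "ImmT >> TARGET_SIMD_WIDTH" selects the high byte.)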
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps);     // return a + b
SIMD_WRAPPER_2(div_ps);     // return a / b
SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);     // return a * b
SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);     // return a - b

template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
    return Float
    {
        SIMD256T::template round_ps<RMT>(a.v8[0]),
        SIMD256T::template round_ps<RMT>(a.v8[1]),
    };
}

static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)

//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps);     // return a & b       (float treated as int)
SIMD_IWRAPPER_2(and_si);    // return a & b       (int)
SIMD_WRAPPER_2(andnot_ps);  // return (~a) & b    (float treated as int)
SIMD_IWRAPPER_2(andnot_si); // return (~a) & b    (int)
SIMD_WRAPPER_2(or_ps);      // return a | b       (float treated as int)
SIMD_IWRAPPER_2(or_si);     // return a | b       (int)
SIMD_WRAPPER_2(xor_ps);     // return a ^ b       (float treated as int)
SIMD_IWRAPPER_2(xor_si);    // return a ^ b       (int)


//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
template<int ImmT>
static SIMDINLINE Integer SIMDCALL slli_epi32(Integer a)    // return a << ImmT
{
    return Integer
    {
        SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
        SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
    };
}

SIMD_IWRAPPER_2(sllv_epi32);    // return a << b      (uint32)

template<int ImmT>
static SIMDINLINE Integer SIMDCALL srai_epi32(Integer a)    // return a >> ImmT   (int32)
{
    return Integer
    {
        SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
        SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
    };
}

template<int ImmT>
static SIMDINLINE Integer SIMDCALL srli_epi32(Integer a)    // return a >> ImmT   (uint32)
{
    return Integer
    {
        SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
        SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
    };
}

template<int ImmT>                                  // for each 128-bit lane:
static SIMDINLINE Integer SIMDCALL srli_si(Integer a)       // return a >> (ImmT*8) (uint)
{
    return Integer
    {
        SIMD256T::template srli_si<ImmT>(a.v8[0]),
        SIMD256T::template srli_si<ImmT>(a.v8[1]),
    };
}

template<int ImmT>
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)         // same as srli_si, but with Float cast to int
{
    return Float
    {
        SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
        SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
    };
}

SIMD_IWRAPPER_2(srlv_epi32);    // return a >> b      (uint32)

//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a)        // return *(Float*)(&a)
{
    return Float
    {
        SIMD256T::castpd_ps(a.v8[0]),
        SIMD256T::castpd_ps(a.v8[1]),
    };
}

static SIMDINLINE Integer SIMDCALL castps_si(Float a)       // return *(Integer*)(&a)
{
    return Integer
    {
        SIMD256T::castps_si(a.v8[0]),
        SIMD256T::castps_si(a.v8[1]),
    };
}

static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)      // return *(Double*)(&a)
{
    return Double
    {
        SIMD256T::castsi_pd(a.v8[0]),
        SIMD256T::castsi_pd(a.v8[1]),
    };
}

static SIMDINLINE Double SIMDCALL castps_pd(Float a)        // return *(Double*)(&a)
{
    return Double
    {
        SIMD256T::castps_pd(a.v8[0]),
        SIMD256T::castps_pd(a.v8[1]),
    };
}

static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)       // return *(Float*)(&a)
{
    return Float
    {
        SIMD256T::castsi_ps(a.v8[0]),
        SIMD256T::castsi_ps(a.v8[1]),
    };
}

static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a)     // return (float)a    (int32 --> float)
{
    return Float
    {
        SIMD256T::cvtepi32_ps(a.v8[0]),
        SIMD256T::cvtepi32_ps(a.v8[1]),
    };
}
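// The cast wrappers above reinterpret bits without converting values; e.g.
// (hypothetical usage) castps_si(set1_ps(1.0f)) yields 0x3F800000 in every
// 32-bit lane, and castsi_ps(castps_si(x)) returns x bit-for-bit.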
static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer a)    // return (int16)a    (uint8 --> int16)
{
    return Integer
    {
        SIMD256T::cvtepu8_epi16(a.v4[0]),
        SIMD256T::cvtepu8_epi16(a.v4[1]),
    };
}

static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer a)    // return (int32)a    (uint8 --> int32)
{
    return Integer
    {
        SIMD256T::cvtepu8_epi32(a.v4[0]),
        // byte-shift the next 8 source bytes into place before widening
        SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
    };
}

static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer a)   // return (int32)a    (uint16 --> int32)
{
    return Integer
    {
        SIMD256T::cvtepu16_epi32(a.v4[0]),
        SIMD256T::cvtepu16_epi32(a.v4[1]),
    };
}

static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer a)   // return (int64)a    (uint16 --> int64)
{
    return Integer
    {
        SIMD256T::cvtepu16_epi64(a.v4[0]),
        SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
    };
}

static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer a)   // return (int64)a    (uint32 --> int64)
{
    return Integer
    {
        SIMD256T::cvtepu32_epi64(a.v4[0]),
        SIMD256T::cvtepu32_epi64(a.v4[1]),
    };
}

static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)     // return (int32)a    (float --> int32)
{
    return Integer
    {
        SIMD256T::cvtps_epi32(a.v8[0]),
        SIMD256T::cvtps_epi32(a.v8[1]),
    };
}

static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)    // return (int32)a    (rnd_to_zero(float) --> int32)
{
    // must use the truncating conversion per the comment above, not cvtps_epi32
    return Integer
    {
        SIMD256T::cvttps_epi32(a.v8[0]),
        SIMD256T::cvttps_epi32(a.v8[1]),
    };
}

//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b)   // return a (CmpTypeT) b
{
    return Float
    {
        SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
        SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
    };
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)  { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)  { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)  { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)  { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)  { return cmp_ps<CompareType::LE_OQ>(a, b); }

template<CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
    return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
}


SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
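// Example of the cmp_ps_mask composition above (hypothetical values): with
// a = set_ps(15, 14, ..., 1, 0) and b = set1_ps(8.0f),
// cmp_ps_mask<CompareType::LT_OQ>(a, b) == 0x00FF -- lanes 0..7 hold
// 0.0f..7.0f, each less than 8.0f, so only the low 8 mask bits are set.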
static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)      // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
    return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
                 SIMD256T::testz_ps(a.v8[1], b.v8[1]));
}

static SIMDINLINE int SIMDCALL testz_si(Integer a, Integer b)   // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
    return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
                 SIMD256T::testz_si(a.v8[1], b.v8[1]));
}

//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps);      // return ImmT ? b : a  (float)
SIMD_IWRAPPER_2I(blend_epi32);  // return ImmT ? b : a  (int32)
SIMD_WRAPPER_3(blendv_ps);      // return mask ? b : a  (float)
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask)   // return mask ? b : a (int)
{
    return Integer
    {
        SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
        SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
    };
}

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
    return Integer
    {
        SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
        SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
    };
}

static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)   // return *p  (all elements in vector get same value)
{
    float f = *p;
    return Float
    {
        SIMD256T::set1_ps(f),
        SIMD256T::set1_ps(f),
    };
}

template<int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
    return a.v8[imm];
}

template<int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
    return a.v8[imm];
}

template<int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
    return a.v8[imm];
}

template<int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
    a.v8[imm] = b;
    return a;
}

template<int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
    a.v8[imm] = b;
    return a;
}

template<int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
    a.v8[imm] = b;
    return a;
}

SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32

static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)   // return a[swiz[i]] for each 32-bit lane i (int32)
{
    Integer result;

    // Ugly slow implementation
    uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
    uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);

    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        // swizzle index is masked to the 16 lanes (0xF == SIMD_WIDTH - 1)
        pResult[i] = pA[0xF & pSwiz[i]];
    }

    return result;
}
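// Example (hypothetical values): reversing all 16 lanes with the scalar
// permute above -- set_epi32() takes arguments from lane 15 down to lane 0,
// so this swizzle stores 15 in lane 0, 14 in lane 1, and so on:
//
//     Integer rev = permute_epi32(a, set_epi32(0, 1, 2,  3,  4,  5,  6,  7,
//                                              8, 9, 10, 11, 12, 13, 14, 15));
//     // lane i of rev == lane (15 - i) of a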
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)          // return a[swiz[i]] for each 32-bit lane i (float)
{
    Float result;

    // Ugly slow implementation
    float const *pA = reinterpret_cast<float const*>(&a);
    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
    float *pResult = reinterpret_cast<float *>(&result);

    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        pResult[i] = pA[0xF & pSwiz[i]];
    }

    return result;
}

// All of the 512-bit permute2f128_XX intrinsics do the following:
//
//      SELECT4(src, control) {
//          CASE(control[1:0])
//              0:  tmp[127:0] := src[127:0]
//              1:  tmp[127:0] := src[255:128]
//              2:  tmp[127:0] := src[383:256]
//              3:  tmp[127:0] := src[511:384]
//          ESAC
//          RETURN tmp[127:0]
//      }
//
//      dst[127:0]   := SELECT4(a[511:0], imm8[1:0])
//      dst[255:128] := SELECT4(a[511:0], imm8[3:2])
//      dst[383:256] := SELECT4(b[511:0], imm8[5:4])
//      dst[511:384] := SELECT4(b[511:0], imm8[7:6])
//      dst[MAX:512] := 0
//
// Since the 256-bit AVX instructions use a 4-bit control field (instead
// of 2-bit for AVX512), we need to expand the control bits sent to the
// AVX instructions for emulation.
//
template <int shuf>
static SIMDINLINE Float SIMDCALL permute2f128_ps(Float a, Float b)
{
    return Float
    {
        SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
        SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
    };
}

template <int shuf>
static SIMDINLINE Double SIMDCALL permute2f128_pd(Double a, Double b)
{
    return Double
    {
        SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
        SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
    };
}

template <int shuf>
static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer a, Integer b)
{
    return Integer
    {
        SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
        SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
    };
}
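// Worked example of the control-bit expansion above, using shuf = 0xE4
// (2-bit selector fields 3, 2, 1, 0 from high bits to low):
//
//   low  256 bits:  ((0xE4 & 0x03) << 0) | ((0xE4 & 0x0C) << 2) == 0x10
//                   -> permute2f128_ps<0x10>(a.v8[0], a.v8[1])
//                   -> 128-bit lanes { a[0], a[1] }
//   high 256 bits:  ((0xE4 & 0x30) >> 4) | ((0xE4 & 0xC0) >> 2) == 0x32
//                   -> permute2f128_ps<0x32>(b.v8[0], b.v8[1])
//                   -> 128-bit lanes { b[2], b[3] }
//
// i.e. permute2f128_ps<0xE4>(a, b) yields { low 256 bits of a, high 256 bits of b }.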
SIMD_IWRAPPER_2I_1(shuffle_epi32);
SIMD_IWRAPPER_2I_2(shuffle_epi64);
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_WRAPPER_2I_1(shuffle_pd);
SIMD_WRAPPER_2I_1(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_WRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_WRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);

//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx)  // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return Float
    {
        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
    };
}

static SIMDINLINE Float SIMDCALL load1_ps(float const *p)       // return *p  (broadcast 1 value to all elements)
{
    return broadcast_ss(p);
}

static SIMDINLINE Float SIMDCALL load_ps(float const *p)        // return *p  (loads SIMD width elements from memory)
{
    return Float
    {
        SIMD256T::load_ps(p),
        SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
    };
}

static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)    // return *p
{
    return Integer
    {
        SIMD256T::load_si(&p->v8[0]),
        SIMD256T::load_si(&p->v8[1]),
    };
}

static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)       // return *p  (same as load_ps but allows for unaligned mem)
{
    return Float
    {
        SIMD256T::loadu_ps(p),
        SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
    };
}

static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p)   // return *p  (same as load_si but allows for unaligned mem)
{
    return Integer
    {
        SIMD256T::loadu_si(&p->v8[0]),
        SIMD256T::loadu_si(&p->v8[1]),
    };
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    return Float
    {
        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
    };
}

static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
    SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
    SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
}

static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
{
    uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
    mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);

    return mask;
}

static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
    uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
    mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);

    return mask;
}

static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
    uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
    mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;

    return mask;
}

static SIMDINLINE Integer SIMDCALL set1_epi32(int i)    // return i (all elements are same value)
{
    return Integer
    {
        SIMD256T::set1_epi32(i),
        SIMD256T::set1_epi32(i)
    };
}

static SIMDINLINE Integer SIMDCALL set1_epi8(char i)    // return i (all elements are same value)
{
    return Integer
    {
        SIMD256T::set1_epi8(i),
        SIMD256T::set1_epi8(i)
    };
}

static SIMDINLINE Float SIMDCALL set1_ps(float f)       // return f (all elements are same value)
{
    return Float
    {
        SIMD256T::set1_ps(f),
        SIMD256T::set1_ps(f)
    };
}

static SIMDINLINE Float SIMDCALL setzero_ps()           // return 0 (float)
{
    return Float
    {
        SIMD256T::setzero_ps(),
        SIMD256T::setzero_ps()
    };
}

static SIMDINLINE Integer SIMDCALL setzero_si()         // return 0 (integer)
{
    return Integer
    {
        SIMD256T::setzero_si(),
        SIMD256T::setzero_si()
    };
}

static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)     // *p = a  (stores all elements contiguously in memory)
{
    SIMD256T::store_ps(p, a.v8[0]);
    SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
}

static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
    SIMD256T::store_si(&p->v8[0], a.v8[0]);
    SIMD256T::store_si(&p->v8[1], a.v8[1]);
}

static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)    // *p = a  (same as store_ps, but doesn't keep memory in cache)
{
    SIMD256T::stream_ps(p, a.v8[0]);
    SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
}
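// Mask composition example for the movemask wrappers above (hypothetical
// values): if only lanes 0 and 15 of a Float are negative, movemask_ps
// returns 0x0001 from v8[0] plus 0x80 << TARGET_SIMD_WIDTH == 0x8000 from
// v8[1], i.e. 0x8001. Likewise movemask_epi8 shifts the high half's 32 bits
// by TARGET_SIMD_WIDTH * 4 == 32, and movemask_pd shifts its 4 bits by
// TARGET_SIMD_WIDTH / 2 == 4, matching the 8 double lanes.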
static SIMDINLINE Integer SIMDCALL set_epi32(
    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return Integer
    {
        SIMD256T::set_epi32(
            i7, i6, i5, i4, i3, i2, i1, i0),
        SIMD256T::set_epi32(
            i15, i14, i13, i12, i11, i10, i9, i8)
    };
}

static SIMDINLINE Integer SIMDCALL set_epi32(
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return set_epi32(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return Float
    {
        SIMD256T::set_ps(
            i7, i6, i5, i4, i3, i2, i1, i0),
        SIMD256T::set_ps(
            i15, i14, i13, i12, i11, i10, i9, i8)
    };
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return set_ps(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

// build a per-lane Float mask: lane i is all ones if bit i of mask is set
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    Integer vec = set1_epi32(mask);
    const Integer bit = set_epi32(
        0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
        0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    vec = and_si(vec, bit);
    vec = cmplt_epi32(setzero_si(), vec);
    return castsi_ps(vec);
}

#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_2I_1
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_1
#undef SIMD_IWRAPPER_2I_2
#undef SIMD_IWRAPPER_3
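For orientation, a minimal sketch of how calling code might use this emulation layer, assuming the surrounding simdlib.hpp context (the emulated Float type and the functions above); the buffer name and contents are illustrative only:

    // 16 floats per emulated 512-bit vector; alignment chosen to satisfy the
    // aligned load/store requirements of the underlying 256-bit halves
    alignas(64) float data[16] = { /* ... */ };

    Float v    = load_ps(data);             // two SIMD256T::load_ps under the hood
    Float vInc = add_ps(v, set1_ps(1.0f));  // per-lane add, split across both halves
    store_ps(data, vInc);                   // two SIMD256T::store_ps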