1 files changed, 581 insertions, 0 deletions
diff --git a/src/intel/vulkan/grl/gpu/intrinsics.h b/src/intel/vulkan/grl/gpu/intrinsics.h
new file mode 100644
index 00000000000..0dff3147d8a
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/intrinsics.h
@@ -0,0 +1,581 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+// TODO: AABB_work_group_reduce is super slow, remove !!!
+
+#pragma cl_intel_subgroups : enable
+#pragma cl_khr_fp16        : enable
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+
+uint intel_sub_group_ballot(bool valid);
+
+// atom_min
+float __attribute__((overloadable)) atom_min(volatile __global float *p, float val);
+float __attribute__((overloadable)) atom_min(volatile __local float *p, float val);
+float __attribute__((overloadable)) atomic_min(volatile __global float *p, float val);
+float __attribute__((overloadable)) atomic_min(volatile __local float *p, float val);
+// atom_max
+float __attribute__((overloadable)) atom_max(volatile __global float *p, float val);
+float __attribute__((overloadable)) atom_max(volatile __local float *p, float val);
+float __attribute__((overloadable)) atomic_max(volatile __global float *p, float val);
+float __attribute__((overloadable)) atomic_max(volatile __local float *p, float val);
+// atom_cmpxchg
+float __attribute__((overloadable)) atom_cmpxchg(volatile __global float *p, float cmp, float val);
+float __attribute__((overloadable)) atom_cmpxchg(volatile __local float *p, float cmp, float val);
+float __attribute__((overloadable)) atomic_cmpxchg(volatile __global float *p, float cmp, float val);
+float __attribute__((overloadable)) atomic_cmpxchg(volatile __local float *p, float cmp, float val);
+
+
+
+inline uint subgroup_single_atomic_add(global uint *p, uint val)
+{
+    const uint subgroupLocalID = get_sub_group_local_id();
+    const int v = subgroupLocalID == 0 ? atomic_add(p, val) : 0;
+    return sub_group_broadcast(v, 0);
+}
+
+inline float halfarea(const float3 d)
+{
+    return fma(d.x, (d.y + d.z), d.y * d.z);
+}
+
+inline float area(const float3 d)
+{
+    return halfarea(d) * 2.0f;
+}
+
+inline uint maxDim(const float3 a)
+{
+    const float3 b = fabs(a);
+    const bool b_x_y = b.x > b.y;
+    const float cur_max = b_x_y ? b.x : b.y;
+    const uint cur_idx = b_x_y ? 0 : 1;
+    const bool b_x_y_z = b.z > cur_max;
+    return b_x_y_z ? 2 : cur_idx;
+}
+
+inline uint3 sortByMaxDim(const float3 a)
+{
+    const uint kz = maxDim(a);
+    const uint _kx = (kz + 1) % 3;
+    const uint _ky = (_kx + 1) % 3;
+    const bool kz_pos = a[kz] >= 0.0f;
+    const uint kx = kz_pos ? _ky : _kx;
+    const uint ky = kz_pos ? _kx : _ky;
+    return (uint3)(kx, ky, kz);
+}
+
+inline uint4 sort4_ascending(const uint4 dist)
+{
+    const uint a0 = dist.s0;
+    const uint a1 = dist.s1;
+    const uint a2 = dist.s2;
+    const uint a3 = dist.s3;
+    const uint b0 = min(a0, a2);
+    const uint b1 = min(a1, a3);
+    const uint b2 = max(a0, a2);
+    const uint b3 = max(a1, a3);
+    const uint c0 = min(b0, b1);
+    const uint c1 = max(b0, b1);
+    const uint c2 = min(b2, b3);
+    const uint c3 = max(b2, b3);
+    const uint d0 = c0;
+    const uint d1 = min(c1, c2);
+    const uint d2 = max(c1, c2);
+    const uint d3 = c3;
+    return (uint4)(d0, d1, d2, d3);
+}
+
+__constant const uint shuffleA[8] = {1, 0, 3, 2, 5, 4, 7, 6};
+__constant const uint shuffleB[8] = {2, 3, 0, 1, 7, 6, 5, 4};
+__constant const uint shuffleC[8] = {1, 0, 3, 2, 5, 4, 7, 6};
+__constant const uint shuffleD[8] = {7, 6, 5, 4, 3, 2, 1, 0};
+__constant const uint shuffleE[8] = {2, 3, 0, 1, 6, 7, 4, 5};
+__constant const uint shuffleF[8] = {1, 0, 3, 2, 5, 4, 7, 6};
+__constant const uint shuffleG[8] = {0, 2, 1, 3, 5, 4, 7, 6};
+
+__constant const uint selAA[8] = {0, 1, 0, 1, 0, 1, 0, 1};
+__constant const uint selCC[8] = {0, 0, 1, 1, 0, 0, 1, 1};
+__constant const uint selF0[8] = {0, 0, 0, 0, 1, 1, 1, 1};
+
+__constant const uint selGG[8] = {0, 0, 1, 0, 1, 1, 1, 1};
+
+inline uint compare_exchange_descending(const uint a0, const uint shuffleMask, const uint selectMask)
+{
+    const uint a1 = intel_sub_group_shuffle(a0, shuffleMask);
+    const uint a_min = min(a0, a1);
+    const uint a_max = max(a0, a1);
+    return select(a_max, a_min, selectMask);
+}
+
+inline uint compare_exchange_ascending(const uint a0, const uint shuffleMask, const uint selectMask)
+{
+    const uint a1 = intel_sub_group_shuffle(a0, shuffleMask);
+    const uint a_min = min(a0, a1);
+    const uint a_max = max(a0, a1);
+    return select(a_min, a_max, selectMask);
+}
+
+inline uint sort8_descending(const uint aa)
+{
+    const unsigned int slotID = get_sub_group_local_id() % 8;
+    const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]);
+    const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]);
+    const uint dd = compare_exchange_descending(cc, shuffleC[slotID], selAA[slotID]);
+    const uint ee = compare_exchange_descending(dd, shuffleD[slotID], selF0[slotID]);
+    const uint ff = compare_exchange_descending(ee, shuffleE[slotID], selCC[slotID]);
+    const uint gg = compare_exchange_descending(ff, shuffleF[slotID], selAA[slotID]);
+    return gg;
+}
+
+inline uint sort8_ascending(const uint aa)
+{
+    const unsigned int slotID = get_sub_group_local_id() % 8;
+    const uint bb = compare_exchange_ascending(aa, shuffleA[slotID], selAA[slotID]);
+    const uint cc = compare_exchange_ascending(bb, shuffleB[slotID], selCC[slotID]);
+    const uint dd = compare_exchange_ascending(cc, shuffleC[slotID], selAA[slotID]);
+    const uint ee = compare_exchange_ascending(dd, shuffleD[slotID], selF0[slotID]);
+    const uint ff = compare_exchange_ascending(ee, shuffleE[slotID], selCC[slotID]);
+    const uint gg = compare_exchange_ascending(ff, shuffleF[slotID], selAA[slotID]);
+    return gg;
+}
+
+inline uint sort4_descending(const uint aa)
+{
+    const unsigned int slotID = get_sub_group_local_id() % 8;
+    const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]);
+    const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]);
+    const uint dd = compare_exchange_descending(cc, shuffleG[slotID], selGG[slotID]);
+    return dd;
+}
+
+inline ulong compare_exchange_descending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
+{
+    const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask);
+    const ulong a_min = min(a0, a1);
+    const ulong a_max = max(a0, a1);
+    return select(a_max, a_min, (ulong)selectMask);
+}
+
+inline ulong compare_exchange_ascending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
+{
+    const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask);
+    const ulong a_min = min(a0, a1);
+    const ulong a_max = max(a0, a1);
+    return select(a_min, a_max, (ulong)selectMask);
+}
+
+inline ulong sort8_ascending_ulong(const ulong aa)
+{
+    const unsigned int slotID = get_sub_group_local_id() % 8;
+    const ulong bb = compare_exchange_ascending_ulong(aa, shuffleA[slotID], selAA[slotID]);
+    const ulong cc = compare_exchange_ascending_ulong(bb, shuffleB[slotID], selCC[slotID]);
+    const ulong dd = compare_exchange_ascending_ulong(cc, shuffleC[slotID], selAA[slotID]);
+    const ulong ee = compare_exchange_ascending_ulong(dd, shuffleD[slotID], selF0[slotID]);
+    const ulong ff = compare_exchange_ascending_ulong(ee, shuffleE[slotID], selCC[slotID]);
+    const ulong gg = compare_exchange_ascending_ulong(ff, shuffleF[slotID], selAA[slotID]);
+    return gg;
+}
+
+inline uint bitInterleave3D(const uint4 in)
+{
+    uint x = in.x, y = in.y, z = in.z;
+    x = (x | (x << 16)) & 0x030000FF;
+    x = (x | (x << 8)) & 0x0300F00F;
+    x = (x | (x << 4)) & 0x030C30C3;
+    x = (x | (x << 2)) & 0x09249249;
+
+    y = (y | (y << 16)) & 0x030000FF;
+    y = (y | (y << 8)) & 0x0300F00F;
+    y = (y | (y << 4)) & 0x030C30C3;
+    y = (y | (y << 2)) & 0x09249249;
+
+    z = (z | (z << 16)) & 0x030000FF;
+    z = (z | (z << 8)) & 0x0300F00F;
+    z = (z | (z << 4)) & 0x030C30C3;
+    z = (z | (z << 2)) & 0x09249249;
+
+    return x | (y << 1) | (z << 2);
+}
+
+inline uint bitInterleave4D(const uint4 in)
+{
+    uint x = in.x, y = in.y, z = in.z, w = in.w;
+
+    x = x & 0x000000ff;
+    x = (x ^ (x << 16)) & 0x00c0003f;
+    x = (x ^ (x << 8)) & 0x00c03807;
+    x = (x ^ (x << 4)) & 0x08530853;
+    x = (x ^ (x << 2)) & 0x09090909;
+    x = (x ^ (x << 1)) & 0x11111111;
+
+    y = y & 0x000000ff;
+    y = (y ^ (y << 16)) & 0x00c0003f;
+    y = (y ^ (y << 8)) & 0x00c03807;
+    y = (y ^ (y << 4)) & 0x08530853;
+    y = (y ^ (y << 2)) & 0x09090909;
+    y = (y ^ (y << 1)) & 0x11111111;
+
+    z = z & 0x000000ff;
+    z = (z ^ (z << 16)) & 0x00c0003f;
+    z = (z ^ (z << 8)) & 0x00c03807;
+    z = (z ^ (z << 4)) & 0x08530853;
+    z = (z ^ (z << 2)) & 0x09090909;
+    z = (z ^ (z << 1)) & 0x11111111;
+
+    w = w & 0x000000ff;
+    w = (w ^ (w << 16)) & 0x00c0003f;
+    w = (w ^ (w << 8)) & 0x00c03807;
+    w = (w ^ (w << 4)) & 0x08530853;
+    w = (w ^ (w << 2)) & 0x09090909;
+    w = (w ^ (w << 1)) & 0x11111111;
+
+    return (x | (y << 1) | (z << 2) | (w << 3));
+}
+
+inline ulong ulong_bitInterleave4D(const uint4 in)
+{
+    ulong x = in.x, y = in.y, z = in.z, w = in.w;
+
+    x = x & 0x0000ffff;
+    x = (x ^ (x << 32)) & 0x0000f800000007ff;
+    x = (x ^ (x << 16)) & 0x0000f80007c0003f;
+    x = (x ^ (x << 8)) & 0x00c0380700c03807;
+    x = (x ^ (x << 4)) & 0x0843084308430843;
+    x = (x ^ (x << 2)) & 0x0909090909090909;
+    x = (x ^ (x << 1)) & 0x1111111111111111;
+
+    y = y & 0x0000ffff;
+    y = (y ^ (y << 32)) & 0x0000f800000007ff;
+    y = (y ^ (y << 16)) & 0x0000f80007c0003f;
+    y = (y ^ (y << 8)) & 0x00c0380700c03807;
+    y = (y ^ (y << 4)) & 0x0843084308430843;
+    y = (y ^ (y << 2)) & 0x0909090909090909;
+    y = (y ^ (y << 1)) & 0x1111111111111111;
+
+    z = z & 0x0000ffff;
+    z = (z ^ (z << 32)) & 0x0000f800000007ff;
+    z = (z ^ (z << 16)) & 0x0000f80007c0003f;
+    z = (z ^ (z << 8)) & 0x00c0380700c03807;
+    z = (z ^ (z << 4)) & 0x0843084308430843;
+    z = (z ^ (z << 2)) & 0x0909090909090909;
+    z = (z ^ (z << 1)) & 0x1111111111111111;
+
+    w = w & 0x0000ffff;
+    w = (w ^ (w << 32)) & 0x0000f800000007ff;
+    w = (w ^ (w << 16)) & 0x0000f80007c0003f;
+    w = (w ^ (w << 8)) & 0x00c0380700c03807;
+    w = (w ^ (w << 4)) & 0x0843084308430843;
+    w = (w ^ (w << 2)) & 0x0909090909090909;
+    w = (w ^ (w << 1)) & 0x1111111111111111;
+
+    return (x | (y << 1) | (z << 2) | (w << 3));
+}
+
+inline uint bitCompact(uint x)
+{
+    x &= 0x09249249;
+    x = (x ^ (x >> 2)) & 0x030c30c3;
+    x = (x ^ (x >> 4)) & 0x0300f00f;
+    x = (x ^ (x >> 8)) & 0xff0000ff;
+    x = (x ^ (x >> 16)) & 0x000003ff;
+    return x;
+}
+
+inline uint3 bitCompact3D(const uint in)
+{
+    const uint x = bitCompact(x >> 0);
+    const uint y = bitCompact(y >> 1);
+    const uint z = bitCompact(z >> 2);
+    return (uint3)(x, y, z);
+}
+
+inline uint convertToPushIndices8(uint ID)
+{
+    const unsigned int slotID = get_sub_group_local_id();
+    uint index = 0;
+    for (uint i = 0; i < 8; i++)
+    {
+        const uint mask = intel_sub_group_ballot(ID == i);
+        const uint new_index = ctz(mask);
+        index = i == slotID ? new_index : index;
+    }
+    return index;
+}
+
+inline uint convertToPushIndices16(uint ID)
+{
+    const unsigned int slotID = get_sub_group_local_id();
+    uint index = 0;
+    for (uint i = 0; i < 16; i++)
+    {
+        const uint mask = intel_sub_group_ballot(ID == i);
+        const uint new_index = ctz(mask);
+        index = i == slotID ? new_index : index;
+    }
+    return index;
+}
+
+#define FLOAT_EXPONENT_MASK     (0x7F800000)  // used to be EXPONENT_MASK
+#define FLOAT_MANTISSA_MASK     (0x007FFFFF)  // used to be MANTISSA_MASK
+#define FLOAT_NEG_ONE_EXP_MASK  (0x3F000000)
+#define FLOAT_BIAS              (127)
+#define FLOAT_MANTISSA_BITS     (23)
+
+inline float3 frexp_vec3(float3 len, int3* exp)
+{
+    float3 mant = as_float3((int3)((as_int3(len) & (int3)FLOAT_MANTISSA_MASK) + (int3)FLOAT_NEG_ONE_EXP_MASK));
+    mant = select(mant, (float3)(0.5f), (int3)(mant == (float3)(1.0f)));
+    mant = copysign(mant, len);
+    *exp = ((as_int3(len) & (int3)FLOAT_EXPONENT_MASK) >> (int3)FLOAT_MANTISSA_BITS) - ((int3)FLOAT_BIAS - (int3)(1));
+    return mant;
+}
+
+
+#ifndef uniform
+#define uniform
+#endif
+
+#ifndef varying
+#define varying
+#endif
+
+uint get_sub_group_global_id()
+{
+    return get_sub_group_id() + get_num_sub_groups() * get_group_id( 0 );
+}
+
+// each lane contains the number of 1 bits below the corresponding position in 'mask'
+uint subgroup_bit_prefix_exclusive(uniform uint mask)
+{
+    varying ushort lane = get_sub_group_local_id();
+    varying uint lane_mask = (1 << lane) - 1;
+    varying uint m = mask & lane_mask;
+    return popcount(m);
+}
+
+uint bit_prefix_exclusive(uniform uint mask, varying uint lane_idx )
+{
+    varying uint lane_mask = (1 << lane_idx) - 1;
+    varying uint m = mask & lane_mask;
+    return popcount(m);
+}
+
+
+uint3 sub_group_broadcast_uint3(uint3 v, uniform ushort idx)
+{
+    return (uint3)(sub_group_broadcast(v.x,idx),
+                   sub_group_broadcast(v.y,idx),
+                   sub_group_broadcast(v.z,idx));
+}
+
+float3 sub_group_broadcast_float3(float3 v, uniform ushort idx)
+{
+    return (float3)(sub_group_broadcast(v.x, idx),
+                    sub_group_broadcast(v.y, idx),
+                    sub_group_broadcast(v.z, idx));
+}
+
+float3 sub_group_reduce_min_float3(float3 v)
+{
+    return (float3)(sub_group_reduce_min(v.x),
+                    sub_group_reduce_min(v.y),
+                    sub_group_reduce_min(v.z) );
+}
+float3 sub_group_reduce_max_float3(float3 v)
+{
+    return (float3)(sub_group_reduce_max(v.x),
+                    sub_group_reduce_max(v.y),
+                    sub_group_reduce_max(v.z));
+}
+
+float3 sub_group_shuffle_float3(float3 v, uniform ushort idx)
+{
+    return (float3)(intel_sub_group_shuffle(v.x, idx),
+                    intel_sub_group_shuffle(v.y, idx),
+                    intel_sub_group_shuffle(v.z, idx));
+}
+uint3 sub_group_shuffle_uint3(uint3 v, uniform ushort idx)
+{
+    return (uint3)( intel_sub_group_shuffle(v.x, idx),
+                    intel_sub_group_shuffle(v.y, idx),
+                    intel_sub_group_shuffle(v.z, idx));
+}
+
+
+inline uchar sub_group_reduce_or_N6(uchar val)
+{
+    val = val | intel_sub_group_shuffle_down(val, val, 4);
+    val = val | intel_sub_group_shuffle_down(val, val, 2);
+    val = val | intel_sub_group_shuffle_down(val, val, 1);
+    return sub_group_broadcast(val, 0);
+}
+
+inline uchar sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(uchar val)
+{
+    uint SIMD8_id = get_sub_group_local_id() / 8;
+    val = val | intel_sub_group_shuffle_down(val, val, 4);
+    val = val | intel_sub_group_shuffle_down(val, val, 2);
+    val = val | intel_sub_group_shuffle_down(val, val, 1);
+
+    return intel_sub_group_shuffle(val, SIMD8_id * 8);
+}
+
+
+inline __attribute__((overloadable)) uint atomic_inc_local( local uint* p )
+{
+    return atomic_fetch_add_explicit( (volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group );
+}
+
+inline __attribute__((overloadable)) int atomic_inc_local(local int* p)
+{
+    return atomic_fetch_add_explicit( (volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline __attribute__((overloadable)) uint atomic_dec_local(local uint* p)
+{
+    return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline __attribute__((overloadable)) int atomic_dec_local(local int* p)
+{
+    return atomic_fetch_sub_explicit((volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline __attribute__((overloadable)) uint atomic_sub_local(local uint* p, uint n)
+{
+    return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline __attribute__((overloadable)) int atomic_sub_local(local int* p, int n )
+{
+    return atomic_fetch_sub_explicit( (volatile local atomic_int*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_add_local( local uint* p, uint n )
+{
+    return atomic_fetch_add_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_xor_local(local uint* p, uint n)
+{
+    return atomic_fetch_xor_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_or_local(local uint* p, uint n)
+{
+    return atomic_fetch_or_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_min_local(local uint* p, uint n)
+{
+    return atomic_fetch_min_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_max_local(local uint* p, uint n)
+{
+    return atomic_fetch_max_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+
+
+
+inline uint atomic_inc_global( global uint* p )
+{
+    return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device);
+}
+
+inline uint atomic_dec_global(global uint* p)
+{
+    return atomic_fetch_sub_explicit( (volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device);
+}
+
+inline bool atomic_compare_exchange_global(global uint* p, uint* expected, uint desired)
+{
+    return atomic_compare_exchange_strong_explicit((volatile global atomic_uint*) p, expected, desired, memory_order_relaxed, memory_order_relaxed, memory_scope_device);
+}
+
+inline uint atomic_add_global( global uint* p, uint n )
+{
+    return atomic_fetch_add_explicit( (volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
+}
+
+inline uint atomic_sub_global(global uint* p, uint n)
+{
+    return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
+}
+
+inline uint atomic_or_global(global uint* p, uint n)
+{
+    return atomic_fetch_or_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
+}
+
+
+inline uint atomic_inc_global_acquire(global uint* p)
+{
+    return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_acquire, memory_scope_device);
+}
+
+
+inline uint atomic_inc_global_release(global uint* p)
+{
+    return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device);
+}
+inline uint atomic_dec_global_release(global uint* p)
+{
+    return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device);
+}
+
+inline uint generic_atomic_add(uint* p, uint val)
+{
+    if (to_global(p) != NULL)
+        return atomic_add_global(to_global(p), val);
+    if (to_local(p) != NULL)
+        return atomic_add_local(to_local(p), val);
+    return 0;
+}
+
+inline __attribute__((overloadable)) uint sub_group_reduce_max_N6( uint n )
+{
+    n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) );
+    n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) );
+    n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) );
+    return sub_group_broadcast( n, 0 );
+}
+
+inline __attribute__((overloadable)) float sub_group_reduce_max_N6( float n )
+{
+    n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) );
+    n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) );
+    n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) );
+    return sub_group_broadcast( n, 0 );
+}
+
+inline __attribute__((overloadable)) float sub_group_reduce_max_N6_2xSIMD8_in_SIMD16(float n)
+{
+    n = max(n, intel_sub_group_shuffle_down(n, n, 4));
+    n = max(n, intel_sub_group_shuffle_down(n, n, 2));
+    n = max(n, intel_sub_group_shuffle_down(n, n, 1));
+    return intel_sub_group_shuffle(n, (get_sub_group_local_id() / 8) * 8);//sub_group_broadcast(n, 0);
+}
+
+inline uint generic_atomic_inc(uint* p)
+{
+    if (to_global(p) != NULL)
+        return atomic_inc_global(to_global(p));
+    if (to_local(p) != NULL)
+        return atomic_inc(to_local(p));
+    return 0;
+}
+
+
+// Built-in GRL function which, if called in a kernel body, will force the kernel
+//  to be compiled to the minimum SIMD width supported by the platform
+void GRL_UseMinimumSIMDWidth();
+\ No newline at end of file