summaryrefslogtreecommitdiff
path: root/backend/src/libocl/tmpl/ocl_simd.tmpl.h
blob: c609c2ec71df7e7bdf4d3f2be30cf671b7de0e5e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
/*
 * Copyright © 2015 Intel Corporation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library. If not, see <http://www.gnu.org/licenses/>.
 *
 */
#ifndef __OCL_SIMD_H__
#define __OCL_SIMD_H__

#include "ocl_types.h"

/////////////////////////////////////////////////////////////////////////////
// SIMD-level (sub-group) functions
/////////////////////////////////////////////////////////////////////////////
/*
 * Sub-group predicate votes. Each function takes one int and returns an int.
 * Declarations only; the definitions are outside this header.
 * NOTE(review): the names match the cl_khr_subgroups built-ins, which return
 * non-zero when the predicate is non-zero for any / for all work-items of the
 * sub-group -- confirm the implementation follows that specification.
 */
int sub_group_any(int predicate);
int sub_group_all(int predicate);

/*
 * Takes no arguments and returns a uint. Declaration only.
 * NOTE(review): presumably reports the SIMD width the kernel was compiled
 * for -- confirm against the implementation.
 */
uint get_simd_size(void);

/*
 * Sub-group geometry queries. Every function takes no arguments and returns
 * a uint. Declarations only; the definitions are outside this header.
 * NOTE(review): the names match the cl_khr_subgroups work-item built-ins
 * (size of this sub-group, maximum sub-group size, number of sub-groups in
 * the work-group, this sub-group's id, and the work-item's id within its
 * sub-group) -- confirm the implementation follows that specification.
 */
uint get_sub_group_size(void);
uint get_max_sub_group_size(void);
uint get_num_sub_groups(void);
uint get_sub_group_id(void);
uint get_sub_group_local_id(void);

/*
 * broadcast
 *
 * sub_group_broadcast(a, local_id) is declared once per scalar type; the
 * return type always equals the type of `a`. intel_sub_group_broadcast is
 * declared for short and ushort only.
 * Declarations only; the definitions are outside this header.
 * NOTE(review): presumably returns the value of `a` held by the work-item
 * whose sub-group local id is `local_id` -- confirm against the
 * implementation.
 */

/* integer overloads */
OVERLOADABLE short sub_group_broadcast(short a, uint local_id);
OVERLOADABLE ushort sub_group_broadcast(ushort a, uint local_id);
OVERLOADABLE int sub_group_broadcast(int a, uint local_id);
OVERLOADABLE uint sub_group_broadcast(uint a, uint local_id);
OVERLOADABLE long sub_group_broadcast(long a, uint local_id);
OVERLOADABLE ulong sub_group_broadcast(ulong a, uint local_id);

/* floating-point overloads */
OVERLOADABLE half sub_group_broadcast(half a, uint local_id);
OVERLOADABLE float sub_group_broadcast(float a, uint local_id);
OVERLOADABLE double sub_group_broadcast(double a, uint local_id);

/* intel_-prefixed variants (short / ushort only) */
OVERLOADABLE short intel_sub_group_broadcast(short a, uint local_id);
OVERLOADABLE ushort intel_sub_group_broadcast(ushort a, uint local_id);
/*
 * reduce add
 *
 * sub_group_reduce_add(x): one overload each for int, uint, long, ulong,
 * half, float and double; the return type equals the argument type.
 * Declarations only; the definitions are outside this header.
 * NOTE(review): presumably the cl_khr_subgroups reduction (sum of x over the
 * work-items of the sub-group) -- confirm against the implementation.
 */
OVERLOADABLE int sub_group_reduce_add(int x);
OVERLOADABLE uint sub_group_reduce_add(uint x);
OVERLOADABLE long sub_group_reduce_add(long x);
OVERLOADABLE ulong sub_group_reduce_add(ulong x);
OVERLOADABLE half sub_group_reduce_add(half x);
OVERLOADABLE float sub_group_reduce_add(float x);
OVERLOADABLE double sub_group_reduce_add(double x);

/*
 * reduce min
 *
 * sub_group_reduce_min(x): one overload each for int, uint, long, ulong,
 * half, float and double; the return type equals the argument type.
 * Declarations only; the definitions are outside this header.
 * NOTE(review): presumably the cl_khr_subgroups reduction (minimum of x over
 * the work-items of the sub-group) -- confirm against the implementation.
 */
OVERLOADABLE int sub_group_reduce_min(int x);
OVERLOADABLE uint sub_group_reduce_min(uint x);
OVERLOADABLE long sub_group_reduce_min(long x);
OVERLOADABLE ulong sub_group_reduce_min(ulong x);
OVERLOADABLE half sub_group_reduce_min(half x);
OVERLOADABLE float sub_group_reduce_min(float x);
OVERLOADABLE double sub_group_reduce_min(double x);

/*
 * reduce max
 *
 * sub_group_reduce_max(x): one overload each for int, uint, long, ulong,
 * half, float and double; the return type equals the argument type.
 * Declarations only; the definitions are outside this header.
 * NOTE(review): presumably the cl_khr_subgroups reduction (maximum of x over
 * the work-items of the sub-group) -- confirm against the implementation.
 */
OVERLOADABLE int sub_group_reduce_max(int x);
OVERLOADABLE uint sub_group_reduce_max(uint x);
OVERLOADABLE long sub_group_reduce_max(long x);
OVERLOADABLE ulong sub_group_reduce_max(ulong x);
OVERLOADABLE half sub_group_reduce_max(half x);
OVERLOADABLE float sub_group_reduce_max(float x);
OVERLOADABLE double sub_group_reduce_max(double x);

/*
 * scan_inclusive add
 *
 * sub_group_scan_inclusive_add(x): one overload each for int, uint, long,
 * ulong, half, float and double; the return type equals the argument type.
 * Declarations only; the definitions are outside this header.
 * NOTE(review): presumably the cl_khr_subgroups inclusive prefix sum (the
 * calling work-item's own x is included in its result) -- confirm against
 * the implementation.
 */
OVERLOADABLE int sub_group_scan_inclusive_add(int x);
OVERLOADABLE uint sub_group_scan_inclusive_add(uint x);
OVERLOADABLE long sub_group_scan_inclusive_add(long x);
OVERLOADABLE ulong sub_group_scan_inclusive_add(ulong x);
OVERLOADABLE half sub_group_scan_inclusive_add(half x);
OVERLOADABLE float sub_group_scan_inclusive_add(float x);
OVERLOADABLE double sub_group_scan_inclusive_add(double x);

/*
 * scan_inclusive min
 *
 * sub_group_scan_inclusive_min(x): one overload each for int, uint, long,
 * ulong, half, float and double; the return type equals the argument type.
 * Declarations only; the definitions are outside this header.
 * NOTE(review): presumably the cl_khr_subgroups inclusive prefix minimum
 * (the calling work-item's own x is included in its result) -- confirm
 * against the implementation.
 */
OVERLOADABLE int sub_group_scan_inclusive_min(int x);
OVERLOADABLE uint sub_group_scan_inclusive_min(uint x);
OVERLOADABLE long sub_group_scan_inclusive_min(long x);
OVERLOADABLE ulong sub_group_scan_inclusive_min(ulong x);
OVERLOADABLE half sub_group_scan_inclusive_min(half x);
OVERLOADABLE float sub_group_scan_inclusive_min(float x);
OVERLOADABLE double sub_group_scan_inclusive_min(double x);

/*
 * scan_inclusive max
 *
 * sub_group_scan_inclusive_max(x): one overload each for int, uint, long,
 * ulong, half, float and double; the return type equals the argument type.
 * Declarations only; the definitions are outside this header.
 * NOTE(review): presumably the cl_khr_subgroups inclusive prefix maximum
 * (the calling work-item's own x is included in its result) -- confirm
 * against the implementation.
 */
OVERLOADABLE int sub_group_scan_inclusive_max(int x);
OVERLOADABLE uint sub_group_scan_inclusive_max(uint x);
OVERLOADABLE long sub_group_scan_inclusive_max(long x);
OVERLOADABLE ulong sub_group_scan_inclusive_max(ulong x);
OVERLOADABLE half sub_group_scan_inclusive_max(half x);
OVERLOADABLE float sub_group_scan_inclusive_max(float x);
OVERLOADABLE double sub_group_scan_inclusive_max(double x);

/*
 * scan_exclusive add
 *
 * sub_group_scan_exclusive_add(x): one overload each for int, uint, long,
 * ulong, half, float and double; the return type equals the argument type.
 * Declarations only; the definitions are outside this header.
 * NOTE(review): presumably the cl_khr_subgroups exclusive prefix sum (the
 * calling work-item's own x is NOT included in its result) -- confirm
 * against the implementation, including the value returned to the first
 * work-item.
 */
OVERLOADABLE int sub_group_scan_exclusive_add(int x);
OVERLOADABLE uint sub_group_scan_exclusive_add(uint x);
OVERLOADABLE long sub_group_scan_exclusive_add(long x);
OVERLOADABLE ulong sub_group_scan_exclusive_add(ulong x);
OVERLOADABLE half sub_group_scan_exclusive_add(half x);
OVERLOADABLE float sub_group_scan_exclusive_add(float x);
OVERLOADABLE double sub_group_scan_exclusive_add(double x);

/*
 * scan_exclusive min
 *
 * sub_group_scan_exclusive_min(x): one overload each for int, uint, long,
 * ulong, half, float and double; the return type equals the argument type.
 * Declarations only; the definitions are outside this header.
 * NOTE(review): presumably the cl_khr_subgroups exclusive prefix minimum
 * (the calling work-item's own x is NOT included in its result) -- confirm
 * against the implementation, including the value returned to the first
 * work-item.
 */
OVERLOADABLE int sub_group_scan_exclusive_min(int x);
OVERLOADABLE uint sub_group_scan_exclusive_min(uint x);
OVERLOADABLE long sub_group_scan_exclusive_min(long x);
OVERLOADABLE ulong sub_group_scan_exclusive_min(ulong x);
OVERLOADABLE half sub_group_scan_exclusive_min(half x);
OVERLOADABLE float sub_group_scan_exclusive_min(float x);
OVERLOADABLE double sub_group_scan_exclusive_min(double x);

/*
 * scan_exclusive max
 *
 * sub_group_scan_exclusive_max(x): one overload each for int, uint, long,
 * ulong, half, float and double; the return type equals the argument type.
 * Declarations only; the definitions are outside this header.
 * NOTE(review): presumably the cl_khr_subgroups exclusive prefix maximum
 * (the calling work-item's own x is NOT included in its result) -- confirm
 * against the implementation, including the value returned to the first
 * work-item.
 */
OVERLOADABLE int sub_group_scan_exclusive_max(int x);
OVERLOADABLE uint sub_group_scan_exclusive_max(uint x);
OVERLOADABLE long sub_group_scan_exclusive_max(long x);
OVERLOADABLE ulong sub_group_scan_exclusive_max(ulong x);
OVERLOADABLE half sub_group_scan_exclusive_max(half x);
OVERLOADABLE float sub_group_scan_exclusive_max(float x);
OVERLOADABLE double sub_group_scan_exclusive_max(double x);

/*
 * shuffle
 *
 * Four families are declared:
 *   intel_sub_group_shuffle(x, c)         - one value plus a uint
 *   intel_sub_group_shuffle_down(x, y, c) - two values of one type plus a uint
 *   intel_sub_group_shuffle_up(x, y, c)   - two values of one type plus a uint
 *   intel_sub_group_shuffle_xor(x, c)     - one value plus a uint
 * Every overload returns the same type as its value argument(s).
 * Declarations only; the definitions are outside this header.
 * NOTE(review): the names match the cl_intel_subgroups shuffle built-ins,
 * where the uint selects the source work-item (an id, a delta, or an xor
 * mask depending on the family) -- confirm against the implementation.
 * NOTE(review): a half overload is declared for intel_sub_group_shuffle
 * only; the _down/_up/_xor families cover float, int and uint -- confirm
 * this asymmetry is intentional.
 */
OVERLOADABLE half intel_sub_group_shuffle(half x, uint c);
OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
OVERLOADABLE float intel_sub_group_shuffle_down(float x, float y, uint c);
OVERLOADABLE int intel_sub_group_shuffle_down(int x, int y, uint c);
OVERLOADABLE uint intel_sub_group_shuffle_down(uint x, uint y, uint c);
OVERLOADABLE float intel_sub_group_shuffle_up(float x, float y, uint c);
OVERLOADABLE int intel_sub_group_shuffle_up(int x, int y, uint c);
OVERLOADABLE uint intel_sub_group_shuffle_up(uint x, uint y, uint c);
OVERLOADABLE float intel_sub_group_shuffle_xor(float x, uint c);
OVERLOADABLE int intel_sub_group_shuffle_xor(int x, uint c);
OVERLOADABLE uint intel_sub_group_shuffle_xor(uint x, uint c);

/*
 * block read/write
 *
 * Buffer block reads: each function takes a pointer to const global uint and
 * returns uint, uint2, uint4 or uint8 according to the numeric suffix.
 * Declarations only; the definitions are outside this header.
 * NOTE(review): names match the cl_intel_subgroups block-read built-ins;
 * any alignment requirement on p is not visible here -- confirm against the
 * extension specification.
 */
OVERLOADABLE uint intel_sub_group_block_read(const global uint* p);
OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p);
OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p);
OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p);

/*
 * Buffer block writes: each function takes a __global uint pointer plus a
 * uint, uint2, uint4 or uint8 value (according to the numeric suffix) and
 * returns nothing. Declarations only; the definitions are outside this
 * header.
 * NOTE(review): p is declared `const __global uint*` even though these are
 * write functions; the cl_intel_subgroups extension appears to declare the
 * destination as a non-const `__global uint*` -- confirm. The qualifier is
 * part of the parameter type, so removing it has to be done together with
 * the matching definitions; it is deliberately left unchanged here.
 */
OVERLOADABLE void intel_sub_group_block_write(const __global uint* p, uint data);
OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p, uint2 data);
OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p, uint4 data);
OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p, uint8 data);

/*
 * Image block reads: same function names as the buffer variants, overloaded
 * on (image2d_t, int2). Each returns uint, uint2, uint4 or uint8 according
 * to the numeric suffix. Declarations only.
 * NOTE(review): going by the parameter name, byte_coord is presumably a
 * coordinate expressed in bytes rather than pixels -- confirm against the
 * cl_intel_subgroups specification.
 */
OVERLOADABLE uint intel_sub_group_block_read(image2d_t image, int2 byte_coord);
OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t image, int2 byte_coord);
OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t image, int2 byte_coord);
OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t image, int2 byte_coord);

/*
 * Image block writes: same function names as the buffer variants, overloaded
 * on (image2d_t, int2, data). The data argument is uint, uint2, uint4 or
 * uint8 according to the numeric suffix; nothing is returned.
 * Declarations only.
 * NOTE(review): image carries no explicit access qualifier in these
 * prototypes -- confirm which qualifier the definitions and callers expect.
 * NOTE(review): going by the parameter name, byte_coord is presumably a
 * coordinate expressed in bytes rather than pixels -- confirm against the
 * cl_intel_subgroups specification.
 */
OVERLOADABLE void intel_sub_group_block_write(image2d_t image, int2 byte_coord, uint data);
OVERLOADABLE void intel_sub_group_block_write2(image2d_t image, int2 byte_coord, uint2 data);
OVERLOADABLE void intel_sub_group_block_write4(image2d_t image, int2 byte_coord, uint4 data);
OVERLOADABLE void intel_sub_group_block_write8(image2d_t image, int2 byte_coord, uint8 data);