path: root/backend/src/libocl/src/ocl_vload.cl
/*
 * Copyright © 2012 - 2014 Intel Corporation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library. If not, see <http://www.gnu.org/licenses/>.
 *
 */
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#include "ocl_vload.h"
#include "ocl_relational.h"

// These loads and stores use untyped reads and writes, so we can simply cast
// to vector loads / stores. The casts are not C99 compliant (they violate the
// strict-aliasing rules), but that is harmless here: we do not enable TBAA in
// the compiler.
#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
  return *(SPACE TYPE##DIM *) (p + DIM * offset); \
} \
OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
  *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
}
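// For reference, DECL_UNTYPED_RW_SPACE_N(float, 4, __global) expands to:
//   OVERLOADABLE float4 vload4(size_t offset, const __global float *p) {
//     return *(__global float4 *) (p + 4 * offset);
//   }
//   OVERLOADABLE void vstore4(float4 v, size_t offset, __global float *p) {
//     *(__global float4 *) (p + 4 * offset) = v;
//   }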

#define DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \
OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
  return *(SPACE TYPE##DIM *) (p + DIM * offset); \
}
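
// The 3-element variants below cannot reuse the pointer-cast trick: in OpenCL
// C a 3-element vector has the size and alignment of a 4-element vector, so a
// single cast would touch one element past the packed 3-element slot. They
// are expanded element by element instead.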

#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
  *(p + 3 * offset) = v.s0; \
  *(p + 3 * offset + 1) = v.s1; \
  *(p + 3 * offset + 2) = v.s2; \
} \
OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
  return (TYPE##3)(*(p + 3 * offset), *(p + 3 * offset + 1), *(p + 3 * offset + 2));\
}

#define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
  return (TYPE##3)(*(p + 3 * offset), *(p + 3 * offset + 1), *(p + 3 * offset + 2));\
}

#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
  DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
  DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
  DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
  DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
  DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)

#define DECL_UNTYPED_RD_ALL_SPACE(TYPE, SPACE) \
  DECL_UNTYPED_RD_SPACE_N(TYPE, 2, SPACE) \
  DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
  DECL_UNTYPED_RD_SPACE_N(TYPE, 4, SPACE) \
  DECL_UNTYPED_RD_SPACE_N(TYPE, 8, SPACE) \
  DECL_UNTYPED_RD_SPACE_N(TYPE, 16, SPACE)

#define DECL_UNTYPED_RW_ALL(TYPE) \
  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \
  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \
  DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
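
// The sub-word types (char/uchar/short/ushort and half) do not use the
// pointer-cast trick either; below they are read and written element by
// element, with the wider vectors assembled from the narrower loads/stores.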

#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \
OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p) { \
  return (TYPE##2)(*(p+2*offset), *(p+2*offset+1)); \
} \
OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
  return (TYPE##3)(*(p+3*offset), *(p+3*offset+1), *(p+3*offset+2)); \
} \
OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p) { \
  return (TYPE##4)(vload2(2*offset, p), vload2(2*offset, p+2)); \
} \
OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p) { \
  return (TYPE##8)(vload4(2*offset, p), vload4(2*offset, p+4)); \
} \
OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p) { \
  return (TYPE##16)(vload8(2*offset, p), vload8(2*offset, p+8)); \
}

#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \
OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p) {\
  *(p + 2 * offset) = v.s0; \
  *(p + 2 * offset + 1) = v.s1; \
} \
OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
  *(p + 3 * offset) = v.s0; \
  *(p + 3 * offset + 1) = v.s1; \
  *(p + 3 * offset + 2) = v.s2; \
} \
OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p) { \
  vstore2(v.lo, 2*offset, p); \
  vstore2(v.hi, 2*offset, p+2); \
} \
OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p) { \
  vstore4(v.lo, 2*offset, p); \
  vstore4(v.hi, 2*offset, p+4); \
} \
OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \
  vstore8(v.lo, 2*offset, p); \
  vstore8(v.hi, 2*offset, p+8); \
}

#define DECL_BYTE_RW_ALL(TYPE) \
  DECL_BYTE_RD_SPACE(TYPE, __global) \
  DECL_BYTE_RD_SPACE(TYPE, __local) \
  DECL_BYTE_RD_SPACE(TYPE, __private) \
  DECL_BYTE_RD_SPACE(TYPE, __constant) \
  DECL_BYTE_WR_SPACE(TYPE, __global) \
  DECL_BYTE_WR_SPACE(TYPE, __local) \
  DECL_BYTE_WR_SPACE(TYPE, __private)
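
// Instantiate vload{2,3,4,8,16} / vstore{2,3,4,8,16} for every scalar type
// and address space. __constant only gets the loads, since it is read-only.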

DECL_BYTE_RW_ALL(char)
DECL_BYTE_RW_ALL(half)
DECL_BYTE_RW_ALL(uchar)
DECL_BYTE_RW_ALL(short)
DECL_BYTE_RW_ALL(ushort)
DECL_UNTYPED_RW_ALL(int)
DECL_UNTYPED_RW_ALL(uint)
DECL_UNTYPED_RW_ALL(long)
DECL_UNTYPED_RW_ALL(ulong)
DECL_UNTYPED_RW_ALL(float)
DECL_UNTYPED_RW_ALL(double)

#undef DECL_UNTYPED_RW_ALL
#undef DECL_UNTYPED_RW_ALL_SPACE
#undef DECL_UNTYPED_RD_ALL_SPACE
#undef DECL_UNTYPED_RW_SPACE_N
#undef DECL_UNTYPED_RD_SPACE_N
#undef DECL_UNTYPED_V3_SPACE
#undef DECL_UNTYPED_RDV3_SPACE
#undef DECL_BYTE_RD_SPACE
#undef DECL_BYTE_WR_SPACE
#undef DECL_BYTE_RW_ALL

PURE CONST float __gen_ocl_f16to32(short h);
PURE CONST short __gen_ocl_f32to16(float f);
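
// __gen_ocl_f32to16 converts with round-to-nearest-even (the default mode).
// The helpers below derive the other rounding modes from it: they convert the
// result back to float, and if it landed on the wrong side of the input they
// adjust the half bit pattern by one ulp. Half is a sign-magnitude format, so
// adding 1 to the bit pattern of a positive value (or subtracting 1 from a
// negative one) moves it away from zero; the signbit(f) terms, which evaluate
// to 1 for negative f and 0 otherwise, pick the direction of the adjustment.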

OVERLOADABLE short f32to16_rtp(float f) {
  short s = __gen_ocl_f32to16(f);
  float con = __gen_ocl_f16to32(s);
  //if(isinf(con)) return s;
  if (f > con)
    return s - signbit(f) * 2 + 1;
  else
    return s;
}

OVERLOADABLE short f32to16_rtn(float f) {
  short s = __gen_ocl_f32to16(f);
  float con = __gen_ocl_f16to32(s);
  //if(isinf(con)) return s;
  if (con > f)
    return s + signbit(f) * 2 - 1;
  else
    return s;
}

OVERLOADABLE short f32to16_rtz(float f) {
  short s = __gen_ocl_f32to16(f);
  float con = __gen_ocl_f16to32(s);
  //if(isinf(con)) return s;
  if (((con > f) && !signbit(f)) ||
      ((con < f) && signbit(f)))
    return s - 1;
  else
    return s;
}
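
// vload_half* loads half data and widens it to float. The vloada_half*
// ("aligned") variants only differ for the 3-element case: vload_half3 reads
// a packed slot at offset*3, while vloada_half3 indexes offset*4 because the
// aligned layout pads every 3-element slot to 4 elements.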

#define DECL_HALF_LD_SPACE(SPACE) \
OVERLOADABLE float vload_half(size_t offset, const SPACE half *p) { \
  return __gen_ocl_f16to32(*(SPACE short *)(p + offset)); \
} \
OVERLOADABLE float vloada_half(size_t offset, const SPACE half *p) { \
  return vload_half(offset, p); \
} \
OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p) { \
  return (float2)(vload_half(offset*2, p), \
                  vload_half(offset*2 + 1, p)); \
} \
OVERLOADABLE float2 vloada_half2(size_t offset, const SPACE half *p) { \
  return (float2)(vloada_half(offset*2, p), \
                  vloada_half(offset*2 + 1, p)); \
} \
OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p) { \
  return (float3)(vload_half(offset*3, p), \
                  vload_half(offset*3 + 1, p), \
                  vload_half(offset*3 + 2, p)); \
} \
OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p) { \
  return (float3)(vload_half(offset*4, p), \
                  vload_half(offset*4 + 1, p), \
                  vload_half(offset*4 + 2, p)); \
} \
OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p) { \
  return (float4)(vload_half2(offset*2, p), \
                  vload_half2(offset*2 + 1, p)); \
} \
OVERLOADABLE float4 vloada_half4(size_t offset, const SPACE half *p) { \
  return (float4)(vloada_half2(offset*2, p), \
                  vloada_half2(offset*2 + 1, p)); \
} \
OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p) { \
  return (float8)(vload_half4(offset*2, p), \
                  vload_half4(offset*2 + 1, p)); \
} \
OVERLOADABLE float8 vloada_half8(size_t offset, const SPACE half *p) { \
  return (float8)(vloada_half4(offset*2, p), \
                  vloada_half4(offset*2 + 1, p)); \
} \
OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p) { \
  return (float16)(vload_half8(offset*2, p), \
                   vload_half8(offset*2 + 1, p)); \
}\
OVERLOADABLE float16 vloada_half16(size_t offset, const SPACE half *p) { \
  return (float16)(vloada_half8(offset*2, p), \
                   vloada_half8(offset*2 + 1, p)); \
}
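
// Each store macro pastes ROUND onto the function names and converts through
// FUNC, so every rounding mode gets a full family of vstore_half* /
// vstorea_half* overloads. As with the loads, only the 3-element vstorea
// variant differs from its vstore counterpart (offset*4 vs. offset*3).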

#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \
OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p) { \
  *(SPACE short *)(p + offset) = FUNC(data); \
} \
OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p) { \
  vstore_half##ROUND(data, offset, p); \
} \
OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
  vstore_half##ROUND(data.lo, offset*2, p); \
  vstore_half##ROUND(data.hi, offset*2 + 1, p); \
} \
OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
  vstore_half2##ROUND(data, offset, p); \
} \
OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
  vstore_half##ROUND(data.s0, offset*3, p); \
  vstore_half##ROUND(data.s1, offset*3 + 1, p); \
  vstore_half##ROUND(data.s2, offset*3 + 2, p); \
} \
OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
  vstore_half##ROUND(data.s0, offset*4, p); \
  vstore_half##ROUND(data.s1, offset*4 + 1, p); \
  vstore_half##ROUND(data.s2, offset*4 + 2, p); \
} \
OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
  vstore_half2##ROUND(data.lo, offset*2, p); \
  vstore_half2##ROUND(data.hi, offset*2 + 1, p); \
} \
OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
  vstore_half4##ROUND(data, offset, p); \
} \
OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
  vstore_half4##ROUND(data.lo, offset*2, p); \
  vstore_half4##ROUND(data.hi, offset*2 + 1, p); \
} \
OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
  vstore_half8##ROUND(data, offset, p); \
} \
OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
  vstore_half8##ROUND(data.lo, offset*2, p); \
  vstore_half8##ROUND(data.hi, offset*2 + 1, p); \
} \
OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
  vstore_half16##ROUND(data, offset, p); \
}
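
// Both the default (no suffix) and _rte stores convert with
// __gen_ocl_f32to16 directly, since it already rounds to nearest even; the
// other modes go through the adjusting helpers above.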

#define DECL_HALF_ST_SPACE(SPACE) \
  DECL_HALF_ST_SPACE_ROUND(SPACE,  , __gen_ocl_f32to16) \
  DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, __gen_ocl_f32to16) \
  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, f32to16_rtz) \
  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, f32to16_rtp) \
  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, f32to16_rtn)

DECL_HALF_LD_SPACE(__global)
DECL_HALF_LD_SPACE(__local)
DECL_HALF_LD_SPACE(__constant)
DECL_HALF_LD_SPACE(__private)

DECL_HALF_ST_SPACE(__global)
DECL_HALF_ST_SPACE(__local)
DECL_HALF_ST_SPACE(__private)
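
// Illustrative usage (hypothetical kernel, not part of this file):
//   __kernel void copy3(__global const float *src, __global float *dst) {
//     size_t i = get_global_id(0);
//     float3 v = vload3(i, src); // reads src[3*i], src[3*i+1], src[3*i+2]
//     vstore3(v, i, dst);        // writes dst[3*i .. 3*i+2]
//   }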

#undef DECL_HALF_LD_SPACE
#undef DECL_HALF_ST_SPACE
#undef DECL_HALF_ST_SPACE_ROUND