summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYang Rong <rong.r.yang@intel.com>2016-10-19 10:35:46 +0800
committerYang Rong <rong.r.yang@intel.com>2016-11-08 20:38:21 +0800
commita606edb2357b3e3d16cc8b99445f4c5c5f3f6d0b (patch)
treed4e5de88a26a59092f792918d0656ca5c5d5fd57
parent74f2113c479a3a85525e07ad5d410f83218dbadb (diff)
libocl: change prototype of vload/vstore to match ocl2.0 spec.
The OpenCL 2.0 spec declares vload only for the generic and constant address spaces, and vstore only for the generic address space; the private/global/local versions are all removed. Signed-off-by: Ruiling Song <ruiling.song@intel.com> Reviewed-by: Yang Rong <rong.r.yang@intel.com>
-rw-r--r--backend/src/libocl/CMakeLists.txt56
-rw-r--r--backend/src/libocl/include/ocl.h4
-rw-r--r--backend/src/libocl/include/ocl_vload_20.h150
-rw-r--r--backend/src/libocl/src/ocl_vload_20.cl284
4 files changed, 482 insertions, 12 deletions
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 1d1ec680..ab5305b3 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -2,6 +2,8 @@ PROJECT(LIBOCL)
SET (OCL_OBJECT_DIR ${LIBOCL_BINARY_DIR}/${BEIGNET_INSTALL_DIR})
SET (OCL_HEADER_FILES ${OCL_OBJECT_DIR}/include/ocl_defines.h)
SET (OCL_SOURCE_FILES "")
+SET (OCL_SOURCE_FILES_12 "")
+SET (OCL_SOURCE_FILES_20 "")
ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/include/ocl_defines.h
COMMAND mkdir -p ${OCL_OBJECT_DIR}/include/
@@ -30,11 +32,11 @@ MACRO(COPY_THE_HEADER _mod)
)
ENDIF(orgin_name STREQUAL output_name)
ENDMACRO(COPY_THE_HEADER)
-MACRO(COPY_THE_SOURCE _mod)
+MACRO(COPY_THE_SOURCE _source _mod)
# Use the python script to generate the header files.
STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_BINARY_DIR}/src/\\1.cl" output_name ${_mod})
STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/src/\\1.cl" orgin_name ${_mod})
- SET(OCL_SOURCE_FILES ${OCL_SOURCE_FILES} ${output_name})
+ SET(${_source} ${${_source}} ${output_name})
IF(orgin_name STREQUAL output_name)
ELSE(orgin_name STREQUAL output_name)
ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
@@ -50,14 +52,26 @@ ENDMACRO(COPY_THE_SOURCE)
SET (OCL_COPY_HEADERS ocl ocl_types ocl_float ocl_printf)
FOREACH(M ${OCL_COPY_HEADERS})
COPY_THE_HEADER(${M})
-ENDFOREACH(M)
+ENDFOREACH(M)
SET (OCL_COPY_MODULES ocl_workitem ocl_atom ocl_async ocl_sync ocl_memcpy
- ocl_memset ocl_misc ocl_vload ocl_geometric ocl_image ocl_work_group)
+ ocl_memset ocl_misc ocl_geometric ocl_image ocl_work_group)
FOREACH(M ${OCL_COPY_MODULES})
COPY_THE_HEADER(${M})
- COPY_THE_SOURCE(${M})
-ENDFOREACH(M)
+ COPY_THE_SOURCE(OCL_SOURCE_FILES ${M})
+ENDFOREACH(M)
+
+SET (OCL_COPY_MODULES_12 ocl_vload) # modules whose copied sources are collected in OCL_SOURCE_FILES_12
+FOREACH(M ${OCL_COPY_MODULES_12})
+ COPY_THE_HEADER(${M})
+ COPY_THE_SOURCE(OCL_SOURCE_FILES_12 ${M})
+ENDFOREACH(M)
+
+SET (OCL_COPY_MODULES_20 ocl_vload_20) # modules whose copied sources are collected in OCL_SOURCE_FILES_20
+FOREACH(M ${OCL_COPY_MODULES_20})
+ COPY_THE_HEADER(${M})
+ COPY_THE_SOURCE(OCL_SOURCE_FILES_20 ${M})
+ENDFOREACH(M)
MACRO(GENERATE_HEADER_PY _mod)
@@ -146,13 +160,21 @@ ENDMACRO(ADD_CL_TO_BC_TARGET)
FOREACH(f ${OCL_SOURCE_FILES})
ADD_CL_TO_BC_TARGET(${f})
-ENDFOREACH(f)
-
-FOREACH(f ${OCL_SOURCE_FILES})
STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
SET(OCL_BC_FILES ${OCL_BC_FILES} ${bc_name})
-ENDFOREACH(f)
+ENDFOREACH(f)
+FOREACH(f ${OCL_SOURCE_FILES_12}) # these .bc names are linked into beignet.bc together with OCL_BC_FILES
+ ADD_CL_TO_BC_TARGET(${f})
+ STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
+ SET(OCL_BC_FILES_12 ${OCL_BC_FILES_12} ${bc_name})
+ENDFOREACH(f)
+
+FOREACH(f ${OCL_SOURCE_FILES_20}) # these .bc names are linked into beignet_20.bc together with OCL_BC_FILES
+ ADD_CL_TO_BC_TARGET(${f})
+ STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
+ SET(OCL_BC_FILES_20 ${OCL_BC_FILES_20} ${bc_name})
+ENDFOREACH(f)
# handle the ll files
MACRO(COPY_THE_LL _mod)
@@ -194,11 +216,21 @@ ENDFOREACH(f)
ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.bc
COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/lib/
#COMMAND echo llvm-link -o ${LIBOCL_BINARY_DIR}/lib/beignet.bc ${OCL_BC_FILES}
- COMMAND ${LLVM_LINK_EXECUTABLE} -o ${OCL_OBJECT_DIR}/beignet.bc ${OCL_BC_FILES}
- DEPENDS ${OCL_BC_FILES}
+ COMMAND ${LLVM_LINK_EXECUTABLE} -o ${OCL_OBJECT_DIR}/beignet.bc ${OCL_BC_FILES} ${OCL_BC_FILES_12}
+ DEPENDS ${OCL_BC_FILES} ${OCL_BC_FILES_12}
COMMENT "Generate the bitcode file: ${OCL_OBJECT_DIR}/beignet.bc"
)
+if (ENABLE_OPENCL_20) # the beignet_20.bc rule is only defined when ENABLE_OPENCL_20 evaluates true
+ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet_20.bc
+ COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/lib/
+ #COMMAND echo llvm-link -o ${OCL_OBJECT_DIR}/beignet_20.bc ${OCL_BC_FILES} ${OCL_BC_FILES_20}
+ COMMAND ${LLVM_LINK_EXECUTABLE} -o ${OCL_OBJECT_DIR}/beignet_20.bc ${OCL_BC_FILES} ${OCL_BC_FILES_20}
+ DEPENDS ${OCL_BC_FILES} ${OCL_BC_FILES_20}
+ COMMENT "Generate the bitcode file: ${OCL_OBJECT_DIR}/beignet_20.bc"
+ )
+endif (ENABLE_OPENCL_20)
+
ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.local.pch
COMMAND mkdir -p ${OCL_OBJECT_DIR}
COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ -emit-pch -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.local.pch
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index 6230b932..060e1e4e 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -97,7 +97,11 @@
#include "ocl_printf.h"
#include "ocl_relational.h"
#include "ocl_sync.h"
+#if (__OPENCL_C_VERSION__ >= 200) /* 2.0 and later: vload for __generic/__constant, vstore for __generic only */
+#include "ocl_vload_20.h"
+#else /* earlier language versions keep the existing header */
#include "ocl_vload.h"
+#endif
#include "ocl_workitem.h"
#include "ocl_simd.h"
#include "ocl_work_group.h"
diff --git a/backend/src/libocl/include/ocl_vload_20.h b/backend/src/libocl/include/ocl_vload_20.h
new file mode 100644
index 00000000..636f94d9
--- /dev/null
+++ b/backend/src/libocl/include/ocl_vload_20.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_VLOAD_20_H__
+#define __OCL_VLOAD_20_H__
+
+#include "ocl_types.h"
+
+/////////////////////////////////////////////////////////////////////////////
+// Vector loads and stores
+/////////////////////////////////////////////////////////////////////////////
+
+// OpenCL 2.0 prototypes (definitions: ocl_vload_20.cl): vloadN is declared for
+// __generic and __constant pointers, vstoreN for __generic only. The include
+// guard is specific to this header so it cannot shadow another header's guard.
+#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p); \
+OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p);
+
+#define DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p);
+
+#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p);
+
+#define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p);
+
+#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
+ DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RD_ALL_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 2, SPACE) \
+ DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 4, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 8, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RW_ALL(TYPE) \
+ DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
+ DECL_UNTYPED_RW_ALL_SPACE(TYPE, __generic)
+
+#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \
+OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p); \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p); \
+OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p); \
+OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p); \
+OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p);
+
+#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \
+OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p);
+
+#define DECL_BYTE_RW_ALL(TYPE) \
+ DECL_BYTE_RD_SPACE(TYPE, __generic) \
+ DECL_BYTE_WR_SPACE(TYPE, __generic) \
+ DECL_BYTE_RD_SPACE(TYPE, __constant)
+
+DECL_BYTE_RW_ALL(char)
+DECL_BYTE_RW_ALL(uchar)
+DECL_BYTE_RW_ALL(short)
+DECL_BYTE_RW_ALL(ushort)
+DECL_BYTE_RW_ALL(half)
+DECL_UNTYPED_RW_ALL(int)
+DECL_UNTYPED_RW_ALL(uint)
+DECL_UNTYPED_RW_ALL(long)
+DECL_UNTYPED_RW_ALL(ulong)
+DECL_UNTYPED_RW_ALL(float)
+DECL_UNTYPED_RW_ALL(double)
+
+#undef DECL_UNTYPED_RW_ALL
+#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_UNTYPED_RD_ALL_SPACE
+#undef DECL_UNTYPED_RW_SPACE_N
+#undef DECL_UNTYPED_RD_SPACE_N
+#undef DECL_UNTYPED_V3_SPACE
+#undef DECL_UNTYPED_RDV3_SPACE
+#undef DECL_BYTE_RD_SPACE
+#undef DECL_BYTE_WR_SPACE
+#undef DECL_BYTE_RW_ALL
+
+
+#define DECL_HALF_LD_SPACE(SPACE) \
+OVERLOADABLE float vload_half(size_t offset, const SPACE half *p); \
+OVERLOADABLE float vloada_half(size_t offset, const SPACE half *p); \
+OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p); \
+OVERLOADABLE float2 vloada_half2(size_t offset, const SPACE half *p); \
+OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p); \
+OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p); \
+OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p); \
+OVERLOADABLE float4 vloada_half4(size_t offset, const SPACE half *p); \
+OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p); \
+OVERLOADABLE float8 vloada_half8(size_t offset, const SPACE half *p); \
+OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p); \
+OVERLOADABLE float16 vloada_half16(size_t offset, const SPACE half *p);
+
+#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \
+OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p);
+
+#define DECL_HALF_ST_SPACE(SPACE) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, , dummy) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, dummy) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, dummy) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, dummy) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, dummy)
+
+DECL_HALF_LD_SPACE(__constant)
+DECL_HALF_LD_SPACE(__generic)
+
+DECL_HALF_ST_SPACE(__generic)
+
+// The helper macros are only needed to stamp out the prototypes above.
+#undef DECL_HALF_LD_SPACE
+#undef DECL_HALF_ST_SPACE
+#undef DECL_HALF_ST_SPACE_ROUND
+
+#endif /* __OCL_VLOAD_20_H__ */
diff --git a/backend/src/libocl/src/ocl_vload_20.cl b/backend/src/libocl/src/ocl_vload_20.cl
new file mode 100644
index 00000000..a629f132
--- /dev/null
+++ b/backend/src/libocl/src/ocl_vload_20.cl
@@ -0,0 +1,284 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#include "ocl_vload_20.h" /* prototypes for the vload/vstore overloads defined in this file */
+#include "ocl_relational.h"
+
+// vloadN/vstoreN for the DECL_UNTYPED_RW_ALL types: p + N * offset is cast to a
+// vector pointer and accessed whole (the 3-element forms go element by element).
+// The cast is not C99 compliant (aliasing); tolerated because TBAA is not activated.
+#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+ return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+} \
+OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
+ *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
+}
+
+#define DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+ return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+}
+
+#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+ *(p + 3 * offset) = v.s0; \
+ *(p + 3 * offset + 1) = v.s1; \
+ *(p + 3 * offset + 2) = v.s2; \
+} \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\
+}
+
+#define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\
+}
+
+#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
+ DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RD_ALL_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 2, SPACE) \
+ DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 4, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 8, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RW_ALL(TYPE) \
+ DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
+ DECL_UNTYPED_RW_ALL_SPACE(TYPE, __generic)
+
+#define DECL_BYTE_RD_SPACE(TYPE, SPACE) /* vload2/3 read element by element; vload4/8/16 join two half-width loads */ \
+OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##2)(*(p+2*offset), *(p+2*offset+1)); \
+} \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##3)(*(p+3*offset), *(p+3*offset+1), *(p+3*offset+2)); \
+} \
+OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##4)(vload2(2*offset, p), vload2(2*offset, p+2)); \
+} \
+OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##8)(vload4(2*offset, p), vload4(2*offset, p+4)); \
+} \
+OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##16)(vload8(2*offset, p), vload8(2*offset, p+8)); \
+}
+
+#define DECL_BYTE_WR_SPACE(TYPE, SPACE) /* vstore2/3 write element by element; vstore4/8/16 split into two half-width stores */ \
+OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p) {\
+ *(p + 2 * offset) = v.s0; \
+ *(p + 2 * offset + 1) = v.s1; \
+} \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+ *(p + 3 * offset) = v.s0; \
+ *(p + 3 * offset + 1) = v.s1; \
+ *(p + 3 * offset + 2) = v.s2; \
+} \
+OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p) { \
+ vstore2(v.lo, 2*offset, p); \
+ vstore2(v.hi, 2*offset, p+2); \
+} \
+OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p) { \
+ vstore4(v.lo, 2*offset, p); \
+ vstore4(v.hi, 2*offset, p+4); \
+} \
+OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \
+ vstore8(v.lo, 2*offset, p); \
+ vstore8(v.hi, 2*offset, p+8); \
+}
+
+#define DECL_BYTE_RW_ALL(TYPE) /* loads for __generic and __constant, stores for __generic only */ \
+ DECL_BYTE_RD_SPACE(TYPE, __generic) \
+ DECL_BYTE_RD_SPACE(TYPE, __constant) \
+ DECL_BYTE_WR_SPACE(TYPE, __generic)
+
+DECL_BYTE_RW_ALL(char) // char, half, uchar, short, ushort get the element-wise DECL_BYTE_* definitions
+DECL_BYTE_RW_ALL(half)
+DECL_BYTE_RW_ALL(uchar)
+DECL_BYTE_RW_ALL(short)
+DECL_BYTE_RW_ALL(ushort)
+DECL_UNTYPED_RW_ALL(int) // int, uint, long, ulong, float, double get the vector-pointer DECL_UNTYPED_* definitions
+DECL_UNTYPED_RW_ALL(uint)
+DECL_UNTYPED_RW_ALL(long)
+DECL_UNTYPED_RW_ALL(ulong)
+DECL_UNTYPED_RW_ALL(float)
+DECL_UNTYPED_RW_ALL(double)
+
+#undef DECL_UNTYPED_RW_ALL
+#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_UNTYPED_RD_ALL_SPACE
+#undef DECL_UNTYPED_RW_SPACE_N
+#undef DECL_UNTYPED_RD_SPACE_N
+#undef DECL_UNTYPED_V3_SPACE
+#undef DECL_UNTYPED_RDV3_SPACE
+#undef DECL_BYTE_RD_SPACE
+#undef DECL_BYTE_WR_SPACE
+#undef DECL_BYTE_RW_ALL
+
+PURE CONST float __gen_ocl_f16to32(short h);
+PURE CONST short __gen_ocl_f32to16(float f);
+
+/* float -> 16-bit half pattern for the _rtp stores (result never below f). */
+OVERLOADABLE short f32to16_rtp(float f) {
+  const short converted = __gen_ocl_f32to16(f);
+  const float roundTrip = __gen_ocl_f16to32(converted);
+  if (!(f > roundTrip))
+    return converted;
+  /* Converted value fell below f: move the pattern by +1 (signbit 0) or -1 (signbit 1). */
+  return converted - signbit(f) * 2 + 1;
+}
+
+/* float -> 16-bit half pattern for the _rtn stores (result never above f). */
+OVERLOADABLE short f32to16_rtn(float f) {
+  const short converted = __gen_ocl_f32to16(f);
+  const float roundTrip = __gen_ocl_f16to32(converted);
+  if (!(roundTrip > f))
+    return converted;
+  /* Converted value landed above f: move the pattern by -1 (signbit 0) or +1 (signbit 1). */
+  return converted + signbit(f) * 2 - 1;
+}
+
+/* float -> 16-bit half pattern for the _rtz stores (result never farther from zero than f). */
+OVERLOADABLE short f32to16_rtz(float f) {
+  const short converted = __gen_ocl_f32to16(f);
+  const float roundTrip = __gen_ocl_f16to32(converted);
+  const int negative = signbit(f);
+  /* Subtract 1 from the pattern when the converted value overshoots f away from zero. */
+  if (negative ? (roundTrip < f) : (roundTrip > f))
+    return converted - 1;
+  return converted;
+}
+
+#define DECL_HALF_LD_SPACE(SPACE) /* vload_halfN and vloada_halfN for one address space; vloada_half3 strides by 4 elements */ \
+OVERLOADABLE float vload_half(size_t offset, const SPACE half *p) { \
+ return __gen_ocl_f16to32(*(SPACE short *)(p + offset)); \
+} \
+OVERLOADABLE float vloada_half(size_t offset, const SPACE half *p) { \
+ return vload_half(offset, p); \
+} \
+OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p) { \
+ return (float2)(vload_half(offset*2, p), \
+ vload_half(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float2 vloada_half2(size_t offset, const SPACE half *p) { \
+ return (float2)(vloada_half(offset*2, p), \
+ vloada_half(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p) { \
+ return (float3)(vload_half(offset*3, p), \
+ vload_half(offset*3 + 1, p), \
+ vload_half(offset*3 + 2, p)); \
+} \
+OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p) { \
+ return (float3)(vload_half(offset*4, p), \
+ vload_half(offset*4 + 1, p), \
+ vload_half(offset*4 + 2, p)); \
+} \
+OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p) { \
+ return (float4)(vload_half2(offset*2, p), \
+ vload_half2(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float4 vloada_half4(size_t offset, const SPACE half *p) { \
+ return (float4)(vloada_half2(offset*2, p), \
+ vloada_half2(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p) { \
+ return (float8)(vload_half4(offset*2, p), \
+ vload_half4(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float8 vloada_half8(size_t offset, const SPACE half *p) { \
+ return (float8)(vloada_half4(offset*2, p), \
+ vloada_half4(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p) { \
+ return (float16)(vload_half8(offset*2, p), \
+ vload_half8(offset*2 + 1, p)); \
+}\
+OVERLOADABLE float16 vloada_half16(size_t offset, const SPACE half *p) { \
+ return (float16)(vloada_half8(offset*2, p), \
+ vloada_half8(offset*2 + 1, p)); \
+}
+
+#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) /* vstore_halfN##ROUND and vstorea_halfN##ROUND; FUNC turns one float into the stored 16-bit pattern */ \
+OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p) { \
+ *(SPACE short *)(p + offset) = FUNC(data); \
+} \
+OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p) { \
+ vstore_half##ROUND(data, offset, p); \
+} \
+OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+ vstore_half##ROUND(data.lo, offset*2, p); \
+ vstore_half##ROUND(data.hi, offset*2 + 1, p); \
+} \
+OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+ vstore_half2##ROUND(data, offset, p); \
+} \
+OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+ vstore_half##ROUND(data.s0, offset*3, p); \
+ vstore_half##ROUND(data.s1, offset*3 + 1, p); \
+ vstore_half##ROUND(data.s2, offset*3 + 2, p); \
+} \
+OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+ vstore_half##ROUND(data.s0, offset*4, p); \
+ vstore_half##ROUND(data.s1, offset*4 + 1, p); \
+ vstore_half##ROUND(data.s2, offset*4 + 2, p); \
+} \
+OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+ vstore_half2##ROUND(data.lo, offset*2, p); \
+ vstore_half2##ROUND(data.hi, offset*2 + 1, p); \
+} \
+OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+ vstore_half4##ROUND(data, offset, p); \
+} \
+OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+ vstore_half4##ROUND(data.lo, offset*2, p); \
+ vstore_half4##ROUND(data.hi, offset*2 + 1, p); \
+} \
+OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+ vstore_half8##ROUND(data, offset, p); \
+} \
+OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+ vstore_half8##ROUND(data.lo, offset*2, p); \
+ vstore_half8##ROUND(data.hi, offset*2 + 1, p); \
+} \
+OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+ vstore_half16##ROUND(data, offset, p); \
+}
+
+#define DECL_HALF_ST_SPACE(SPACE) /* default and _rte share __gen_ocl_f32to16; _rtz/_rtp/_rtn use the f32to16_* helpers */ \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, , __gen_ocl_f32to16) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, __gen_ocl_f32to16) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, f32to16_rtz) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, f32to16_rtp) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, f32to16_rtn)
+
+DECL_HALF_LD_SPACE(__constant)
+DECL_HALF_LD_SPACE(__generic)
+
+DECL_HALF_ST_SPACE(__generic)
+
+// The helper macros are not needed once the definitions above are stamped out.
+#undef DECL_HALF_LD_SPACE
+#undef DECL_HALF_ST_SPACE
+#undef DECL_HALF_ST_SPACE_ROUND