diff options
| author | Yang Rong <rong.r.yang@intel.com> | 2016-10-19 10:35:46 +0800 |
|---|---|---|
| committer | Yang Rong <rong.r.yang@intel.com> | 2016-11-08 20:38:21 +0800 |
| commit | a606edb2357b3e3d16cc8b99445f4c5c5f3f6d0b (patch) | |
| tree | d4e5de88a26a59092f792918d0656ca5c5d5fd57 | |
| parent | 74f2113c479a3a85525e07ad5d410f83218dbadb (diff) | |
libocl: change prototype of vload/vstore to match ocl2.0 spec.
The OpenCL 2.0 spec only declares vload for the generic and constant
address spaces, and vstore only for the generic address space. The
private/global/local versions are all removed.
Signed-off-by: Ruiling Song <ruiling.song@intel.com>
Reviewed-by: Yang Rong <rong.r.yang@intel.com>
| -rw-r--r-- | backend/src/libocl/CMakeLists.txt | 56 | ||||
| -rw-r--r-- | backend/src/libocl/include/ocl.h | 4 | ||||
| -rw-r--r-- | backend/src/libocl/include/ocl_vload_20.h | 150 | ||||
| -rw-r--r-- | backend/src/libocl/src/ocl_vload_20.cl | 284 |
4 files changed, 482 insertions, 12 deletions
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt index 1d1ec680..ab5305b3 100644 --- a/backend/src/libocl/CMakeLists.txt +++ b/backend/src/libocl/CMakeLists.txt @@ -2,6 +2,8 @@ PROJECT(LIBOCL) SET (OCL_OBJECT_DIR ${LIBOCL_BINARY_DIR}/${BEIGNET_INSTALL_DIR}) SET (OCL_HEADER_FILES ${OCL_OBJECT_DIR}/include/ocl_defines.h) SET (OCL_SOURCE_FILES "") +SET (OCL_SOURCE_FILES_12 "") +SET (OCL_SOURCE_FILES_20 "") ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/include/ocl_defines.h COMMAND mkdir -p ${OCL_OBJECT_DIR}/include/ @@ -30,11 +32,11 @@ MACRO(COPY_THE_HEADER _mod) ) ENDIF(orgin_name STREQUAL output_name) ENDMACRO(COPY_THE_HEADER) -MACRO(COPY_THE_SOURCE _mod) +MACRO(COPY_THE_SOURCE _source _mod) # Use the python script to generate the header files. STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_BINARY_DIR}/src/\\1.cl" output_name ${_mod}) STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/src/\\1.cl" orgin_name ${_mod}) - SET(OCL_SOURCE_FILES ${OCL_SOURCE_FILES} ${output_name}) + SET(${_source} ${${_source}} ${output_name}) IF(orgin_name STREQUAL output_name) ELSE(orgin_name STREQUAL output_name) ADD_CUSTOM_COMMAND(OUTPUT ${output_name} @@ -50,14 +52,26 @@ ENDMACRO(COPY_THE_SOURCE) SET (OCL_COPY_HEADERS ocl ocl_types ocl_float ocl_printf) FOREACH(M ${OCL_COPY_HEADERS}) COPY_THE_HEADER(${M}) -ENDFOREACH(M) +ENDFOREACH(M) SET (OCL_COPY_MODULES ocl_workitem ocl_atom ocl_async ocl_sync ocl_memcpy - ocl_memset ocl_misc ocl_vload ocl_geometric ocl_image ocl_work_group) + ocl_memset ocl_misc ocl_geometric ocl_image ocl_work_group) FOREACH(M ${OCL_COPY_MODULES}) COPY_THE_HEADER(${M}) - COPY_THE_SOURCE(${M}) -ENDFOREACH(M) + COPY_THE_SOURCE(OCL_SOURCE_FILES ${M}) +ENDFOREACH(M) + +SET (OCL_COPY_MODULES_12 ocl_vload) +FOREACH(M ${OCL_COPY_MODULES_12}) + COPY_THE_HEADER(${M}) + COPY_THE_SOURCE(OCL_SOURCE_FILES_12 ${M}) +ENDFOREACH(M) + +SET (OCL_COPY_MODULES_20 ocl_vload_20) +FOREACH(M ${OCL_COPY_MODULES_20}) + COPY_THE_HEADER(${M}) + 
COPY_THE_SOURCE(OCL_SOURCE_FILES_20 ${M}) +ENDFOREACH(M) MACRO(GENERATE_HEADER_PY _mod) @@ -146,13 +160,21 @@ ENDMACRO(ADD_CL_TO_BC_TARGET) FOREACH(f ${OCL_SOURCE_FILES}) ADD_CL_TO_BC_TARGET(${f}) -ENDFOREACH(f) - -FOREACH(f ${OCL_SOURCE_FILES}) STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f}) SET(OCL_BC_FILES ${OCL_BC_FILES} ${bc_name}) -ENDFOREACH(f) +ENDFOREACH(f) +FOREACH(f ${OCL_SOURCE_FILES_12}) + ADD_CL_TO_BC_TARGET(${f}) + STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f}) + SET(OCL_BC_FILES_12 ${OCL_BC_FILES_12} ${bc_name}) +ENDFOREACH(f) + +FOREACH(f ${OCL_SOURCE_FILES_20}) + ADD_CL_TO_BC_TARGET(${f}) + STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f}) + SET(OCL_BC_FILES_20 ${OCL_BC_FILES_20} ${bc_name}) +ENDFOREACH(f) # handle the ll files MACRO(COPY_THE_LL _mod) @@ -194,11 +216,21 @@ ENDFOREACH(f) ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.bc COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/lib/ #COMMAND echo llvm-link -o ${LIBOCL_BINARY_DIR}/lib/beignet.bc ${OCL_BC_FILES} - COMMAND ${LLVM_LINK_EXECUTABLE} -o ${OCL_OBJECT_DIR}/beignet.bc ${OCL_BC_FILES} - DEPENDS ${OCL_BC_FILES} + COMMAND ${LLVM_LINK_EXECUTABLE} -o ${OCL_OBJECT_DIR}/beignet.bc ${OCL_BC_FILES} ${OCL_BC_FILES_12} + DEPENDS ${OCL_BC_FILES} ${OCL_BC_FILES_12} COMMENT "Generate the bitcode file: ${OCL_OBJECT_DIR}/beignet.bc" ) +if (ENABLE_OPENCL_20) +ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet_20.bc + COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/lib/ + #COMMAND echo llvm-link -o ${LIBOCL_BINARY_DIR}/lib/beignet.bc ${OCL_BC_FILES} + COMMAND ${LLVM_LINK_EXECUTABLE} -o ${OCL_OBJECT_DIR}/beignet_20.bc ${OCL_BC_FILES} ${OCL_BC_FILES_20} + DEPENDS ${OCL_BC_FILES} ${OCL_BC_FILES_20} + COMMENT "Generate the bitcode file: ${OCL_OBJECT_DIR}/beignet_20.bc" + ) +endif (ENABLE_OPENCL_20) + ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.local.pch 
COMMAND mkdir -p ${OCL_OBJECT_DIR} COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ -emit-pch -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.local.pch diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h index 6230b932..060e1e4e 100644 --- a/backend/src/libocl/include/ocl.h +++ b/backend/src/libocl/include/ocl.h @@ -97,7 +97,11 @@ #include "ocl_printf.h" #include "ocl_relational.h" #include "ocl_sync.h" +#if (__OPENCL_C_VERSION__ >= 200) +#include "ocl_vload_20.h" +#else #include "ocl_vload.h" +#endif #include "ocl_workitem.h" #include "ocl_simd.h" #include "ocl_work_group.h" diff --git a/backend/src/libocl/include/ocl_vload_20.h b/backend/src/libocl/include/ocl_vload_20.h new file mode 100644 index 00000000..636f94d9 --- /dev/null +++ b/backend/src/libocl/include/ocl_vload_20.h @@ -0,0 +1,150 @@ +/* + * Copyright © 2012 - 2014 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see <http://www.gnu.org/licenses/>. 
+ * + */ +#ifndef __OCL_VLOAD_H__ +#define __OCL_VLOAD_H__ + +#include "ocl_types.h" + +///////////////////////////////////////////////////////////////////////////// +// Vector loads and stores +///////////////////////////////////////////////////////////////////////////// + +// These loads and stores will use untyped reads and writes, so we can just +// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue. +// Well we do not care, we do not activate TBAA in the compiler +#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \ +OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p); \ +OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p); + +#define DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \ +OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p); + +#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \ +OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p); \ +OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p); + +#define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \ +OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p); + +#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \ + DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE) + +#define DECL_UNTYPED_RD_ALL_SPACE(TYPE, SPACE) \ + DECL_UNTYPED_RD_SPACE_N(TYPE, 2, SPACE) \ + DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \ + DECL_UNTYPED_RD_SPACE_N(TYPE, 4, SPACE) \ + DECL_UNTYPED_RD_SPACE_N(TYPE, 8, SPACE) \ + DECL_UNTYPED_RD_SPACE_N(TYPE, 16, SPACE) + +#define DECL_UNTYPED_RW_ALL(TYPE) \ + DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \ + DECL_UNTYPED_RW_ALL_SPACE(TYPE, __generic) + +#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \ +OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p); \ +OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p); \ +OVERLOADABLE TYPE##4 
vload4(size_t offset, const SPACE TYPE *p); \ +OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p); \ +OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p); + +#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \ +OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p); \ +OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p); \ +OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p); \ +OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p); \ +OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p); + +#define DECL_BYTE_RW_ALL(TYPE) \ + DECL_BYTE_RD_SPACE(TYPE, __generic) \ + DECL_BYTE_WR_SPACE(TYPE, __generic) \ + DECL_BYTE_RD_SPACE(TYPE, __constant) + +DECL_BYTE_RW_ALL(char) +DECL_BYTE_RW_ALL(uchar) +DECL_BYTE_RW_ALL(short) +DECL_BYTE_RW_ALL(ushort) +DECL_BYTE_RW_ALL(half) +DECL_UNTYPED_RW_ALL(int) +DECL_UNTYPED_RW_ALL(uint) +DECL_UNTYPED_RW_ALL(long) +DECL_UNTYPED_RW_ALL(ulong) +DECL_UNTYPED_RW_ALL(float) +DECL_UNTYPED_RW_ALL(double) + +#undef DECL_UNTYPED_RW_ALL +#undef DECL_UNTYPED_RW_ALL_SPACE +#undef DECL_UNTYPED_RD_ALL_SPACE +#undef DECL_UNTYPED_RW_SPACE_N +#undef DECL_UNTYPED_RD_SPACE_N +#undef DECL_UNTYPED_V3_SPACE +#undef DECL_UNTYPED_RDV3_SPACE +#undef DECL_BYTE_RD_SPACE +#undef DECL_BYTE_WR_SPACE +#undef DECL_BYTE_RW_ALL + + +#define DECL_HALF_LD_SPACE(SPACE) \ +OVERLOADABLE float vload_half(size_t offset, const SPACE half *p); \ +OVERLOADABLE float vloada_half(size_t offset, const SPACE half *p); \ +OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p); \ +OVERLOADABLE float2 vloada_half2(size_t offset, const SPACE half *p); \ +OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p); \ +OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p); \ +OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p); \ +OVERLOADABLE float4 vloada_half4(size_t offset, const SPACE half *p); \ +OVERLOADABLE float8 vload_half8(size_t offset, 
const SPACE half *p); \ +OVERLOADABLE float8 vloada_half8(size_t offset, const SPACE half *p); \ +OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p); \ +OVERLOADABLE float16 vloada_half16(size_t offset, const SPACE half *p); \ + +#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \ +OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p); \ +OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p); \ +OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p); \ +OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p); \ +OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p); \ +OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p); \ +OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p); \ +OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p); \ +OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p); \ +OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p); \ +OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p); \ +OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p); + +#define DECL_HALF_ST_SPACE(SPACE) \ + DECL_HALF_ST_SPACE_ROUND(SPACE, , dummy) \ + DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, dummy) \ + DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, dummy) \ + DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, dummy) \ + DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, dummy) \ + +DECL_HALF_LD_SPACE(__constant) +DECL_HALF_LD_SPACE(__generic) + +DECL_HALF_ST_SPACE(__generic) + +//#undef DECL_UNTYPED_RW_ALL_SPACE +#undef DECL_HALF_LD_SPACE +#undef DECL_HALF_ST_SPACE +#undef DECL_HALF_ST_SPACE_ROUND + +#endif /* __OCL_VLOAD_H__ */ diff --git a/backend/src/libocl/src/ocl_vload_20.cl b/backend/src/libocl/src/ocl_vload_20.cl new file mode 100644 index 
00000000..a629f132 --- /dev/null +++ b/backend/src/libocl/src/ocl_vload_20.cl @@ -0,0 +1,284 @@ +/* + * Copyright © 2012 - 2014 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see <http://www.gnu.org/licenses/>. + * + */ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#include "ocl_vload20.h" +#include "ocl_relational.h" + +// These loads and stores will use untyped reads and writes, so we can just +// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue. 
+// Well we do not care, we do not activate TBAA in the compiler +#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \ +OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \ + return *(SPACE TYPE##DIM *) (p + DIM * offset); \ +} \ +OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \ + *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \ +} + +#define DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \ +OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \ + return *(SPACE TYPE##DIM *) (p + DIM * offset); \ +} + +#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \ +OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\ + *(p + 3 * offset) = v.s0; \ + *(p + 3 * offset + 1) = v.s1; \ + *(p + 3 * offset + 2) = v.s2; \ +} \ +OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \ + return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\ +} + +#define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \ +OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \ + return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\ +} + +#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \ + DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \ + DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE) + +#define DECL_UNTYPED_RD_ALL_SPACE(TYPE, SPACE) \ + DECL_UNTYPED_RD_SPACE_N(TYPE, 2, SPACE) \ + DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \ + DECL_UNTYPED_RD_SPACE_N(TYPE, 4, SPACE) \ + DECL_UNTYPED_RD_SPACE_N(TYPE, 8, SPACE) \ + DECL_UNTYPED_RD_SPACE_N(TYPE, 16, SPACE) + +#define DECL_UNTYPED_RW_ALL(TYPE) \ + DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \ + DECL_UNTYPED_RW_ALL_SPACE(TYPE, __generic) + +#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \ +OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p) { \ + return (TYPE##2)(*(p+2*offset), *(p+2*offset+1)); \ +} \ 
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \ + return (TYPE##3)(*(p+3*offset), *(p+3*offset+1), *(p+3*offset+2)); \ +} \ +OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p) { \ + return (TYPE##4)(vload2(2*offset, p), vload2(2*offset, p+2)); \ +} \ +OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p) { \ + return (TYPE##8)(vload4(2*offset, p), vload4(2*offset, p+4)); \ +} \ +OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p) { \ + return (TYPE##16)(vload8(2*offset, p), vload8(2*offset, p+8)); \ +} + +#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \ +OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p) {\ + *(p + 2 * offset) = v.s0; \ + *(p + 2 * offset + 1) = v.s1; \ +} \ +OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\ + *(p + 3 * offset) = v.s0; \ + *(p + 3 * offset + 1) = v.s1; \ + *(p + 3 * offset + 2) = v.s2; \ +} \ +OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p) { \ + vstore2(v.lo, 2*offset, p); \ + vstore2(v.hi, 2*offset, p+2); \ +} \ +OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p) { \ + vstore4(v.lo, 2*offset, p); \ + vstore4(v.hi, 2*offset, p+4); \ +} \ +OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \ + vstore8(v.lo, 2*offset, p); \ + vstore8(v.hi, 2*offset, p+8); \ +} + +#define DECL_BYTE_RW_ALL(TYPE) \ + DECL_BYTE_RD_SPACE(TYPE, __generic) \ + DECL_BYTE_RD_SPACE(TYPE, __constant) \ + DECL_BYTE_WR_SPACE(TYPE, __generic) + +DECL_BYTE_RW_ALL(char) +DECL_BYTE_RW_ALL(half) +DECL_BYTE_RW_ALL(uchar) +DECL_BYTE_RW_ALL(short) +DECL_BYTE_RW_ALL(ushort) +DECL_UNTYPED_RW_ALL(int) +DECL_UNTYPED_RW_ALL(uint) +DECL_UNTYPED_RW_ALL(long) +DECL_UNTYPED_RW_ALL(ulong) +DECL_UNTYPED_RW_ALL(float) +DECL_UNTYPED_RW_ALL(double) + +#undef DECL_UNTYPED_RW_ALL +#undef DECL_UNTYPED_RW_ALL_SPACE +#undef DECL_UNTYPED_RD_ALL_SPACE +#undef DECL_UNTYPED_RW_SPACE_N +#undef DECL_UNTYPED_RD_SPACE_N +#undef DECL_UNTYPED_V3_SPACE +#undef 
DECL_UNTYPED_RDV3_SPACE +#undef DECL_BYTE_RD_SPACE +#undef DECL_BYTE_WR_SPACE +#undef DECL_BYTE_RW_ALL + +PURE CONST float __gen_ocl_f16to32(short h); +PURE CONST short __gen_ocl_f32to16(float f); + +OVERLOADABLE short f32to16_rtp(float f) { + short s = __gen_ocl_f32to16(f); + float con = __gen_ocl_f16to32(s); + //if(isinf(con)) return s; + if (f > con) + return s - signbit(f) * 2 + 1; + else + return s; +} + +OVERLOADABLE short f32to16_rtn(float f) { + short s = __gen_ocl_f32to16(f); + float con = __gen_ocl_f16to32(s); + //if(isinf(con)) return s; + if (con > f) + return s + signbit(f) * 2 - 1; + else + return s; +} + +OVERLOADABLE short f32to16_rtz(float f) { + short s = __gen_ocl_f32to16(f); + float con = __gen_ocl_f16to32(s); + //if(isinf(con)) return s; + if (((con > f) && !signbit(f)) || + ((con < f) && signbit(f))) + return s - 1; + else + return s; +} + +#define DECL_HALF_LD_SPACE(SPACE) \ +OVERLOADABLE float vload_half(size_t offset, const SPACE half *p) { \ + return __gen_ocl_f16to32(*(SPACE short *)(p + offset)); \ +} \ +OVERLOADABLE float vloada_half(size_t offset, const SPACE half *p) { \ + return vload_half(offset, p); \ +} \ +OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p) { \ + return (float2)(vload_half(offset*2, p), \ + vload_half(offset*2 + 1, p)); \ +} \ +OVERLOADABLE float2 vloada_half2(size_t offset, const SPACE half *p) { \ + return (float2)(vloada_half(offset*2, p), \ + vloada_half(offset*2 + 1, p)); \ +} \ +OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p) { \ + return (float3)(vload_half(offset*3, p), \ + vload_half(offset*3 + 1, p), \ + vload_half(offset*3 + 2, p)); \ +} \ +OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p) { \ + return (float3)(vload_half(offset*4, p), \ + vload_half(offset*4 + 1, p), \ + vload_half(offset*4 + 2, p)); \ +} \ +OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p) { \ + return (float4)(vload_half2(offset*2, p), \ + vload_half2(offset*2 + 
1, p)); \ +} \ +OVERLOADABLE float4 vloada_half4(size_t offset, const SPACE half *p) { \ + return (float4)(vloada_half2(offset*2, p), \ + vloada_half2(offset*2 + 1, p)); \ +} \ +OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p) { \ + return (float8)(vload_half4(offset*2, p), \ + vload_half4(offset*2 + 1, p)); \ +} \ +OVERLOADABLE float8 vloada_half8(size_t offset, const SPACE half *p) { \ + return (float8)(vloada_half4(offset*2, p), \ + vloada_half4(offset*2 + 1, p)); \ +} \ +OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p) { \ + return (float16)(vload_half8(offset*2, p), \ + vload_half8(offset*2 + 1, p)); \ +}\ +OVERLOADABLE float16 vloada_half16(size_t offset, const SPACE half *p) { \ + return (float16)(vloada_half8(offset*2, p), \ + vloada_half8(offset*2 + 1, p)); \ +}\ + +#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \ +OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p) { \ + *(SPACE short *)(p + offset) = FUNC(data); \ +} \ +OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p) { \ + vstore_half##ROUND(data, offset, p); \ +} \ +OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \ + vstore_half##ROUND(data.lo, offset*2, p); \ + vstore_half##ROUND(data.hi, offset*2 + 1, p); \ +} \ +OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \ + vstore_half2##ROUND(data, offset, p); \ +} \ +OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \ + vstore_half##ROUND(data.s0, offset*3, p); \ + vstore_half##ROUND(data.s1, offset*3 + 1, p); \ + vstore_half##ROUND(data.s2, offset*3 + 2, p); \ +} \ +OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \ + vstore_half##ROUND(data.s0, offset*4, p); \ + vstore_half##ROUND(data.s1, offset*4 + 1, p); \ + vstore_half##ROUND(data.s2, offset*4 + 2, p); \ +} \ +OVERLOADABLE void vstore_half4##ROUND(float4 data, 
size_t offset, SPACE half *p) { \ + vstore_half2##ROUND(data.lo, offset*2, p); \ + vstore_half2##ROUND(data.hi, offset*2 + 1, p); \ +} \ +OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \ + vstore_half4##ROUND(data, offset, p); \ +} \ +OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \ + vstore_half4##ROUND(data.lo, offset*2, p); \ + vstore_half4##ROUND(data.hi, offset*2 + 1, p); \ +} \ +OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \ + vstore_half8##ROUND(data, offset, p); \ +} \ +OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \ + vstore_half8##ROUND(data.lo, offset*2, p); \ + vstore_half8##ROUND(data.hi, offset*2 + 1, p); \ +} \ +OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \ + vstore_half16##ROUND(data, offset, p); \ +} + +#define DECL_HALF_ST_SPACE(SPACE) \ + DECL_HALF_ST_SPACE_ROUND(SPACE, , __gen_ocl_f32to16) \ + DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, __gen_ocl_f32to16) \ + DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, f32to16_rtz) \ + DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, f32to16_rtp) \ + DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, f32to16_rtn) \ + +DECL_HALF_LD_SPACE(__constant) +DECL_HALF_LD_SPACE(__generic) + +DECL_HALF_ST_SPACE(__generic) + +//#undef DECL_UNTYPED_RW_ALL_SPACE +#undef DECL_HALF_LD_SPACE +#undef DECL_HALF_ST_SPACE +#undef DECL_HALF_ST_SPACE_ROUND |
