diff options
27 files changed, 406 insertions, 312 deletions
diff --git a/LICENSE.TXT b/LICENSE.TXT index e4df97b..30aab2f 100644 --- a/LICENSE.TXT +++ b/LICENSE.TXT @@ -11,7 +11,7 @@ Full text of the relevant licenses is included below. ============================================================================== -Copyright (c) 2011-2012 by the contributors listed in CREDITS.TXT +Copyright (c) 2011-2013 by the contributors listed in CREDITS.TXT All rights reserved. @@ -43,7 +43,7 @@ SOFTWARE. ============================================================================== -Copyright (c) 2011-2012 by the contributors listed in CREDITS.TXT +Copyright (c) 2011-2013 by the contributors listed in CREDITS.TXT Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/configure.py b/configure.py index 347f3b3..79cc4df 100755 --- a/configure.py +++ b/configure.py @@ -61,7 +61,8 @@ def llvm_config(args): sys.exit(1) llvm_bindir = llvm_config(['--bindir']) -llvm_core_libs = llvm_config(['--ldflags', '--libs', 'core', 'bitreader', 'bitwriter']) +llvm_core_libs = llvm_config(['--libs', 'core', 'bitreader', 'bitwriter']) + ' ' + \ + llvm_config(['--ldflags']) llvm_cxxflags = llvm_config(['--cxxflags']) + ' -fno-exceptions -fno-rtti' llvm_clang = os.path.join(llvm_bindir, 'clang') @@ -74,10 +75,12 @@ available_targets = { {'gpu' : 'cypress', 'aliases' : ['hemlock']}, {'gpu' : 'barts', 'aliases' : ['turks', 'caicos']}, {'gpu' : 'cayman', 'aliases' : ['aruba']}, - {'gpu' : 'tahiti', 'aliases' : ['pitcairn', 'verde', 'oland']}]} + {'gpu' : 'tahiti', 'aliases' : ['pitcairn', 'verde', 'oland']}]}, + 'nvptx--nvidiacl' : { 'devices' : [{'gpu' : '', 'aliases' : []}] }, + 'nvptx64--nvidiacl' : { 'devices' : [{'gpu' : '', 'aliases' : []}] } } -default_targets = ['r600--'] +default_targets = ['nvptx--nvidiacl', 'nvptx64--nvidiacl', 'r600--'] targets = args if not targets: @@ -170,12 +173,8 @@ for target in targets: for src in 
open(subdir_list_file).readlines(): src = src.rstrip() - # Only add the base filename (e.g. Add get_global_id instead of - # get_global_id.cl) to sources_seen. - # This allows targets to overide generic .cl sources with .ll sources. - src_base = os.path.splitext(src)[0] - if src_base not in sources_seen: - sources_seen.add(src_base) + if src not in sources_seen: + sources_seen.add(src) obj = os.path.join(target, 'lib', src + obj_suffix + '.bc') objects.append(obj) src_file = os.path.join(libdir, src) diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h index 10d30e0..9a2f443 100644 --- a/generic/include/clc/clc.h +++ b/generic/include/clc/clc.h @@ -64,8 +64,11 @@ #include <clc/integer/abs_diff.h> #include <clc/integer/add_sat.h> #include <clc/integer/clz.h> +#include <clc/integer/mad24.h> +#include <clc/integer/mul24.h> #include <clc/integer/rotate.h> #include <clc/integer/sub_sat.h> +#include <clc/integer/upsample.h> /* 6.11.2 and 6.11.3 Shared Integer/Math Functions */ #include <clc/shared/clamp.h> @@ -82,6 +85,7 @@ /* 6.11.6 Relational Functions */ #include <clc/relational/any.h> +#include <clc/relational/bitselect.h> #include <clc/relational/select.h> /* 6.11.8 Synchronization Functions */ diff --git a/generic/include/clc/integer/integer-gentype.inc b/generic/include/clc/integer/integer-gentype.inc new file mode 100644 index 0000000..6470eb3 --- /dev/null +++ b/generic/include/clc/integer/integer-gentype.inc @@ -0,0 +1,39 @@ +#define __CLC_GENTYPE int +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE int2 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE int4 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE int8 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE int16 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE uint +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE uint2 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define 
__CLC_GENTYPE uint4 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE uint8 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE uint16 +#include __CLC_BODY +#undef __CLC_GENTYPE diff --git a/generic/include/clc/integer/mad24.h b/generic/include/clc/integer/mad24.h new file mode 100644 index 0000000..0c120fa --- /dev/null +++ b/generic/include/clc/integer/mad24.h @@ -0,0 +1,3 @@ +#define __CLC_BODY <clc/integer/mad24.inc> +#include <clc/integer/integer-gentype.inc> +#undef __CLC_BODY diff --git a/generic/include/clc/integer/mad24.inc b/generic/include/clc/integer/mad24.inc new file mode 100644 index 0000000..81fe0c2 --- /dev/null +++ b/generic/include/clc/integer/mad24.inc @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mad24(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z); diff --git a/generic/include/clc/integer/mul24.h b/generic/include/clc/integer/mul24.h new file mode 100644 index 0000000..4f97098 --- /dev/null +++ b/generic/include/clc/integer/mul24.h @@ -0,0 +1,3 @@ +#define __CLC_BODY <clc/integer/mul24.inc> +#include <clc/integer/integer-gentype.inc> +#undef __CLC_BODY diff --git a/generic/include/clc/integer/mul24.inc b/generic/include/clc/integer/mul24.inc new file mode 100644 index 0000000..8cbf7c1 --- /dev/null +++ b/generic/include/clc/integer/mul24.inc @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mul24(__CLC_GENTYPE x, __CLC_GENTYPE y); diff --git a/generic/include/clc/integer/upsample.h b/generic/include/clc/integer/upsample.h new file mode 100644 index 0000000..127debf --- /dev/null +++ b/generic/include/clc/integer/upsample.h @@ -0,0 +1,25 @@ +#define __CLC_UPSAMPLE_DECL(BGENTYPE, GENTYPE, UGENTYPE) \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE upsample(GENTYPE hi, UGENTYPE lo); + +#define __CLC_UPSAMPLE_VEC(BGENTYPE, GENTYPE, UGENTYPE) \ + __CLC_UPSAMPLE_DECL(BGENTYPE, GENTYPE, UGENTYPE); \ + __CLC_UPSAMPLE_DECL(BGENTYPE##2, GENTYPE##2, UGENTYPE##2); \ + __CLC_UPSAMPLE_DECL(BGENTYPE##3, GENTYPE##3, 
UGENTYPE##3); \ + __CLC_UPSAMPLE_DECL(BGENTYPE##4, GENTYPE##4, UGENTYPE##4); \ + __CLC_UPSAMPLE_DECL(BGENTYPE##8, GENTYPE##8, UGENTYPE##8); \ + __CLC_UPSAMPLE_DECL(BGENTYPE##16, GENTYPE##16, UGENTYPE##16); \ + +#define __CLC_UPSAMPLE_TYPES() \ + __CLC_UPSAMPLE_VEC(short, char, uchar) \ + __CLC_UPSAMPLE_VEC(ushort, uchar, uchar) \ + __CLC_UPSAMPLE_VEC(int, short, ushort) \ + __CLC_UPSAMPLE_VEC(uint, ushort, ushort) \ + __CLC_UPSAMPLE_VEC(long, int, uint) \ + __CLC_UPSAMPLE_VEC(ulong, uint, uint) \ + +__CLC_UPSAMPLE_TYPES() + +#undef __CLC_UPSAMPLE_TYPES +#undef __CLC_UPSAMPLE_DECL +#undef __CLC_UPSAMPLE_VEC + diff --git a/generic/include/clc/relational/bitselect.h b/generic/include/clc/relational/bitselect.h new file mode 100644 index 0000000..e91cbfd --- /dev/null +++ b/generic/include/clc/relational/bitselect.h @@ -0,0 +1 @@ +#define bitselect(x, y, z) ((x) ^ ((z) & ((y) ^ (x)))) diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES index 8cda14a..9ac08bd 100644 --- a/generic/lib/SOURCES +++ b/generic/lib/SOURCES @@ -11,10 +11,13 @@ integer/add_sat_impl.ll integer/clz.cl integer/clz_if.ll integer/clz_impl.ll +integer/mad24.cl +integer/mul24.cl integer/rotate.cl integer/sub_sat.cl integer/sub_sat_if.ll integer/sub_sat_impl.ll +integer/upsample.cl math/fmax.cl math/fmin.cl math/hypot.cl @@ -24,10 +27,8 @@ shared/clamp.cl shared/max.cl shared/min.cl shared/vload.cl -shared/vload_if.ll shared/vload_impl.ll shared/vstore.cl -shared/vstore_if.ll shared/vstore_impl.ll workitem/get_global_id.cl workitem/get_global_size.cl diff --git a/generic/lib/integer/mad24.cl b/generic/lib/integer/mad24.cl new file mode 100644 index 0000000..e29e99f --- /dev/null +++ b/generic/lib/integer/mad24.cl @@ -0,0 +1,4 @@ +#include <clc/clc.h> + +#define __CLC_BODY <mad24.inc> +#include <clc/integer/integer-gentype.inc> diff --git a/generic/lib/integer/mad24.inc b/generic/lib/integer/mad24.inc new file mode 100644 index 0000000..902b0aa --- /dev/null +++ b/generic/lib/integer/mad24.inc @@ 
-0,0 +1,3 @@ +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mad24(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z){ + return mul24(x, y) + z; +} diff --git a/generic/lib/integer/mul24.cl b/generic/lib/integer/mul24.cl new file mode 100644 index 0000000..8aedca6 --- /dev/null +++ b/generic/lib/integer/mul24.cl @@ -0,0 +1,4 @@ +#include <clc/clc.h> + +#define __CLC_BODY <mul24.inc> +#include <clc/integer/integer-gentype.inc> diff --git a/generic/lib/integer/mul24.inc b/generic/lib/integer/mul24.inc new file mode 100644 index 0000000..95a2f1d --- /dev/null +++ b/generic/lib/integer/mul24.inc @@ -0,0 +1,11 @@ + +// We need to use shifts here in order to mantain the sign bit for signed +// integers. The compiler should optimize this to (x & 0x00FFFFFF) for +// unsigned integers. +#define CONVERT_TO_24BIT(x) (((x) << 8) >> 8) + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mul24(__CLC_GENTYPE x, __CLC_GENTYPE y){ + return CONVERT_TO_24BIT(x) * CONVERT_TO_24BIT(y); +} + +#undef CONVERT_TO_24BIT diff --git a/generic/lib/integer/upsample.cl b/generic/lib/integer/upsample.cl new file mode 100644 index 0000000..7301cc3 --- /dev/null +++ b/generic/lib/integer/upsample.cl @@ -0,0 +1,34 @@ +#include <clc/clc.h> + +#define __CLC_UPSAMPLE_IMPL(BGENTYPE, GENTYPE, UGENTYPE, GENSIZE) \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE upsample(GENTYPE hi, UGENTYPE lo){ \ + return ((BGENTYPE)hi << GENSIZE) | lo; \ + } \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE##2 upsample(GENTYPE##2 hi, UGENTYPE##2 lo){ \ + return (BGENTYPE##2){upsample(hi.s0, lo.s0), upsample(hi.s1, lo.s1)}; \ + } \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE##3 upsample(GENTYPE##3 hi, UGENTYPE##3 lo){ \ + return (BGENTYPE##3){upsample(hi.s0, lo.s0), upsample(hi.s1, lo.s1), upsample(hi.s2, lo.s2)}; \ + } \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE##4 upsample(GENTYPE##4 hi, UGENTYPE##4 lo){ \ + return (BGENTYPE##4){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \ + } \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE##8 upsample(GENTYPE##8 hi, UGENTYPE##8 lo){ \ + return 
(BGENTYPE##8){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \ + } \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE##16 upsample(GENTYPE##16 hi, UGENTYPE##16 lo){ \ + return (BGENTYPE##16){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \ + } \ + +#define __CLC_UPSAMPLE_TYPES() \ + __CLC_UPSAMPLE_IMPL(short, char, uchar, 8) \ + __CLC_UPSAMPLE_IMPL(ushort, uchar, uchar, 8) \ + __CLC_UPSAMPLE_IMPL(int, short, ushort, 16) \ + __CLC_UPSAMPLE_IMPL(uint, ushort, ushort, 16) \ + __CLC_UPSAMPLE_IMPL(long, int, uint, 32) \ + __CLC_UPSAMPLE_IMPL(ulong, uint, uint, 32) \ + +__CLC_UPSAMPLE_TYPES() + +#undef __CLC_UPSAMPLE_TYPES +#undef __CLC_UPSAMPLE_IMPL diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl index 4dd7918..6793072 100644 --- a/generic/lib/shared/vload.cl +++ b/generic/lib/shared/vload.cl @@ -2,23 +2,23 @@ #define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return (PRIM_TYPE##2)(x[offset] , x[offset+1]); \ + return (PRIM_TYPE##2)(x[2*offset] , x[2*offset+1]); \ } \ \ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return (PRIM_TYPE##3)(x[offset] , x[offset+1], x[offset+2]); \ + return (PRIM_TYPE##3)(x[3*offset] , x[3*offset+1], x[3*offset+2]); \ } \ \ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return (PRIM_TYPE##4)(x[offset], x[offset+1], x[offset+2], x[offset+3]); \ + return (PRIM_TYPE##4)(x[4*offset], x[4*offset+1], x[4*offset+2], x[4*offset+3]); \ } \ \ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return (PRIM_TYPE##8)(vload4(offset, x), vload4(offset+4, x)); \ + return (PRIM_TYPE##8)(vload4(0, &x[8*offset]), vload4(1, &x[8*offset])); \ } \ \ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return (PRIM_TYPE##16)(vload8(offset, x), vload8(offset+8, x)); \ + return 
(PRIM_TYPE##16)(vload8(0, &x[16*offset]), vload8(1, &x[16*offset])); \ } \ #define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \ @@ -27,12 +27,13 @@ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \ -//int/uint are special... see below #define VLOAD_TYPES() \ VLOAD_ADDR_SPACES(char) \ VLOAD_ADDR_SPACES(uchar) \ VLOAD_ADDR_SPACES(short) \ VLOAD_ADDR_SPACES(ushort) \ + VLOAD_ADDR_SPACES(int) \ + VLOAD_ADDR_SPACES(uint) \ VLOAD_ADDR_SPACES(long) \ VLOAD_ADDR_SPACES(ulong) \ VLOAD_ADDR_SPACES(float) \ @@ -43,54 +44,3 @@ VLOAD_TYPES() #pragma OPENCL EXTENSION cl_khr_fp64 : enable VLOAD_ADDR_SPACES(double) #endif - -VLOAD_VECTORIZE(int, __private) -VLOAD_VECTORIZE(int, __local) -VLOAD_VECTORIZE(int, __constant) -VLOAD_VECTORIZE(uint, __private) -VLOAD_VECTORIZE(uint, __local) -VLOAD_VECTORIZE(uint, __constant) - -_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const global int *x) { - return (int2)(x[offset] , x[offset+1]); -} -_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) { - return (int3)(vload2(offset, x), x[offset+2]); -} -_CLC_OVERLOAD _CLC_DEF uint2 vload2(size_t offset, const global uint *x) { - return (uint2)(x[offset] , x[offset+1]); -} -_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) { - return (uint3)(vload2(offset, x), x[offset+2]); -} - -/*Note: It is known that R600 doesn't support load <2 x ?> and <3 x ?>... 
so - * they aren't actually overridden here - */ -_CLC_DECL int4 __clc_vload4_int__global(size_t offset, const __global int *); -_CLC_DECL int8 __clc_vload8_int__global(size_t offset, const __global int *); -_CLC_DECL int16 __clc_vload16_int__global(size_t offset, const __global int *); - -_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) { - return __clc_vload4_int__global(offset, x); -} -_CLC_OVERLOAD _CLC_DEF int8 vload8(size_t offset, const global int *x) { - return __clc_vload8_int__global(offset, x); -} -_CLC_OVERLOAD _CLC_DEF int16 vload16(size_t offset, const global int *x) { - return __clc_vload16_int__global(offset, x); -} - -_CLC_DECL uint4 __clc_vload4_uint__global(size_t offset, const __global uint *); -_CLC_DECL uint8 __clc_vload8_uint__global(size_t offset, const __global uint *); -_CLC_DECL uint16 __clc_vload16_uint__global(size_t offset, const __global uint *); - -_CLC_OVERLOAD _CLC_DEF uint4 vload4(size_t offset, const global uint *x) { - return __clc_vload4_uint__global(offset, x); -} -_CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) { - return __clc_vload8_uint__global(offset, x); -} -_CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) { - return __clc_vload16_uint__global(offset, x); -} diff --git a/generic/lib/shared/vload_if.ll b/generic/lib/shared/vload_if.ll deleted file mode 100644 index 2634d37..0000000 --- a/generic/lib/shared/vload_if.ll +++ /dev/null @@ -1,60 +0,0 @@ -;Start int global vload - -declare <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y) -declare <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y) -declare <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y) -declare <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y) -declare <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y) - -define <2 x i32> @__clc_vload2_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <2 x i32> 
@__clc_vload2_impl_i32__global(i32 %x, i32 %y) - ret <2 x i32> %call -} - -define <3 x i32> @__clc_vload3_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y) - ret <3 x i32> %call -} - -define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y) - ret <4 x i32> %call -} - -define <8 x i32> @__clc_vload8_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y) - ret <8 x i32> %call -} - -define <16 x i32> @__clc_vload16_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y) - ret <16 x i32> %call -} - - -;Start uint global vload - -define <2 x i32> @__clc_vload2_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y) - ret <2 x i32> %call -} - -define <3 x i32> @__clc_vload3_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y) - ret <3 x i32> %call -} - -define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y) - ret <4 x i32> %call -} - -define <8 x i32> @__clc_vload8_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y) - ret <8 x i32> %call -} - -define <16 x i32> @__clc_vload16_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y) - ret <16 x i32> %call -} diff --git a/generic/lib/shared/vload_impl.ll b/generic/lib/shared/vload_impl.ll index ae719e0..2e70e5f 100644 --- a/generic/lib/shared/vload_impl.ll +++ 
b/generic/lib/shared/vload_impl.ll @@ -1,43 +1,33 @@ ; This provides optimized implementations of vload4/8/16 for 32-bit int/uint -define <2 x i32> @__clc_vload2_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)* - %4 = load <2 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <2 x i32> %4 +define <2 x i32> @__clc_vload2_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <2 x i32> addrspace(1)* + %2 = load <2 x i32> addrspace(1)* %1, align 4, !tbaa !3 + ret <2 x i32> %2 } -define <3 x i32> @__clc_vload3_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)* - %4 = load <3 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <3 x i32> %4 +define <3 x i32> @__clc_vload3_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <3 x i32> addrspace(1)* + %2 = load <3 x i32> addrspace(1)* %1, align 4, !tbaa !3 + ret <3 x i32> %2 } -define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)* - %4 = load <4 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <4 x i32> %4 +define <4 x i32> @__clc_vload4_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <4 x i32> addrspace(1)* + %2 = load <4 x i32> addrspace(1)* %1, align 4, !tbaa !3 + ret <4 x i32> %2 } -define <8 x i32> @__clc_vload8_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind 
readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)* - %4 = load <8 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <8 x i32> %4 +define <8 x i32> @__clc_vload8_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <8 x i32> addrspace(1)* + %2 = load <8 x i32> addrspace(1)* %1, align 4, !tbaa !3 + ret <8 x i32> %2 } -define <16 x i32> @__clc_vload16_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)* - %4 = load <16 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <16 x i32> %4 +define <16 x i32> @__clc_vload16_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <16 x i32> addrspace(1)* + %2 = load <16 x i32> addrspace(1)* %1, align 4, !tbaa !3 + ret <16 x i32> %2 } !1 = metadata !{metadata !"char", metadata !5} diff --git a/generic/lib/shared/vstore.cl b/generic/lib/shared/vstore.cl index 17c2c4c..f6d360e 100644 --- a/generic/lib/shared/vstore.cl +++ b/generic/lib/shared/vstore.cl @@ -4,29 +4,29 @@ #define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - mem[offset] = vec.s0; \ - mem[offset+1] = vec.s1; \ + mem[2*offset] = vec.s0; \ + mem[2*offset+1] = vec.s1; \ } \ \ _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - mem[offset] = vec.s0; \ - mem[offset+1] = vec.s1; \ - mem[offset+2] = vec.s2; \ + mem[3*offset] = vec.s0; \ + mem[3*offset+1] = vec.s1; \ + mem[3*offset+2] = vec.s2; \ } \ \ _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - vstore2(vec.lo, offset, mem); \ - 
vstore2(vec.hi, offset+2, mem); \ + vstore2(vec.lo, 0, &mem[offset*4]); \ + vstore2(vec.hi, 1, &mem[offset*4]); \ } \ \ _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - vstore4(vec.lo, offset, mem); \ - vstore4(vec.hi, offset+4, mem); \ + vstore4(vec.lo, 0, &mem[offset*8]); \ + vstore4(vec.hi, 1, &mem[offset*8]); \ } \ \ _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - vstore8(vec.lo, offset, mem); \ - vstore8(vec.hi, offset+8, mem); \ + vstore8(vec.lo, 0, &mem[offset*16]); \ + vstore8(vec.hi, 1, &mem[offset*16]); \ } \ #define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \ @@ -34,12 +34,13 @@ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \ -//int/uint are special... see below #define VSTORE_TYPES() \ VSTORE_ADDR_SPACES(char) \ VSTORE_ADDR_SPACES(uchar) \ VSTORE_ADDR_SPACES(short) \ VSTORE_ADDR_SPACES(ushort) \ + VSTORE_ADDR_SPACES(int) \ + VSTORE_ADDR_SPACES(uint) \ VSTORE_ADDR_SPACES(long) \ VSTORE_ADDR_SPACES(ulong) \ VSTORE_ADDR_SPACES(float) \ @@ -50,58 +51,3 @@ VSTORE_TYPES() #pragma OPENCL EXTENSION cl_khr_fp64 : enable VSTORE_ADDR_SPACES(double) #endif - -VSTORE_VECTORIZE(int, __private) -VSTORE_VECTORIZE(int, __local) -VSTORE_VECTORIZE(uint, __private) -VSTORE_VECTORIZE(uint, __local) - -_CLC_OVERLOAD _CLC_DEF void vstore2(int2 vec, size_t offset, global int *mem) { - mem[offset] = vec.s0; - mem[offset+1] = vec.s1; -} -_CLC_OVERLOAD _CLC_DEF void vstore3(int3 vec, size_t offset, global int *mem) { - mem[offset] = vec.s0; - mem[offset+1] = vec.s1; - mem[offset+2] = vec.s2; -} -_CLC_OVERLOAD _CLC_DEF void vstore2(uint2 vec, size_t offset, global uint *mem) { - mem[offset] = vec.s0; - mem[offset+1] = vec.s1; -} -_CLC_OVERLOAD _CLC_DEF void vstore3(uint3 vec, size_t offset, global uint *mem) { - mem[offset] = vec.s0; - mem[offset+1] = vec.s1; - mem[offset+2] = vec.s2; -} - -/*Note: R600 
probably doesn't support store <2 x ?> and <3 x ?>... so - * they aren't actually overridden here... lowest-common-denominator - */ -_CLC_DECL void __clc_vstore4_int__global(int4 vec, size_t offset, __global int *); -_CLC_DECL void __clc_vstore8_int__global(int8 vec, size_t offset, __global int *); -_CLC_DECL void __clc_vstore16_int__global(int16 vec, size_t offset, __global int *); - -_CLC_OVERLOAD _CLC_DEF void vstore4(int4 vec, size_t offset, global int *x) { - __clc_vstore4_int__global(vec, offset, x); -} -_CLC_OVERLOAD _CLC_DEF void vstore8(int8 vec, size_t offset, global int *x) { - __clc_vstore8_int__global(vec, offset, x); -} -_CLC_OVERLOAD _CLC_DEF void vstore16(int16 vec, size_t offset, global int *x) { - __clc_vstore16_int__global(vec, offset, x); -} - -_CLC_DECL void __clc_vstore4_uint__global(uint4 vec, size_t offset, __global uint *); -_CLC_DECL void __clc_vstore8_uint__global(uint8 vec, size_t offset, __global uint *); -_CLC_DECL void __clc_vstore16_uint__global(uint16 vec, size_t offset, __global uint *); - -_CLC_OVERLOAD _CLC_DEF void vstore4(uint4 vec, size_t offset, global uint *x) { - __clc_vstore4_uint__global(vec, offset, x); -} -_CLC_OVERLOAD _CLC_DEF void vstore8(uint8 vec, size_t offset, global uint *x) { - __clc_vstore8_uint__global(vec, offset, x); -} -_CLC_OVERLOAD _CLC_DEF void vstore16(uint16 vec, size_t offset, global uint *x) { - __clc_vstore16_uint__global(vec, offset, x); -} diff --git a/generic/lib/shared/vstore_if.ll b/generic/lib/shared/vstore_if.ll deleted file mode 100644 index 30eb552..0000000 --- a/generic/lib/shared/vstore_if.ll +++ /dev/null @@ -1,59 +0,0 @@ -;Start int global vstore - -declare void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y) -declare void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y) -declare void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y) -declare void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y) -declare void 
@__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y) - -define void @__clc_vstore2_int__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore3_int__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore4_int__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore8_int__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore16_int__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y) - ret void -} - - -;Start uint global vstore -define void @__clc_vstore2_uint__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore3_uint__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore4_uint__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore8_uint__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore16_uint__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y) - ret void -}
\ No newline at end of file diff --git a/generic/lib/shared/vstore_impl.ll b/generic/lib/shared/vstore_impl.ll index 3baab5e..388bce2 100644 --- a/generic/lib/shared/vstore_impl.ll +++ b/generic/lib/shared/vstore_impl.ll @@ -1,46 +1,35 @@ ; This provides optimized implementations of vstore4/8/16 for 32-bit int/uint -define void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)* - store <2 x i32> %vec, <2 x i32> addrspace(1)* %3, align 4, !tbaa !3 +define void @__clc_vstore2_i32__addr1(<2 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <2 x i32> addrspace(1)* + store <2 x i32> %vec, <2 x i32> addrspace(1)* %1, align 4, !tbaa !3 ret void } -define void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)* - store <3 x i32> %vec, <3 x i32> addrspace(1)* %3, align 4, !tbaa !3 +define void @__clc_vstore3_i32__addr1(<3 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <3 x i32> addrspace(1)* + store <3 x i32> %vec, <3 x i32> addrspace(1)* %1, align 4, !tbaa !3 ret void } -define void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)* - store <4 x i32> %vec, <4 x i32> addrspace(1)* %3, align 4, !tbaa !3 +define void @__clc_vstore4_i32__addr1(<4 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <4 x i32> addrspace(1)* 
+ store <4 x i32> %vec, <4 x i32> addrspace(1)* %1, align 4, !tbaa !3 ret void } -define void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)* - store <8 x i32> %vec, <8 x i32> addrspace(1)* %3, align 4, !tbaa !3 +define void @__clc_vstore8_i32__addr1(<8 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <8 x i32> addrspace(1)* + store <8 x i32> %vec, <8 x i32> addrspace(1)* %1, align 4, !tbaa !3 ret void } -define void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)* - store <16 x i32> %vec, <16 x i32> addrspace(1)* %3, align 4, !tbaa !3 +define void @__clc_vstore16_i32__addr1(<16 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <16 x i32> addrspace(1)* + store <16 x i32> %vec, <16 x i32> addrspace(1)* %1, align 4, !tbaa !3 ret void } - !1 = metadata !{metadata !"char", metadata !5} !2 = metadata !{metadata !"short", metadata !5} !3 = metadata !{metadata !"int", metadata !5} diff --git a/ptx/lib/OVERRIDES b/ptx/lib/OVERRIDES new file mode 100644 index 0000000..475162c --- /dev/null +++ b/ptx/lib/OVERRIDES @@ -0,0 +1,2 @@ +integer/add_sat_if.ll +integer/sub_sat_if.ll diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES index 16ef3ac..87df0b7 100644 --- a/r600/lib/SOURCES +++ b/r600/lib/SOURCES @@ -4,3 +4,5 @@ workitem/get_local_id.ll workitem/get_global_size.ll synchronization/barrier.cl synchronization/barrier_impl.ll +shared/vload.cl +shared/vstore.cl
\ No newline at end of file diff --git a/r600/lib/shared/vload.cl b/r600/lib/shared/vload.cl new file mode 100644 index 0000000..6144dde --- /dev/null +++ b/r600/lib/shared/vload.cl @@ -0,0 +1,92 @@ +#include <clc/clc.h> + +#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return (PRIM_TYPE##2)(x[2*offset] , x[2*offset+1]); \ + } \ +\ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return (PRIM_TYPE##3)(x[3*offset] , x[3*offset+1], x[3*offset+2]); \ + } \ +\ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return (PRIM_TYPE##4)(x[4*offset], x[4*offset+1], x[4*offset+2], x[4*offset+3]); \ + } \ +\ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return (PRIM_TYPE##8)(vload4(0, &x[8*offset]), vload4(1, &x[8*offset])); \ + } \ +\ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return (PRIM_TYPE##16)(vload8(0, &x[16*offset]), vload8(1, &x[16*offset])); \ + } \ + +#define VLOAD_ADDR_SPACES(SCALAR_GENTYPE) \ + VLOAD_VECTORIZE(SCALAR_GENTYPE, __private) \ + VLOAD_VECTORIZE(SCALAR_GENTYPE, __local) \ + VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \ + VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \ + +//int/uint are special... 
see below +#define VLOAD_TYPES() \ + VLOAD_ADDR_SPACES(char) \ + VLOAD_ADDR_SPACES(uchar) \ + VLOAD_ADDR_SPACES(short) \ + VLOAD_ADDR_SPACES(ushort) \ + VLOAD_ADDR_SPACES(long) \ + VLOAD_ADDR_SPACES(ulong) \ + VLOAD_ADDR_SPACES(float) \ + +VLOAD_TYPES() + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + VLOAD_ADDR_SPACES(double) +#endif + +//Assembly overrides start here + +VLOAD_VECTORIZE(int, __private) +VLOAD_VECTORIZE(int, __local) +VLOAD_VECTORIZE(int, __constant) +VLOAD_VECTORIZE(uint, __private) +VLOAD_VECTORIZE(uint, __local) +VLOAD_VECTORIZE(uint, __constant) + +_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) { + return (int3)(vload2(0, &x[3*offset]), x[3*offset+2]); +} +_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) { + return (uint3)(vload2(0, &x[3*offset]), x[3*offset+2]); +} + +//We only define functions for typeN vloadN(), and then just bitcast the result for unsigned types +#define _CLC_VLOAD_ASM_DECL(PRIM_TYPE,LLVM_SCALAR_TYPE,ADDR_SPACE,ADDR_SPACE_ID) \ +_CLC_DECL PRIM_TYPE##2 __clc_vload2_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (const ADDR_SPACE PRIM_TYPE *); \ +_CLC_DECL PRIM_TYPE##4 __clc_vload4_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (const ADDR_SPACE PRIM_TYPE *); \ +_CLC_DECL PRIM_TYPE##8 __clc_vload8_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (const ADDR_SPACE PRIM_TYPE *); \ +_CLC_DECL PRIM_TYPE##16 __clc_vload16_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (const ADDR_SPACE PRIM_TYPE *); \ + +#define _CLC_VLOAD_ASM_DEFINE(PRIM_TYPE,S_PRIM_TYPE, LLVM_SCALAR_TYPE,VEC_WIDTH,ADDR_SPACE,ADDR_SPACE_ID) \ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##VEC_WIDTH vload##VEC_WIDTH (size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return __builtin_astype(__clc_vload##VEC_WIDTH##_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID ((const ADDR_SPACE S_PRIM_TYPE *)&x[VEC_WIDTH * offset]), PRIM_TYPE##VEC_WIDTH); \ + } \ + +/*Note: R600 back-end doesn't support load <3 x ?>... 
so + * those functions aren't actually overridden here + */ +#define _CLC_VLOAD_ASM_OVERLOAD_SIZES(PRIM_TYPE,S_PRIM_TYPE,LLVM_TYPE,ADDR_SPACE,ADDR_SPACE_ID) \ + _CLC_VLOAD_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 2, ADDR_SPACE, ADDR_SPACE_ID) \ + _CLC_VLOAD_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 4, ADDR_SPACE, ADDR_SPACE_ID) \ + _CLC_VLOAD_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 8, ADDR_SPACE, ADDR_SPACE_ID) \ + _CLC_VLOAD_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 16, ADDR_SPACE, ADDR_SPACE_ID) \ + +#define _CLC_VLOAD_ASM_OVERLOAD_ADDR_SPACES(PRIM_TYPE,S_PRIM_TYPE,LLVM_TYPE) \ + _CLC_VLOAD_ASM_OVERLOAD_SIZES(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, global, 1) \ + +#define _CLC_VLOAD_ASM_OVERLOADS() \ + _CLC_VLOAD_ASM_DECL(int,i32,__global,1) \ + _CLC_VLOAD_ASM_OVERLOAD_ADDR_SPACES(int,int,i32) \ + _CLC_VLOAD_ASM_OVERLOAD_ADDR_SPACES(uint,int,i32) \ + +_CLC_VLOAD_ASM_OVERLOADS()
\ No newline at end of file diff --git a/r600/lib/shared/vstore.cl b/r600/lib/shared/vstore.cl new file mode 100644 index 0000000..a150849 --- /dev/null +++ b/r600/lib/shared/vstore.cl @@ -0,0 +1,104 @@ +#include <clc/clc.h> + +#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable + +#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ + _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ + mem[2*offset] = vec.s0; \ + mem[2*offset+1] = vec.s1; \ + } \ +\ + _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ + mem[3*offset] = vec.s0; \ + mem[3*offset+1] = vec.s1; \ + mem[3*offset+2] = vec.s2; \ + } \ +\ + _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ + vstore2(vec.lo, 0, &mem[offset*4]); \ + vstore2(vec.hi, 1, &mem[offset*4]); \ + } \ +\ + _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ + vstore4(vec.lo, 0, &mem[offset*8]); \ + vstore4(vec.hi, 1, &mem[offset*8]); \ + } \ +\ + _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ + vstore8(vec.lo, 0, &mem[offset*16]); \ + vstore8(vec.hi, 1, &mem[offset*16]); \ + } \ + +#define VSTORE_ADDR_SPACES(SCALAR_GENTYPE) \ + VSTORE_VECTORIZE(SCALAR_GENTYPE, __private) \ + VSTORE_VECTORIZE(SCALAR_GENTYPE, __local) \ + VSTORE_VECTORIZE(SCALAR_GENTYPE, __global) \ + +//int/uint are special... 
see below +#define VSTORE_TYPES() \ + VSTORE_ADDR_SPACES(char) \ + VSTORE_ADDR_SPACES(uchar) \ + VSTORE_ADDR_SPACES(short) \ + VSTORE_ADDR_SPACES(ushort) \ + VSTORE_ADDR_SPACES(long) \ + VSTORE_ADDR_SPACES(ulong) \ + VSTORE_ADDR_SPACES(float) \ + +VSTORE_TYPES() + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + VSTORE_ADDR_SPACES(double) +#endif + +VSTORE_VECTORIZE(int, __private) +VSTORE_VECTORIZE(int, __local) +VSTORE_VECTORIZE(uint, __private) +VSTORE_VECTORIZE(uint, __local) + +_CLC_OVERLOAD _CLC_DEF void vstore3(int3 vec, size_t offset, global int *mem) { + mem[3*offset] = vec.s0; + mem[3*offset+1] = vec.s1; + mem[3*offset+2] = vec.s2; +} +_CLC_OVERLOAD _CLC_DEF void vstore3(uint3 vec, size_t offset, global uint *mem) { + mem[3*offset] = vec.s0; + mem[3*offset+1] = vec.s1; + mem[3*offset+2] = vec.s2; +} + +/*Note: R600 doesn't support store <3 x ?>... so + * those functions aren't actually overridden here... lowest-common-denominator + */ + +//We only define functions for signed_type vstoreN(), and then just cast the pointers/vectors for unsigned types +#define _CLC_VSTORE_ASM_DECL(PRIM_TYPE,LLVM_SCALAR_TYPE,ADDR_SPACE,ADDR_SPACE_ID) \ +_CLC_DECL void __clc_vstore2_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (PRIM_TYPE##2, ADDR_SPACE PRIM_TYPE *); \ +_CLC_DECL void __clc_vstore4_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (PRIM_TYPE##4, ADDR_SPACE PRIM_TYPE *); \ +_CLC_DECL void __clc_vstore8_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (PRIM_TYPE##8, ADDR_SPACE PRIM_TYPE *); \ +_CLC_DECL void __clc_vstore16_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (PRIM_TYPE##16, ADDR_SPACE PRIM_TYPE *); \ + +#define _CLC_VSTORE_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_SCALAR_TYPE, VEC_WIDTH, ADDR_SPACE, ADDR_SPACE_ID) \ + _CLC_OVERLOAD _CLC_DEF void vstore##VEC_WIDTH(PRIM_TYPE##VEC_WIDTH vec, size_t offset, ADDR_SPACE PRIM_TYPE *x) { \ + __clc_vstore##VEC_WIDTH##_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (__builtin_astype(vec, S_PRIM_TYPE##VEC_WIDTH), 
(ADDR_SPACE S_PRIM_TYPE *)&x[ VEC_WIDTH * offset]); \ + } \ + +/*Note: R600 back-end doesn't support store <3 x ?>... so + * those functions aren't actually overridden here... When the back-end supports + * that, then add them here, and remove the vstore3 definitions from above. + */ +#define _CLC_VSTORE_ASM_OVERLOAD_SIZES(PRIM_TYPE,S_PRIM_TYPE,LLVM_TYPE,ADDR_SPACE,ADDR_SPACE_ID) \ + _CLC_VSTORE_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 2, ADDR_SPACE, ADDR_SPACE_ID) \ + _CLC_VSTORE_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 4, ADDR_SPACE, ADDR_SPACE_ID) \ + _CLC_VSTORE_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 8, ADDR_SPACE, ADDR_SPACE_ID) \ + _CLC_VSTORE_ASM_DEFINE(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, 16, ADDR_SPACE, ADDR_SPACE_ID) \ + +#define _CLC_VSTORE_ASM_OVERLOAD_ADDR_SPACES(PRIM_TYPE,S_PRIM_TYPE,LLVM_TYPE) \ + _CLC_VSTORE_ASM_OVERLOAD_SIZES(PRIM_TYPE, S_PRIM_TYPE, LLVM_TYPE, global, 1) \ + +#define _CLC_VSTORE_ASM_OVERLOADS() \ + _CLC_VSTORE_ASM_DECL(int,i32,__global,1) \ + _CLC_VSTORE_ASM_OVERLOAD_ADDR_SPACES(int,int,i32) \ + _CLC_VSTORE_ASM_OVERLOAD_ADDR_SPACES(uint,int,i32) \ + +_CLC_VSTORE_ASM_OVERLOADS()
\ No newline at end of file diff --git a/utils/prepare-builtins.cpp b/utils/prepare-builtins.cpp index be1624b..4ad21e8 100644 --- a/utils/prepare-builtins.cpp +++ b/utils/prepare-builtins.cpp @@ -10,6 +10,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" #include "llvm/Support/ToolOutputFile.h" +#include "llvm/Config/config.h" using namespace llvm; @@ -66,7 +67,11 @@ int main(int argc, char **argv) { std::string ErrorInfo; OwningPtr<tool_output_file> Out (new tool_output_file(OutputFilename.c_str(), ErrorInfo, +#if LLVM_VERSION_MAJOR > 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR > 3) + sys::fs::F_Binary)); +#else raw_fd_ostream::F_Binary)); +#endif if (!ErrorInfo.empty()) { errs() << ErrorInfo << '\n'; exit(1); |