From d925208c27d8b2fdff4dc31f3f5d6a67cbe5b26f Mon Sep 17 00:00:00 2001 From: Dmitry Cherkassov Date: Sun, 7 Apr 2013 00:22:34 +0400 Subject: R600: Add @llvm.AMDGPU.tex intrinsic generation for read_imagef. [WIP] This patch adds the read_imagef function, which generates target intrinsics. Some details: a call to @llvm.AMDGPU.dummytex is emitted instead of @llvm.AMDGPU.tex because the target intrinsic has a (<4 x float>, i32, i32, i32) signature, while the recent OpenCL frontend generates the opencl.image_t* type for image_t. The workaround is to patch that function call into @llvm.AMDGPU.tex in an LLVM pass. Signed-off-by: Dmitry Cherkassov --- generic/include/clc/clctypes.h | 10 +++++++++ generic/include/clc/image/image2d.h | 1 + generic/lib/SOURCES | 2 ++ generic/lib/image/image2d.cl | 10 +++++++++ generic/lib/image/image2d.ll | 42 +++++++++++++++++++++++++++++++++++++ 5 files changed, 65 insertions(+) create mode 100644 generic/include/clc/image/image2d.h create mode 100644 generic/lib/image/image2d.cl create mode 100644 generic/lib/image/image2d.ll diff --git a/generic/include/clc/clctypes.h b/generic/include/clc/clctypes.h index ca729f7..4787320 100644 --- a/generic/include/clc/clctypes.h +++ b/generic/include/clc/clctypes.h @@ -7,6 +7,16 @@ typedef unsigned short ushort; typedef unsigned int uint; typedef unsigned long ulong; +/* image types */ +#define CLK_FILTER_NEAREST 0 +#define CLK_FILTER_LINEAR 1 + +#define CLK_NORMALIZED_COORDS_FALSE 0 +#define CLK_NORMALIZED_COORDS_TRUE 1 +#define CLK_ADDRESS_CLAMP_TO_EDGE 2 +#define CLK_ADDRESS_REPEAT 4 + + /* 6.1.2 Built-in Vector Data Types */ typedef __attribute__((ext_vector_type(2))) char char2; diff --git a/generic/include/clc/image/image2d.h b/generic/include/clc/image/image2d.h new file mode 100644 index 0000000..1c6a6ce --- /dev/null +++ b/generic/include/clc/image/image2d.h @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DEF float4 read_imagef (image2d_t, sampler_t, float2); diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES index 
9ac08bd..eeedecd 100644 --- a/generic/lib/SOURCES +++ b/generic/lib/SOURCES @@ -32,3 +32,5 @@ shared/vstore.cl shared/vstore_impl.ll workitem/get_global_id.cl workitem/get_global_size.cl +image/image2d.cl +image/image2d.ll diff --git a/generic/lib/image/image2d.cl b/generic/lib/image/image2d.cl new file mode 100644 index 0000000..24315a5 --- /dev/null +++ b/generic/lib/image/image2d.cl @@ -0,0 +1,10 @@ +#define cl_clang_storage_class_specifiers +#include +#include + +_CLC_DECL inline float4 __read_imagef(image2d_t, sampler_t, float2); + +_CLC_OVERLOAD _CLC_DEF inline float4 read_imagef (image2d_t image, sampler_t sampler, float2 coord) { + return __read_imagef(image, sampler, coord); +} + diff --git a/generic/lib/image/image2d.ll b/generic/lib/image/image2d.ll new file mode 100644 index 0000000..c65937a --- /dev/null +++ b/generic/lib/image/image2d.ll @@ -0,0 +1,42 @@ +%opencl.image2d_t = type opaque + +; Target texture intrinsic: takes the <4 x float> coordinate vector followed by three i32 operands. +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone + +declare <4 x float> @llvm.AMDGPU.dummytex(<4 x float>, %opencl.image2d_t*, i32, i32) readnone + +; NOTE: image (resource ID) and sampler (sampler ID) must be immediates, so they are patched in by a separate LLVM pass + +; define <4 x float> @__read_imagef_norm(i32 %image, i32 %sampler, <4 x float> %coord) nounwind readnone alwaysinline { +; %call = call <4 x float> @llvm.AMDGPU.tex (<4 x float> %coord, i32 0, i32 0, i32 1) +; ret <4 x float> %call +; } + +; define <4 x float> @__read_imagef_unorm(i32 %image, i32 %sampler, <4 x float> %coord) nounwind readnone alwaysinline { +; %call = call <4 x float> @llvm.AMDGPU.tex (<4 x float> %coord, i32 0, i32 0, i32 0) +; ret <4 x float> %call +; } + +define <4 x float> @__read_imagef(%opencl.image2d_t* nocapture %image, i32 %sampler, <2 x float> %coord) alwaysinline { +entry: + %0 = extractelement <2 x float> %coord, i32 0 + %vecinit = insertelement <4 x float> undef, float %0, i32 0 + %1 = 
extractelement <2 x float> %coord, i32 1 + %vecinit1 = insertelement <4 x float> %vecinit, float %1, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit1, float 0.000000e+00, i32 2 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3 + %tobool = icmp eq i32 %sampler, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: ; preds = %entry; taken when %sampler != 0, last dummytex operand is 1 + %call.i = call <4 x float> @llvm.AMDGPU.dummytex(<4 x float> %vecinit3, %opencl.image2d_t* %image, i32 %sampler, i32 1) + br label %return + +if.else: ; preds = %entry; taken when %sampler == 0, last dummytex operand is 0 + %call.i1 = call <4 x float> @llvm.AMDGPU.dummytex(<4 x float> %vecinit3, %opencl.image2d_t* %image, i32 %sampler, i32 0) + br label %return + +return: ; preds = %if.else, %if.then + %retval.0 = phi <4 x float> [ %call.i, %if.then ], [ %call.i1, %if.else ] + ret <4 x float> %retval.0 +} -- cgit v1.2.3