summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDmitry Cherkassov <dcherkassov@gmail.com>2013-04-07 00:22:34 +0400
committerTom Stellard <thomas.stellard@amd.com>2013-07-24 10:14:24 -0700
commitd925208c27d8b2fdff4dc31f3f5d6a67cbe5b26f (patch)
tree570aa6655d4a9487f86855bdf150ddfbfac7ece2
parentf2cf99288cca34bebe9f131704e984524ff09a42 (diff)
R600: Add @llvm.AMDGPU.tex intrinsic generation for read_imagef. [WIP]
This patch adds the read_imagef function, which generates target intrinsics. Some details: the @llvm.AMDGPU.dummytex intrinsic is emitted instead of @llvm.AMDGPU.tex because the target intrinsic has a (<4 x float>, i32, i32, i32) signature, whereas the recent OpenCL frontend generates the opencl.image_t* type for image_t. The workaround is to patch that function call into @llvm.AMDGPU.tex in an LLVM pass. Signed-off-by: Dmitry Cherkassov <dcherkassov@gmail.com>
-rw-r--r--generic/include/clc/clctypes.h10
-rw-r--r--generic/include/clc/image/image2d.h1
-rw-r--r--generic/lib/SOURCES2
-rw-r--r--generic/lib/image/image2d.cl10
-rw-r--r--generic/lib/image/image2d.ll42
5 files changed, 65 insertions, 0 deletions
diff --git a/generic/include/clc/clctypes.h b/generic/include/clc/clctypes.h
index ca729f7..4787320 100644
--- a/generic/include/clc/clctypes.h
+++ b/generic/include/clc/clctypes.h
@@ -7,6 +7,16 @@ typedef unsigned short ushort;
typedef unsigned int uint;
typedef unsigned long ulong;
+/* sampler constants: filter mode, coordinate normalization, addressing mode */
+#define CLK_FILTER_NEAREST 0
+#define CLK_FILTER_LINEAR 1
+
+#define CLK_NORMALIZED_COORDS_FALSE 0
+#define CLK_NORMALIZED_COORDS_TRUE 1
+#define CLK_ADDRESS_CLAMP_TO_EDGE 2
+#define CLK_ADDRESS_REPEAT 4
+
+
/* 6.1.2 Built-in Vector Data Types */
typedef __attribute__((ext_vector_type(2))) char char2;
diff --git a/generic/include/clc/image/image2d.h b/generic/include/clc/image/image2d.h
new file mode 100644
index 0000000..1c6a6ce
--- /dev/null
+++ b/generic/include/clc/image/image2d.h
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DEF float4 read_imagef (image2d_t, sampler_t, float2); /* parameters in order: image, sampler, coord */
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 9ac08bd..eeedecd 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -32,3 +32,5 @@ shared/vstore.cl
shared/vstore_impl.ll
workitem/get_global_id.cl
workitem/get_global_size.cl
+image/image2d.cl
+image/image2d.ll
diff --git a/generic/lib/image/image2d.cl b/generic/lib/image/image2d.cl
new file mode 100644
index 0000000..24315a5
--- /dev/null
+++ b/generic/lib/image/image2d.cl
@@ -0,0 +1,10 @@
+#define cl_clang_storage_class_specifiers /* NOTE(review): this only defines a macro; if the intent was to enable the extension, a "#pragma OPENCL EXTENSION ... : enable" may be needed - confirm */
+#include <clc/clc.h>
+#include <clc/image/image2d.h>
+
+_CLC_DECL inline float4 __read_imagef(image2d_t, sampler_t, float2); /* implemented in image2d.ll as @__read_imagef */
+
+_CLC_OVERLOAD _CLC_DEF inline float4 read_imagef (image2d_t image, sampler_t sampler, float2 coord) { /* thin wrapper: forwards all three arguments unchanged */
+ return __read_imagef(image, sampler, coord);
+}
+
diff --git a/generic/lib/image/image2d.ll b/generic/lib/image/image2d.ll
new file mode 100644
index 0000000..c65937a
--- /dev/null
+++ b/generic/lib/image/image2d.ll
@@ -0,0 +1,42 @@
+%opencl.image2d_t = type opaque
+
+; Target texture intrinsic: takes a <4 x float> operand followed by three i32 operands.
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
+
+declare <4 x float> @llvm.AMDGPU.dummytex(<4 x float>, %opencl.image2d_t*, i32, i32) readnone ; same operand count as @llvm.AMDGPU.tex, but the 2nd operand is the opaque image pointer instead of an i32
+
+; NOTE: image (resource ID) and sampler (sampler ID) must be immediates, so they are patched by a separate LLVM pass
+
+; define <4 x float> @__read_imagef_norm(i32 %image, i32 %sampler, <4 x float> %coord) nounwind readnone alwaysinline {
+; %call = call <4 x float> @llvm.AMDGPU.tex (<4 x float> %coord, i32 0, i32 0, i32 1)
+; ret <4 x float> %call
+; }
+
+; define <4 x float> @__read_imagef_unorm(i32 %image, i32 %sampler, <4 x float> %coord) nounwind readnone alwaysinline {
+; %call = call <4 x float> @llvm.AMDGPU.tex (<4 x float> %coord, i32 0, i32 0, i32 0)
+; ret <4 x float> %call
+; }
+
+define <4 x float> @__read_imagef(%opencl.image2d_t* nocapture %image, i32 %sampler, <2 x float> %coord) alwaysinline { ; widens %coord to a 4-vector, then issues one of two dummytex calls selected by %sampler
+entry:
+ %0 = extractelement <2 x float> %coord, i32 0 ; first component of %coord
+ %vecinit = insertelement <4 x float> undef, float %0, i32 0 ; lane 0 <- first component
+ %1 = extractelement <2 x float> %coord, i32 1 ; second component of %coord
+ %vecinit1 = insertelement <4 x float> %vecinit, float %1, i32 1 ; lane 1 <- second component
+ %vecinit2 = insertelement <4 x float> %vecinit1, float 0.000000e+00, i32 2 ; lane 2 zero-filled
+ %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3 ; lane 3 zero-filled
+ %tobool = icmp eq i32 %sampler, 0 ; true only when the whole sampler value is 0
+ br i1 %tobool, label %if.else, label %if.then ; NOTE(review): any non-zero sampler (including one with only CLK_ADDRESS_* bits set) goes to if.then - confirm this is intended rather than testing a single bit
+
+if.then: ; preds = %entry
+ %call.i = call <4 x float> @llvm.AMDGPU.dummytex(<4 x float> %vecinit3, %opencl.image2d_t* %image, i32 %sampler, i32 1) ; last operand 1; NOTE(review): presumably selects normalized coordinates - confirm against the pass that rewrites this call
+ br label %return
+
+if.else: ; preds = %entry
+ %call.i1 = call <4 x float> @llvm.AMDGPU.dummytex(<4 x float> %vecinit3, %opencl.image2d_t* %image, i32 %sampler, i32 0) ; identical call except the last operand is 0
+ br label %return
+
+return: ; preds = %if.else, %if.then
+ %retval.0 = phi <4 x float> [ %call.i, %if.then ], [ %call.i1, %if.else ] ; result of whichever call executed
+ ret <4 x float> %retval.0
+}