From d925208c27d8b2fdff4dc31f3f5d6a67cbe5b26f Mon Sep 17 00:00:00 2001
From: Dmitry Cherkassov <dcherkassov@gmail.com>
Date: Sun, 7 Apr 2013 00:22:34 +0400
Subject: R600: Add @llvm.AMDGPU.tex intrinsic generation for read_imagef.
 [WIP]

This patch adds a read_imagef function that generates target intrinsics.

Some details:
The @llvm.AMDGPU.dummytex instruction is generated instead of @llvm.AMDGPU.tex
because the target intrinsic has a (<4 x float>, i32, i32, i32) signature,
while the recent OpenCL frontend generates the opencl.image_t* type for image_t.

So the workaround is to patch that function call to @llvm.AMDGPU.tex
in an LLVM pass.

Signed-off-by: Dmitry Cherkassov <dcherkassov@gmail.com>
---
 generic/include/clc/clctypes.h      | 10 +++++++++
 generic/include/clc/image/image2d.h |  1 +
 generic/lib/SOURCES                 |  2 ++
 generic/lib/image/image2d.cl        | 10 +++++++++
 generic/lib/image/image2d.ll        | 42 +++++++++++++++++++++++++++++++++++++
 5 files changed, 65 insertions(+)
 create mode 100644 generic/include/clc/image/image2d.h
 create mode 100644 generic/lib/image/image2d.cl
 create mode 100644 generic/lib/image/image2d.ll

diff --git a/generic/include/clc/clctypes.h b/generic/include/clc/clctypes.h
index ca729f7..4787320 100644
--- a/generic/include/clc/clctypes.h
+++ b/generic/include/clc/clctypes.h
@@ -7,6 +7,16 @@ typedef unsigned short ushort;
 typedef unsigned int uint;
 typedef unsigned long ulong;
 
+/* Sampler constants (CLK_* names): filter mode, coordinate normalization, addressing mode. */
+#define CLK_FILTER_NEAREST 0 
+#define CLK_FILTER_LINEAR 1

+#define CLK_NORMALIZED_COORDS_FALSE 0
+#define CLK_NORMALIZED_COORDS_TRUE 1 /* NOTE(review): same value as CLK_FILTER_LINEAR -- confirm the two are never OR-ed into one sampler value */
+#define CLK_ADDRESS_CLAMP_TO_EDGE 2
+#define CLK_ADDRESS_REPEAT 4
+
+
 /* 6.1.2 Built-in Vector Data Types */
 
 typedef __attribute__((ext_vector_type(2))) char char2;
diff --git a/generic/include/clc/image/image2d.h b/generic/include/clc/image/image2d.h
new file mode 100644
index 0000000..1c6a6ce
--- /dev/null
+++ b/generic/include/clc/image/image2d.h
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL float4 read_imagef(image2d_t image, sampler_t sampler, float2 coord); /* defined in generic/lib/image/image2d.cl */
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 9ac08bd..eeedecd 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -32,3 +32,5 @@ shared/vstore.cl
 shared/vstore_impl.ll
 workitem/get_global_id.cl
 workitem/get_global_size.cl
+image/image2d.cl
+image/image2d.ll
diff --git a/generic/lib/image/image2d.cl b/generic/lib/image/image2d.cl
new file mode 100644
index 0000000..24315a5
--- /dev/null
+++ b/generic/lib/image/image2d.cl
@@ -0,0 +1,10 @@
+#define cl_clang_storage_class_specifiers
+#include <clc/clc.h>
+#include <clc/image/image2d.h>
+
+_CLC_DECL float4 __read_imagef(image2d_t, sampler_t, float2); /* body lives in image2d.ll, so it must not be declared inline here */
+/* read_imagef for (image2d_t, sampler_t, float2): forwards all three arguments unchanged to __read_imagef. */
+_CLC_OVERLOAD _CLC_DEF float4 read_imagef(image2d_t image, sampler_t sampler, float2 coord) {
+	return __read_imagef(image, sampler, coord);
+}
+
diff --git a/generic/lib/image/image2d.ll b/generic/lib/image/image2d.ll
new file mode 100644
index 0000000..c65937a
--- /dev/null
+++ b/generic/lib/image/image2d.ll
@@ -0,0 +1,42 @@
+%opencl.image2d_t = type opaque
+
+; Target texture intrinsic: takes i32 operands where @llvm.AMDGPU.dummytex below takes the image pointer.
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
+
+declare <4 x float> @llvm.AMDGPU.dummytex(<4 x float>, %opencl.image2d_t*, i32, i32) readnone
+
+; NOTE: image (resource ID) and sampler (sampler ID) must be immediates, so they are patched by a separate LLVM pass
+
+; define <4 x float> @__read_imagef_norm(i32 %image, i32 %sampler, <4 x float> %coord) nounwind readnone alwaysinline {
+; 	%call = call <4 x float> @llvm.AMDGPU.tex (<4 x float> %coord, i32 0, i32 0, i32 1)
+; 	ret <4 x float> %call
+; }
+
+; define <4 x float> @__read_imagef_unorm(i32 %image, i32 %sampler, <4 x float> %coord) nounwind readnone alwaysinline {
+; 	%call = call <4 x float> @llvm.AMDGPU.tex (<4 x float> %coord, i32 0, i32 0, i32 0)
+; 	ret <4 x float> %call
+; }
+
+; __read_imagef(image, sampler, coord) -> <4 x float>
+; Widens the 2-component coordinate to four lanes and forwards it, together with
+; %image and %sampler, to @llvm.AMDGPU.dummytex. The final i32 operand is 1 when
+; %sampler is non-zero and 0 when it is zero.
+define <4 x float> @__read_imagef(%opencl.image2d_t* nocapture %image, i32 %sampler, <2 x float> %coord) alwaysinline {
+entry:
+  ; Lanes 0-1 come from %coord, lanes 2-3 from the zero vector: (x, y, 0.0, 0.0).
+  %coord4 = shufflevector <2 x float> %coord, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %sampler.nonzero = icmp ne i32 %sampler, 0
+  br i1 %sampler.nonzero, label %tex.flag1, label %tex.flag0
+
+tex.flag1:                                        ; %sampler != 0
+  %texel.flag1 = call <4 x float> @llvm.AMDGPU.dummytex(<4 x float> %coord4, %opencl.image2d_t* %image, i32 %sampler, i32 1)
+  br label %done
+
+tex.flag0:                                        ; %sampler == 0
+  %texel.flag0 = call <4 x float> @llvm.AMDGPU.dummytex(<4 x float> %coord4, %opencl.image2d_t* %image, i32 %sampler, i32 0)
+  br label %done
+
+done:
+  %texel = phi <4 x float> [ %texel.flag1, %tex.flag1 ], [ %texel.flag0, %tex.flag0 ]
+  ret <4 x float> %texel
+}
-- 
cgit v1.2.3