summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYang Rong <rong.r.yang@intel.com>2013-07-02 15:22:24 +0800
committerZhigang Gong <zhigang.gong@linux.intel.com>2013-07-03 18:19:42 +0800
commit046834c47eb579eb4b66df366ebb405c2bf97a1d (patch)
treee2c72fd2009b8f4f3a26b59674b1bec37ad0c73c
parent942f9f9277b5c22723734883989eb676b4ab3d0b (diff)
Fix atomic test failed in GT1.
Barrier only ensure one work group finish, can't guarantee all work item's atomic ops have finished before the last atomic_add. So use atomic_xchg to update first work group's local buffer to other global buffer position. Signed-off-by: Yang Rong <rong.r.yang@intel.com> Reviewed-by: Xing, Homer <homer.xing@intel.com>
-rw-r--r--kernels/compiler_atomic_functions.cl19
-rw-r--r--utests/compiler_atomic_functions.cpp23
2 files changed, 25 insertions, 17 deletions
diff --git a/kernels/compiler_atomic_functions.cl b/kernels/compiler_atomic_functions.cl
index 61ce2f44..fbc16fbf 100644
--- a/kernels/compiler_atomic_functions.cl
+++ b/kernels/compiler_atomic_functions.cl
@@ -1,14 +1,21 @@
__kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __global int *src) {
int lid = get_local_id(0);
int i = lid % 12;
- atomic_xchg(&tmp[4], -1);
+ if(lid == 0) {
+ for(int j=0; j<12; j=j+1) {
+ atomic_xchg(&tmp[j], 0);
+ }
+ atomic_xchg(&tmp[4], -1);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
switch(i) {
case 0: atomic_inc(&tmp[i]); break;
case 1: atomic_dec(&tmp[i]); break;
case 2: atomic_add(&tmp[i], src[lid]); break;
case 3: atomic_sub(&tmp[i], src[lid]); break;
- case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid / 4))); break;
- case 5: atomic_or (&tmp[i], src[lid]<<(lid / 4)); break;
+ case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid / 16))); break;
+ case 5: atomic_or (&tmp[i], src[lid]<<(lid / 16)); break;
case 6: atomic_xor(&tmp[i], src[lid]); break;
case 7: atomic_min(&tmp[i], -src[lid]); break;
case 8: atomic_max(&tmp[i], src[lid]); break;
@@ -23,8 +30,8 @@ __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __g
case 1: atomic_dec(&dst[i]); break;
case 2: atomic_add(&dst[i], src[lid]); break;
case 3: atomic_sub(&dst[i], src[lid]); break;
- case 4: atomic_and(&dst[i], ~(src[lid]<<(lid / 4))); break;
- case 5: atomic_or (&dst[i], src[lid]<<(lid / 4)); break;
+ case 4: atomic_and(&dst[i], ~(src[lid]<<(lid / 16))); break;
+ case 5: atomic_or (&dst[i], src[lid]<<(lid / 16)); break;
case 6: atomic_xor(&dst[i], src[lid]); break;
case 7: atomic_min(&dst[i], -src[lid]); break;
case 8: atomic_max(&dst[i], src[lid]); break;
@@ -38,6 +45,6 @@ __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __g
if(get_global_id(0) == 0) {
for(i=0; i<12; i=i+1)
- atomic_add(&dst[i], tmp[i]);
+ atomic_xchg(&dst[i+12], tmp[i]);
}
}
diff --git a/utests/compiler_atomic_functions.cpp b/utests/compiler_atomic_functions.cpp
index 571e0c67..65f1c5a7 100644
--- a/utests/compiler_atomic_functions.cpp
+++ b/utests/compiler_atomic_functions.cpp
@@ -4,12 +4,12 @@
#include <string.h>
#define GROUP_NUM 16
-#define LOCAL_SIZE 64
+#define LOCAL_SIZE 256
static void cpu_compiler_atomic(int *dst, int *src)
{
dst[4] = 0xffffffff;
int tmp[16] = { 0 };
-
+ tmp[4] = -1;
for(int j=0; j<LOCAL_SIZE; j++) {
int i = j % 12;
@@ -18,8 +18,8 @@ static void cpu_compiler_atomic(int *dst, int *src)
case 1: tmp[i] -= 1; break;
case 2: tmp[i] += src[j]; break;
case 3: tmp[i] -= src[j]; break;
- case 4: tmp[i] &= ~(src[j]<<(j>>2)); break;
- case 5: tmp[i] |= src[j]<<(j>>2); break;
+ case 4: tmp[i] &= ~(src[j]<<(j>>4)); break;
+ case 5: tmp[i] |= src[j]<<(j>>4); break;
case 6: tmp[i] ^= src[j]; break;
case 7: tmp[i] = tmp[i] < -src[j] ? tmp[i] : -src[j]; break;
case 8: tmp[i] = tmp[i] > src[j] ? tmp[i] : src[j]; break;
@@ -39,8 +39,8 @@ static void cpu_compiler_atomic(int *dst, int *src)
case 1: dst[i] -= 1; break;
case 2: dst[i] += src[j]; break;
case 3: dst[i] -= src[j]; break;
- case 4: dst[i] &= ~(src[j]<<(j>>2)); break;
- case 5: dst[i] |= src[j]<<(j>>2); break;
+ case 4: dst[i] &= ~(src[j]<<(j>>4)); break;
+ case 5: dst[i] |= src[j]<<(j>>4); break;
case 6: dst[i] ^= src[j]; break;
case 7: dst[i] = dst[i] < -src[j] ? dst[i] : -src[j]; break;
case 8: dst[i] = dst[i] > src[j] ? dst[i] : src[j]; break;
@@ -53,27 +53,28 @@ static void cpu_compiler_atomic(int *dst, int *src)
}
for(int i=0; i<12; i++)
- dst[i] += tmp[i];
+ dst[i+12] = tmp[i];
}
static void compiler_atomic_functions(void)
{
const size_t n = GROUP_NUM * LOCAL_SIZE;
- int cpu_dst[16] = {0}, cpu_src[256];
+ int cpu_dst[24] = {0}, cpu_src[256];
globals[0] = n;
locals[0] = LOCAL_SIZE;
// Setup kernel and buffers
OCL_CREATE_KERNEL("compiler_atomic_functions");
- OCL_CREATE_BUFFER(buf[0], 0, 16 * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[0], 0, 24 * sizeof(int), NULL);
OCL_CREATE_BUFFER(buf[1], 0, locals[0] * sizeof(int), NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
OCL_SET_ARG(1, 16 * sizeof(int), NULL);
OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
OCL_MAP_BUFFER(0);
- memset(buf_data[0], 0, 16 * sizeof(int));
+ memset(buf_data[0], 0, 24 * sizeof(int));
+ ((int *)buf_data[0])[4] = -1;
OCL_UNMAP_BUFFER(0);
OCL_MAP_BUFFER(1);
@@ -86,7 +87,7 @@ static void compiler_atomic_functions(void)
OCL_MAP_BUFFER(0);
// Check results
- for(int i=0; i<12; i++) {
+ for(int i=0; i<24; i++) {
//printf("The dst(%d) gpu(0x%x) cpu(0x%x)\n", i, ((uint32_t *)buf_data[0])[i], cpu_dst[i]);
OCL_ASSERT(((int *)buf_data[0])[i] == cpu_dst[i]);
}