summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Denis Steckelmacher <steckdenis@yahoo.fr>, 2011-07-27 13:51:15 +0200
committer: Denis Steckelmacher <steckdenis@yahoo.fr>, 2011-07-27 13:51:15 +0200
commit: a1f3d501a7a0d73fc7594d5f0c3381744ed867a8 (patch)
tree: 9473ab7b45114f5c3b1e112efece840eb15afb0f
parent: 82d497c0aa6b8fb439f709eecbbc99d1e515579f (diff)
Optimizations and code factoring.
-rw-r--r-- src/core/cpu/device.cpp | 58
-rw-r--r-- src/core/cpu/device.h | 7
-rw-r--r-- src/core/cpu/kernel.cpp | 33
-rw-r--r-- src/core/cpu/kernel.h | 1
-rw-r--r-- src/core/deviceinterface.h | 2
-rw-r--r-- src/core/events.cpp | 70
-rw-r--r-- src/core/events.h | 4
7 files changed, 90 insertions, 85 deletions
diff --git a/src/core/cpu/device.cpp b/src/core/cpu/device.cpp
index 97c6973..907cdf0 100644
--- a/src/core/cpu/device.cpp
+++ b/src/core/cpu/device.cpp
@@ -38,6 +38,30 @@ void CPUDevice::init()
pthread_cond_init(&p_events_cond, 0);
pthread_mutex_init(&p_events_mutex, 0);
+ // Get info about the system
+ p_cores = sysconf(_SC_NPROCESSORS_ONLN);
+ p_cpu_mhz = 0.0f;
+
+ std::filebuf fb;
+ fb.open("/proc/cpuinfo", std::ios::in);
+ std::istream is(&fb);
+
+ while (!is.eof())
+ {
+ std::string key, value;
+
+ std::getline(is, key, ':');
+ is.ignore(1);
+ std::getline(is, value);
+
+ if (key.compare(0, 7, "cpu MHz") == 0)
+ {
+ std::istringstream ss(value);
+ ss >> p_cpu_mhz;
+ break;
+ }
+ }
+
// Create worker threads
p_workers = (pthread_t *)std::malloc(numCPUs() * sizeof(pthread_t));
@@ -198,38 +222,14 @@ Event *CPUDevice::getEvent(bool &stop)
return event;
}
-unsigned int CPUDevice::numCPUs()
+unsigned int CPUDevice::numCPUs() const
{
- if (p_cores) return p_cores;
-
- return (p_cores = sysconf(_SC_NPROCESSORS_ONLN));
+ return p_cores;
}
-float CPUDevice::cpuMhz()
+float CPUDevice::cpuMhz() const
{
- std::filebuf fb;
- fb.open("/proc/cpuinfo", std::ios::in);
- std::istream is(&fb);
-
- float cpuMhz = 0.0;
-
- while (!is.eof())
- {
- std::string key, value;
-
- std::getline(is, key, ':');
- is.ignore(1);
- std::getline(is, value);
-
- if (key.compare(0, 7, "cpu MHz") == 0)
- {
- std::istringstream ss(value);
- ss >> cpuMhz;
- break;
- }
- }
-
- return cpuMhz;
+ return p_cpu_mhz;
}
// From inner parentheses to outer ones:
@@ -248,7 +248,7 @@ float CPUDevice::cpuMhz()
cl_int CPUDevice::info(cl_device_info param_name,
size_t param_value_size,
void *param_value,
- size_t *param_value_size_ret)
+ size_t *param_value_size_ret) const
{
void *value = 0;
size_t value_length = 0;
diff --git a/src/core/cpu/device.h b/src/core/cpu/device.h
index 7869476..fd38ff7 100644
--- a/src/core/cpu/device.h
+++ b/src/core/cpu/device.h
@@ -25,7 +25,7 @@ class CPUDevice : public DeviceInterface
cl_int info(cl_device_info param_name,
size_t param_value_size,
void *param_value,
- size_t *param_value_size_ret);
+ size_t *param_value_size_ret) const;
DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs);
DeviceProgram *createDeviceProgram(Program *program);
@@ -38,11 +38,12 @@ class CPUDevice : public DeviceInterface
void pushEvent(Event *event);
Event *getEvent(bool &stop);
- unsigned int numCPUs();
- float cpuMhz();
+ unsigned int numCPUs() const;
+ float cpuMhz() const;
private:
unsigned int p_cores, p_num_events;
+ float p_cpu_mhz;
pthread_t *p_workers;
std::list<Event *> p_events;
diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp
index 3c7c11a..a898522 100644
--- a/src/core/cpu/kernel.cpp
+++ b/src/core/cpu/kernel.cpp
@@ -132,7 +132,7 @@ size_t CPUKernel::guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
// Don't let the loop go up to global_work_size, the overhead would be
// too huge
- if (divisor > cpus * 32)
+ if (divisor > global_work_size || divisor > cpus * 32)
{
divisor = 1; // Not parallel but has no CommandQueue overhead
break;
@@ -411,20 +411,21 @@ CPUKernelWorkGroup *CPUKernelEvent::takeInstance()
CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
CPUKernelEvent *cpu_event,
const size_t *work_group_index)
-: p_kernel(kernel), p_event(event), p_cpu_event(cpu_event)
+: p_kernel(kernel), p_event(event), p_cpu_event(cpu_event),
+ p_work_dim(event->work_dim())
{
// Set index
- std::memcpy(p_index, work_group_index, event->work_dim() * sizeof(size_t));
+ std::memcpy(p_index, work_group_index, p_work_dim * sizeof(size_t));
// Set maxs and global id
- for (unsigned int i=0; i<event->work_dim(); ++i)
+ for (unsigned int i=0; i<p_work_dim; ++i)
{
p_maxs[i] = event->local_work_size(i) - 1; // 0..n-1, not 1..n
// Set global id
- p_global_id[i] = (p_index[i] * p_event->local_work_size(i))
- + p_event->global_work_offset(i);
+ p_global_id[i] = (p_index[i] * event->local_work_size(i))
+ + event->global_work_offset(i);
}
}
@@ -436,7 +437,7 @@ CPUKernelWorkGroup::~CPUKernelWorkGroup()
bool CPUKernelWorkGroup::run()
{
// Set current pos to 0
- std::memset(p_current, 0, p_event->work_dim() * sizeof(size_t));
+ std::memset(p_current, 0, p_work_dim * sizeof(size_t));
// Get the kernel function to call
bool free_after = p_kernel->kernel()->needsLocalAllocation();
@@ -459,7 +460,7 @@ bool CPUKernelWorkGroup::run()
{
// Simply call the "call function", it and the builtins will do the rest
kernel_func_addr();
- } while (!incVec(p_event->work_dim(), p_current, p_maxs));
+ } while (!incVec(p_work_dim, p_current, p_maxs));
// We may have some cleanup to do
if (free_after)
@@ -478,12 +479,12 @@ bool CPUKernelWorkGroup::run()
cl_uint CPUKernelWorkGroup::getWorkDim() const
{
- return p_event->work_dim();
+ return p_work_dim;
}
size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const
{
- if (dimindx > p_event->work_dim())
+ if (dimindx > p_work_dim)
return 0;
return p_global_id[dimindx] + p_current[dimindx];
@@ -491,7 +492,7 @@ size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const
size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const
{
- if (dimindx > p_event->work_dim())
+ if (dimindx >p_work_dim)
return 1;
return p_event->global_work_size(dimindx);
@@ -499,7 +500,7 @@ size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const
size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const
{
- if (dimindx > p_event->work_dim())
+ if (dimindx > p_work_dim)
return 1;
return p_event->local_work_size(dimindx);
@@ -507,7 +508,7 @@ size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const
size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const
{
- if (dimindx > p_event->work_dim())
+ if (dimindx > p_work_dim)
return 0;
return p_current[dimindx];
@@ -515,7 +516,7 @@ size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const
size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const
{
- if (dimindx > p_event->work_dim())
+ if (dimindx > p_work_dim)
return 1;
return (p_event->global_work_size(dimindx) /
@@ -524,7 +525,7 @@ size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const
size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const
{
- if (dimindx > p_event->work_dim())
+ if (dimindx > p_work_dim)
return 0;
return p_index[dimindx];
@@ -532,7 +533,7 @@ size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const
size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const
{
- if (dimindx > p_event->work_dim())
+ if (dimindx > p_work_dim)
return 0;
return p_event->global_work_offset(dimindx);
diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h
index 7de36b1..91d1dfd 100644
--- a/src/core/cpu/kernel.h
+++ b/src/core/cpu/kernel.h
@@ -75,6 +75,7 @@ class CPUKernelWorkGroup
CPUKernel *p_kernel;
CPUKernelEvent *p_cpu_event;
KernelEvent *p_event;
+ cl_uint p_work_dim;
size_t p_index[MAX_WORK_DIMS],
p_current[MAX_WORK_DIMS],
p_maxs[MAX_WORK_DIMS],
diff --git a/src/core/deviceinterface.h b/src/core/deviceinterface.h
index d19e87a..361df2e 100644
--- a/src/core/deviceinterface.h
+++ b/src/core/deviceinterface.h
@@ -31,7 +31,7 @@ class DeviceInterface
virtual cl_int info(cl_device_info param_name,
size_t param_value_size,
void *param_value,
- size_t *param_value_size_ret) = 0;
+ size_t *param_value_size_ret) const = 0;
virtual DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs) = 0;
virtual DeviceProgram *createDeviceProgram(Program *program) = 0;
diff --git a/src/core/events.cpp b/src/core/events.cpp
index e7d8650..7dfdac8 100644
--- a/src/core/events.cpp
+++ b/src/core/events.cpp
@@ -42,29 +42,16 @@ BufferEvent::BufferEvent(CommandQueue *parent,
// Alignment of SubBuffers
DeviceInterface *device = 0;
- cl_uint align;
*errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *),
&device, 0);
- if (errcode_ret != CL_SUCCESS) return;
+ if (errcode_ret != CL_SUCCESS)
+ return;
- if (buffer->type() == MemObject::SubBuffer)
+ if (!isSubBufferAligned(buffer, device))
{
- *errcode_ret = device->info(CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint),
- &align, 0);
-
- if (*errcode_ret != CL_SUCCESS) return;
-
- size_t mask = 0;
-
- for (int i=0; i<align; ++i)
- mask = 1 | (mask << 1);
-
- if (((SubBuffer *)buffer)->offset() | mask)
- {
- *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET;
- return;
- }
+ *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+ return;
}
// Allocate the buffer for the device
@@ -80,6 +67,32 @@ MemObject *BufferEvent::buffer() const
return p_buffer;
}
+// Checks whether a sub-buffer's offset satisfies the device's base-address
+// alignment. A buffer that is not a MemObject::SubBuffer is always reported
+// as aligned. Returns false when the offset is misaligned, or when the device
+// fails to report CL_DEVICE_MEM_BASE_ADDR_ALIGN.
+bool BufferEvent::isSubBufferAligned(const MemObject *buffer,
+                                     const DeviceInterface *device)
+{
+    if (buffer->type() != MemObject::SubBuffer)
+        return true;
+
+    cl_uint align = 0;
+    cl_int rs = device->info(CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(align),
+                             &align, 0);
+
+    if (rs != CL_SUCCESS)
+        return false;
+
+    // Build a mask with the `align` lowest bits set.
+    // NOTE(review): OpenCL defines CL_DEVICE_MEM_BASE_ADDR_ALIGN as an
+    // alignment expressed in bits; this code uses it as a count of low
+    // offset bits that must be zero. Confirm against what device->info()
+    // actually reports.
+    size_t mask = 0;
+
+    for (cl_uint i=0; i<align; ++i)
+        mask = 1 | (mask << 1);
+
+    // Misaligned only if one of the masked low bits of the offset is set.
+    // This must be a bitwise AND: an OR with a non-zero mask is always
+    // non-zero and would reject every sub-buffer.
+    if (((SubBuffer *)buffer)->offset() & mask)
+        return false;
+
+    return true;
+}
+
ReadWriteBufferEvent::ReadWriteBufferEvent(CommandQueue *parent,
MemObject *buffer,
size_t offset,
@@ -502,25 +515,10 @@ KernelEvent::KernelEvent(CommandQueue *parent,
{
const MemObject *buffer = *(const MemObject **)(a.value(0));
- if (buffer->type() == MemObject::SubBuffer)
+ if (!BufferEvent::isSubBufferAligned(buffer, device))
{
- cl_uint align;
- *errcode_ret = device->info(CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint),
- &align, 0);
-
- if (*errcode_ret != CL_SUCCESS)
- return;
-
- size_t mask = 0;
-
- for (int i=0; i<align; ++i)
- mask = 1 | (mask << 1);
-
- if (((SubBuffer *)buffer)->offset() | mask)
- {
- *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET;
- return;
- }
+ *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+ return;
}
}
else if (a.kind() == Kernel::Arg::Image2D)
diff --git a/src/core/events.h b/src/core/events.h
index 8c4f985..6702ff0 100644
--- a/src/core/events.h
+++ b/src/core/events.h
@@ -12,6 +12,7 @@ namespace Coal
class MemObject;
class Kernel;
class DeviceKernel;
+class DeviceInterface;
class BufferEvent : public Event
{
@@ -24,6 +25,9 @@ class BufferEvent : public Event
MemObject *buffer() const;
+ static bool isSubBufferAligned(const MemObject *buffer,
+ const DeviceInterface *device);
+
private:
MemObject *p_buffer;
};