From f00e6edf208fa61a2c5a7c3885eb2c639cf108c5 Mon Sep 17 00:00:00 2001 From: Denis Steckelmacher Date: Mon, 22 Aug 2011 18:08:38 +0200 Subject: Document CPU files. --- src/core/cpu/buffer.h | 19 +++++- src/core/cpu/builtins.cpp | 12 +++- src/core/cpu/builtins.h | 73 +++++++++++++++++++++++ src/core/cpu/device.h | 28 ++++++++- src/core/cpu/kernel.h | 145 ++++++++++++++++++++++++++++++++++++++++++---- src/core/cpu/program.h | 35 ++++++++++- src/core/cpu/worker.h | 12 ++++ 7 files changed, 305 insertions(+), 19 deletions(-) diff --git a/src/core/cpu/buffer.h b/src/core/cpu/buffer.h index 5da39a3..5ca901a 100644 --- a/src/core/cpu/buffer.h +++ b/src/core/cpu/buffer.h @@ -25,6 +25,11 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/** + * \file buffer.h + * \brief CPU buffer + */ + #ifndef __CPU_BUFFER_H__ #define __CPU_BUFFER_H__ @@ -36,15 +41,27 @@ namespace Coal class CPUDevice; class MemObject; +/** + * \brief CPU implementation of \c Coal::MemObject + * + * This class is responsible for the actual allocation of buffer objects, using + * \c malloc() or by reusing a given \c host_ptr. + */ class CPUBuffer : public DeviceBuffer { public: + /** + * \brief Constructor + * \param device Device for which the buffer is allocated + * \param buffer \c Coal::MemObject holding information about the buffer + * \param rs return code (\c CL_SUCCESS if all is good) + */ CPUBuffer(CPUDevice *device, MemObject *buffer, cl_int *rs); ~CPUBuffer(); bool allocate(); DeviceInterface *device() const; - void *data() const; + void *data() const; /*!< \brief Pointer to the buffer's data */ void *nativeGlobalPointer() const; bool allocated() const; diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp index 217e55d..98a6e65 100644 --- a/src/core/cpu/builtins.cpp +++ b/src/core/cpu/builtins.cpp @@ -28,6 +28,12 @@ /** * \file cpu/builtins.cpp * \brief Native OpenCL C built-in functions + * + * All these built-ins are directly called by kernels.
When the LLVM JIT + * sees a function name it doesn't know, it calls \c getBuiltin() with this + * name as parameter. This function then returns the address of an actual + * function implementation, that finally gets called by the kernel when + * it is run. */ #include "builtins.h" @@ -64,9 +70,9 @@ unsigned char *imageData(unsigned char *base, size_t x, size_t y, size_t z, /* * TLS-related functions */ -__thread Coal::CPUKernelWorkGroup *g_work_group; -__thread void *work_items_data; -__thread size_t work_items_size; +__thread Coal::CPUKernelWorkGroup *g_work_group; /*!< \brief \c Coal::CPUKernelWorkGroup currently running on this thread */ +__thread void *work_items_data; /*!< \brief Space allocated for work-items stacks, see \ref barrier */ +__thread size_t work_items_size; /*!< \brief Size of \c work_items_data, see \ref barrier */ void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current) { diff --git a/src/core/cpu/builtins.h b/src/core/cpu/builtins.h index 9328cf3..71ffea3 100644 --- a/src/core/cpu/builtins.h +++ b/src/core/cpu/builtins.h @@ -25,6 +25,10 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/** + * \file builtins.h + * \brief CPU built-in functions + */ #ifndef __BUILTINS_H__ #define __BUILTINS_H__ @@ -34,11 +38,65 @@ namespace Coal { class CPUKernelWorkGroup; } +/** + * \brief Set the current kernel work-group of this thread + * \param current \c Coal::CPUKernelWorkGroup to be set in \c g_work_group. 
+ */ void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current); + +/** + * \brief Return the address of a built-in function given its name + * \param name name of the built-in whose address is requested + */ void *getBuiltin(const std::string &name); + +/** + * \brief Work-item stacks + * \see \ref barrier + * \param size size of the allocated space for stacks + * \return address of the allocated space for stacks + */ void *getWorkItemsData(size_t &size); + +/** + * \brief Set work-item stacks + * \see \ref barrier + * \param ptr address of allocated space for stacks + * \param size size of the allocated space for stacks + */ void setWorkItemsData(void *ptr, size_t size); +/** + * \brief Increment an n-component vector given a maximum value + * + * This function is used to increment a vector, given a set of maximum values + * that each of its elements can reach before the next one is incremented. + * + * For example, if \p dims is \c 3, \p vec starts at {0, 0, 0} and + * \p maxs is {2, 3, 1}, repeatedly calling this function with the + * same vector will produce the following results : + * + * \code + * {0, 0, 1} + * {0, 1, 0} + * {0, 1, 1} + * {0, 2, 0} + * {0, 2, 1} + * {0, 3, 0} + * {0, 3, 1} + * {1, 0, 0} + * ... + * \endcode + * + * Until \p vec reaches {2, 3, 1}. + * + * \param dims number of elements in the vectors + * \param vec vector whose elements will be incremented + * \param maxs vector containing a maximum value above which each corresponding + * element of \p vec cannot go. + * \return false if the increment was ok, true if \p vec was already at its + * maximum value and couldn't be further incremented. + */ template bool incVec(unsigned long dims, T *vec, T *maxs) { @@ -63,6 +121,21 @@ bool incVec(unsigned long dims, T *vec, T *maxs) return overflow; } +/** + * \brief Address of a pixel in an image + * + * This function is heavily used when Clover needs to address a pixel or a byte + * in a rectangular or three-dimensional image or buffer.
+ * + * \param base address of the first pixel in the image (address of the image itself) + * \param x X coordinate, must be strictly less than \c width + * \param y Y coordinate, must be strictly less than \c height + * \param z Z coordinate, must be strictly less than \c depth (1 for 2D arrays) + * \param row_pitch size in bytes of a row of pixels in the image + * \param slice_pitch size in bytes of a slice in a 3D array + * \param bytes_per_pixel bytes per pixel (1 for simple buffers), used when + * coordinates are in pixels and not in bytes. + */ unsigned char *imageData(unsigned char *base, size_t x, size_t y, size_t z, size_t row_pitch, size_t slice_pitch, unsigned int bytes_per_pixel); diff --git a/src/core/cpu/device.h b/src/core/cpu/device.h index 3a15b2b..36a6bb3 100644 --- a/src/core/cpu/device.h +++ b/src/core/cpu/device.h @@ -25,6 +25,11 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/** + * \file cpu/device.h + * \brief CPU device + */ + #ifndef __CPU_DEVICE_H__ #define __CPU_DEVICE_H__ @@ -41,12 +46,31 @@ class Event; class Program; class Kernel; +/** + * \brief CPU device + * + * This class is the base of all the CPU-accelerated OpenCL processing. It + * creates and manages subclasses such as \c Coal::DeviceBuffer, + * \c Coal::DeviceProgram and \c Coal::DeviceKernel. + * + * This class and the aforementioned ones work together to compile and run + * kernels using the LLVM JIT, manage buffers, provide built-in functions + * and do all of this in a multithreaded fashion using worker threads. + * + * \see \ref events + */ class CPUDevice : public DeviceInterface { public: CPUDevice(); ~CPUDevice(); + /** + * \brief Initialize the CPU device + * + * This function creates the worker threads and gets information about + * the host system for the \c numCPUs() and \c cpuMhz() functions.
+ */ void init(); cl_int info(cl_device_info param_name, @@ -65,8 +89,8 @@ class CPUDevice : public DeviceInterface void pushEvent(Event *event); Event *getEvent(bool &stop); - unsigned int numCPUs() const; - float cpuMhz() const; + unsigned int numCPUs() const; /*!< \brief Number of logical CPU cores on the system */ + float cpuMhz() const; /*!< \brief Speed of the CPU in Mhz */ private: unsigned int p_cores, p_num_events; diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h index 70e348c..c965741 100644 --- a/src/core/cpu/kernel.h +++ b/src/core/cpu/kernel.h @@ -25,6 +25,11 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/** + * \file cpu/kernel.h + * \brief CPU kernel + */ + #ifndef __CPU_KERNEL_H__ #define __CPU_KERNEL_H__ @@ -53,9 +58,26 @@ class KernelEvent; class Image2D; class Image3D; +/** + * \brief CPU kernel + * + * This class holds passive information about a kernel (\c Coal::Kernel object + * and device on which it is run) and provides the \c callFunction() function. + * + * This function is described at the end of \ref llvm . 
+ * + * \see Coal::CPUKernelWorkGroup + */ class CPUKernel : public DeviceKernel { public: + /** + * \brief Constructor + * \param device device on which the kernel will be run + * \param kernel \c Coal::Kernel object holding information about this + * kernel + * \param function \c llvm::Function to run + */ CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function); ~CPUKernel(); @@ -66,11 +88,45 @@ class CPUKernel : public DeviceKernel size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim, size_t global_work_size) const; - Kernel *kernel() const; - CPUDevice *device() const; - - llvm::Function *function() const; - llvm::Function *callFunction(); + Kernel *kernel() const; /*!< \brief \c Coal::Kernel object this kernel will run */ + CPUDevice *device() const; /*!< \brief device on which the kernel will be run */ + + llvm::Function *function() const; /*!< \brief \c llvm::Function representing the kernel but not to be run */ + llvm::Function *callFunction(); /*!< \brief stub function used to run the kernel, see \ref llvm */ + + /** + * \brief Calculate where to place a value in an array + * + * This function is used to calculate where to place a value in an + * array given its size, properly aligning it. 
+ * + * This function is called repeatedly to obtain the aligned position of + * each value that must be placed in the array. + * + * \code + * size_t array_len = 0, array_offset = 0; + * void *array; + * + * // First, get the array size given alignment constraints + * typeOffset(array_len, sizeof(int)); + * typeOffset(array_len, sizeof(float)); + * typeOffset(array_len, sizeof(void *)); + * + * // Then, allocate memory + * array = malloc(array_len); + * + * // Finally, place the arguments + * *(int *)((char *)array + typeOffset(array_offset, sizeof(int))) = 1337; + * *(float *)((char *)array + typeOffset(array_offset, sizeof(float))) = 3.1415f; + * *(void **)((char *)array + typeOffset(array_offset, sizeof(void *))) = array; + * \endcode + * + * \param offset offset at which the value will be placed. This variable + * gets incremented by \p type_len + padding. + * \param type_len size in bytes of the value that will be stored + * \return offset at which the value will be stored (equal to \p offset + * before it was incremented). + */ static size_t typeOffset(size_t &offset, size_t type_len); private: @@ -82,18 +138,64 @@ class CPUKernel : public DeviceKernel class CPUKernelEvent; +/** + * \brief CPU kernel work-group + * + * This class represents a batch of work-items that will be run. It is the one + * that actually runs the kernel for each of its work-items.
+ * + * \see \ref llvm + * \nosubgrouping + */ class CPUKernelWorkGroup { public: + /** + * \brief Constructor + * \param kernel kernel to run + * \param event event containing information about the kernel run + * \param cpu_event CPU-specific information and cache about \p event + * \param work_group_index index of this work-group in the kernel + */ CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event, CPUKernelEvent *cpu_event, const size_t *work_group_index); ~CPUKernelWorkGroup(); + /** + * \brief Build a structure of arguments + * + * As C doesn't support calling functions with variable arguments + * unknown at compile time, this function builds the list of + * arguments in memory. This array will then be passed to an LLVM stub + * function reading it and passing its values to the actual kernel. + * + * \see \ref llvm + * \param locals_to_free if this kernel takes \c __local arguments, they + * must be \c malloc()'ed for every work-group. + * They are placed in this vector to be + * \c free()'ed at the end of \c run(). + * \return address of a memory location containing the arguments + */ void *callArgs(std::vector &locals_to_free); + + /** + * \brief Run the work-group + * + * This function is the core of CPU-acceleration. It runs the work-items + * of this work-group given the correct arguments.
+ * + * \see \ref llvm + * \see \ref barrier + * \see callArgs() + * \return true if success, false in case of an error + */ bool run(); - // Native functions + /** + * \name Native implementation of built-in OpenCL C functions + * @{ + */ size_t getGlobalId(cl_uint dimindx) const; cl_uint getWorkDim() const; size_t getGlobalSize(cl_uint dimindx) const; @@ -124,7 +226,13 @@ class CPUKernelWorkGroup uint32_t sampler) const; void readImage(uint32_t *result, Image2D *image, float x, float y, float z, uint32_t sampler) const; + /** + * @} + */ + /** + * \brief Function called when a built-in name cannot be found + */ void builtinNotFound(const std::string &name) const; private: @@ -174,20 +282,33 @@ class CPUKernelWorkGroup bool p_had_barrier; }; +/** + * \brief CPU-specific information about a kernel event + * + * This class put in a \c Coal::KernelEvent device-data field + * (see \c Coal::Event::setDeviceData()) is responsible for dispatching the + * \c Coal::CPUKernelWorkGroup objects between the CPU worker threads. + */ class CPUKernelEvent { public: + /** + * \brief Constructor + * \param device device running the kernel + * \param event \c Coal::KernelEvent holding device-agnostic data + * about the event + */ CPUKernelEvent(CPUDevice *device, KernelEvent *event); ~CPUKernelEvent(); - bool reserve(); /*!< The next Work Group that will execute will be the last. Locks the event */ - bool finished(); /*!< All the work groups have finished */ - CPUKernelWorkGroup *takeInstance(); /*!< Must be called exactly one time after reserve(). Unlocks the event */ + bool reserve(); /*!< \brief The next Work Group that will execute will be the last. Locks the event */ + bool finished(); /*!< \brief All the work groups have finished */ + CPUKernelWorkGroup *takeInstance(); /*!< \brief Must be called exactly one time after reserve(). 
Unlocks the event */ - void *kernelArgs() const; - void cacheKernelArgs(void *args); + void *kernelArgs() const; /*!< \brief Return the cached kernel arguments */ + void cacheKernelArgs(void *args); /*!< \brief Cache pre-built kernel arguments */ - void workGroupFinished(); + void workGroupFinished(); /*!< \brief A work-group has just finished */ private: CPUDevice *p_device; diff --git a/src/core/cpu/program.h b/src/core/cpu/program.h index 34668f2..350d248 100644 --- a/src/core/cpu/program.h +++ b/src/core/cpu/program.h @@ -25,6 +25,11 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/** + * \file cpu/program.h + * \brief CPU program + */ + #ifndef __CPU_PROGRAM_H__ #define __CPU_PROGRAM_H__ @@ -42,9 +47,23 @@ namespace Coal class CPUDevice; class Program; +/** + * \brief CPU program + * + * This class implements the \c Coal::DeviceProgram interface for CPU + * acceleration. + * + * Its main purpose is to initialize a \c llvm::JIT object to run LLVM bitcode, + * in \c initJIT(). + */ class CPUProgram : public DeviceProgram { public: + /** + * \brief Constructor + * \param device CPU device to which this program is attached + * \param program \c Coal::Program that will be run + */ CPUProgram(CPUDevice *device, Program *program); ~CPUProgram(); @@ -52,8 +71,22 @@ class CPUProgram : public DeviceProgram void createOptimizationPasses(llvm::PassManager *manager, bool optimize); bool build(llvm::Module *module); + /** + * \brief Initialize an LLVM JIT + * + * This function creates a \c llvm::JIT object to run this program on + * the CPU. A few implementation details : + * + * - The JIT is set not to resolve unknown symbols using \c dlsym(). + * This way, a malicious kernel cannot execute arbitrary code on + * the host by declaring \c libc functions and calling them. + * - All the unknown function names are passed to \c getBuiltin() to + * get native built-in implementations.
+ * + * \return true if success, false otherwise + */ bool initJIT(); - llvm::ExecutionEngine *jit() const; + llvm::ExecutionEngine *jit() const; /*!< \brief Current LLVM execution engine */ private: CPUDevice *p_device; diff --git a/src/core/cpu/worker.h b/src/core/cpu/worker.h index ec462f7..66130c6 100644 --- a/src/core/cpu/worker.h +++ b/src/core/cpu/worker.h @@ -25,9 +25,21 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/** + * \file worker.h + * \brief Function run by the CPU worker threads + */ + #ifndef __CPU_WORKER_H__ #define __CPU_WORKER_H__ +/** + * \brief Main loop of the CPU worker threads + * + * This function is run by as many threads as there are CPU cores on the host + * system. As explained by \ref events , this function waits until there + * are \c Coal::Event objects to process and handles them. + */ void *worker(void *data); #endif -- cgit v1.2.3