/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
 * License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Rong Yang
*/
//#include "cl_image.h"
#include "cl_enqueue.h"
#include "cl_driver.h"
#include "cl_event.h"
#include "cl_command_queue.h"
#include "cl_utils.h"
#include "cl_alloc.h"
#include "cl_device_enqueue.h"
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <pthread.h>
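/* Handler for EnqueueReadBuffer: once the command reaches CL_COMPLETE, copy
 * the buffer contents out to the caller's host pointer. */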
static cl_int
cl_enqueue_read_buffer(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
cl_mem mem = data->mem_obj;
if (status != CL_COMPLETE)
return err;
assert(mem->type == CL_MEM_BUFFER_TYPE ||
mem->type == CL_MEM_SUBBUFFER_TYPE);
struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
//cl_buffer_get_subdata is sometimes extremely slow in the linux kernel on SKL
//and CHV, and the slowdown strikes randomly. Temporarily disable it and read
//through map/copy/unmap instead; re-enable once the root cause is found.
if (0 && !mem->is_userptr) {
if (cl_buffer_get_subdata(mem->bo, data->offset + buffer->sub_offset,
data->size, data->ptr) != 0)
err = CL_MAP_FAILURE;
} else {
void *src_ptr = cl_mem_map_auto(mem, 0);
if (src_ptr == NULL)
err = CL_MAP_FAILURE;
else {
      //The application sometimes issues a buffer read instead of a map even
      //when userptr is enabled; src and dst then alias and the memcpy is unnecessary.
if (data->ptr != (char *)src_ptr + data->offset + buffer->sub_offset)
memcpy(data->ptr, (char *)src_ptr + data->offset + buffer->sub_offset, data->size);
cl_mem_unmap_auto(mem);
}
}
return err;
}
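/* Handler for EnqueueReadBufferRect: copy a 3D region out of the buffer,
 * honoring separate buffer and host row/slice pitches; collapses to a single
 * memcpy when both layouts are tightly packed. */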
static cl_int
cl_enqueue_read_buffer_rect(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
void *src_ptr;
void *dst_ptr;
const size_t *origin = data->origin;
const size_t *host_origin = data->host_origin;
const size_t *region = data->region;
cl_mem mem = data->mem_obj;
if (status != CL_COMPLETE)
return err;
assert(mem->type == CL_MEM_BUFFER_TYPE ||
mem->type == CL_MEM_SUBBUFFER_TYPE);
struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
if (!(src_ptr = cl_mem_map_auto(mem, 0))) {
err = CL_MAP_FAILURE;
goto error;
}
size_t offset = origin[0] + data->row_pitch * origin[1] + data->slice_pitch * origin[2];
src_ptr = (char *)src_ptr + offset + buffer->sub_offset;
offset = host_origin[0] + data->host_row_pitch * host_origin[1] + data->host_slice_pitch * host_origin[2];
dst_ptr = (char *)data->ptr + offset;
if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
(region[2] == 1 || (data->slice_pitch == region[0] * region[1] && data->slice_pitch == data->host_slice_pitch))) {
memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch * region[1] : data->slice_pitch * region[2]);
} else {
cl_uint y, z;
for (z = 0; z < region[2]; z++) {
const char *src = src_ptr;
char *dst = dst_ptr;
for (y = 0; y < region[1]; y++) {
memcpy(dst, src, region[0]);
src += data->row_pitch;
dst += data->host_row_pitch;
}
src_ptr = (char *)src_ptr + data->slice_pitch;
dst_ptr = (char *)dst_ptr + data->host_slice_pitch;
}
}
err = cl_mem_unmap_auto(mem);
error:
return err;
}
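/* Handler for EnqueueWriteBuffer: copy host data into the buffer, through a
 * mapping for userptr buffers or via cl_buffer_subdata otherwise. */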
static cl_int
cl_enqueue_write_buffer(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
cl_mem mem = data->mem_obj;
assert(mem->type == CL_MEM_BUFFER_TYPE ||
mem->type == CL_MEM_SUBBUFFER_TYPE);
struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
if (status != CL_COMPLETE)
return err;
if (mem->is_userptr) {
void *dst_ptr = cl_mem_map_auto(mem, 1);
if (dst_ptr == NULL)
err = CL_MAP_FAILURE;
else {
memcpy((char *)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
cl_mem_unmap_auto(mem);
}
} else {
if (cl_buffer_subdata(mem->bo, data->offset + buffer->sub_offset,
data->size, data->const_ptr) != 0)
err = CL_MAP_FAILURE;
}
return err;
}
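/* Handler for EnqueueWriteBufferRect: the write-side counterpart of the
 * rectangular read above, with source and destination pitches swapped. */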
static cl_int
cl_enqueue_write_buffer_rect(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
void *src_ptr;
void *dst_ptr;
const size_t *origin = data->origin;
const size_t *host_origin = data->host_origin;
const size_t *region = data->region;
cl_mem mem = data->mem_obj;
assert(mem->type == CL_MEM_BUFFER_TYPE ||
mem->type == CL_MEM_SUBBUFFER_TYPE);
struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
if (status != CL_COMPLETE)
return err;
if (!(dst_ptr = cl_mem_map_auto(mem, 1))) {
err = CL_MAP_FAILURE;
goto error;
}
size_t offset = origin[0] + data->row_pitch * origin[1] + data->slice_pitch * origin[2];
dst_ptr = (char *)dst_ptr + offset + buffer->sub_offset;
offset = host_origin[0] + data->host_row_pitch * host_origin[1] + data->host_slice_pitch * host_origin[2];
src_ptr = (char *)data->const_ptr + offset;
if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
(region[2] == 1 || (data->slice_pitch == region[0] * region[1] && data->slice_pitch == data->host_slice_pitch))) {
memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch * region[1] : data->slice_pitch * region[2]);
} else {
cl_uint y, z;
for (z = 0; z < region[2]; z++) {
const char *src = src_ptr;
char *dst = dst_ptr;
for (y = 0; y < region[1]; y++) {
memcpy(dst, src, region[0]);
src += data->host_row_pitch;
dst += data->row_pitch;
}
src_ptr = (char *)src_ptr + data->host_slice_pitch;
dst_ptr = (char *)dst_ptr + data->slice_pitch;
}
}
err = cl_mem_unmap_auto(mem);
error:
return err;
}
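/* Handler for EnqueueReadImage: copy an image region to host memory row by
 * row, or with one memcpy when the region covers whole, tightly packed rows
 * (and slices, for 3D regions). */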
static cl_int
cl_enqueue_read_image(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
void *src_ptr;
cl_mem mem = data->mem_obj;
CHECK_IMAGE(mem, image);
const size_t *origin = data->origin;
const size_t *region = data->region;
if (status != CL_COMPLETE)
return err;
if (!(src_ptr = cl_mem_map_auto(mem, 0))) {
err = CL_MAP_FAILURE;
goto error;
}
size_t offset = image->offset + image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
src_ptr = (char*)src_ptr + offset;
if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
(region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch))) {
memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch * region[1] : data->slice_pitch * region[2]);
} else {
cl_uint y, z;
for (z = 0; z < region[2]; z++) {
const char *src = src_ptr;
char *dst = data->ptr;
for (y = 0; y < region[1]; y++) {
memcpy(dst, src, image->bpp * region[0]);
src += image->row_pitch;
dst += data->row_pitch;
}
src_ptr = (char *)src_ptr + image->slice_pitch;
data->ptr = (char *)data->ptr + data->slice_pitch;
}
}
err = cl_mem_unmap_auto(mem);
error:
return err;
}
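/* Handler for EnqueueWriteImage: map the image and let
 * cl_mem_copy_image_region move the host data into place. */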
static cl_int
cl_enqueue_write_image(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
void *dst_ptr;
cl_mem mem = data->mem_obj;
CHECK_IMAGE(mem, image);
if (status != CL_COMPLETE)
return err;
if (!(dst_ptr = cl_mem_map_auto(mem, 1))) {
err = CL_MAP_FAILURE;
goto error;
}
cl_mem_copy_image_region(data->origin, data->region,
dst_ptr + image->offset,
image->row_pitch, image->slice_pitch,
data->const_ptr, data->row_pitch,
data->slice_pitch, image, CL_TRUE, CL_FALSE);
err = cl_mem_unmap_auto(mem);
error:
return err;
}
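/* Handler for EnqueueMapBuffer. On CL_SUBMITTED, hand out a pointer early
 * (host_ptr for userptr buffers, an unsynchronized GTT map otherwise); on
 * CL_COMPLETE, establish the real mapping and, for CL_MEM_USE_HOST_PTR
 * buffers, sync the mapped data back into host_ptr. */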
static cl_int
cl_enqueue_map_buffer(enqueue_data *data, cl_int status)
{
void *ptr = NULL;
cl_int err = CL_SUCCESS;
cl_mem mem = data->mem_obj;
assert(mem->type == CL_MEM_BUFFER_TYPE ||
mem->type == CL_MEM_SUBBUFFER_TYPE ||
mem->type == CL_MEM_SVM_TYPE);
struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer *)mem;
if (status == CL_SUBMITTED) {
if (buffer->base.is_userptr) {
ptr = buffer->base.host_ptr;
} else {
if ((ptr = cl_mem_map_gtt_unsync(&buffer->base)) == NULL) {
err = CL_MAP_FAILURE;
return err;
}
}
data->ptr = ptr;
} else if (status == CL_COMPLETE) {
if (mem->is_userptr)
ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
else {
if (data->unsync_map == 1)
        //clEnqueueMapBuffer took an unsynchronized GTT map, so force map_gtt here too
ptr = cl_mem_map_gtt(mem);
else
ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
}
if (ptr == NULL) {
err = CL_MAP_FAILURE;
return err;
}
data->ptr = ptr;
if ((mem->flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) {
assert(mem->host_ptr);
ptr = (char *)ptr + data->offset + buffer->sub_offset;
memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size);
}
}
return err;
}
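/* Handler for EnqueueMapImage: same two-phase scheme as the buffer map, plus
 * pitch selection for 1D arrays and an optional sync out to host_ptr. */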
static cl_int
cl_enqueue_map_image(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
cl_mem mem = data->mem_obj;
void *ptr = NULL;
size_t row_pitch = 0;
CHECK_IMAGE(mem, image);
if (status == CL_SUBMITTED) {
if ((ptr = cl_mem_map_gtt_unsync(mem)) == NULL) {
err = CL_MAP_FAILURE;
goto error;
}
data->ptr = ptr;
} else if (status == CL_COMPLETE) {
if (data->unsync_map == 1)
      //clEnqueueMapImage took an unsynchronized GTT map, so force map_gtt here too
ptr = cl_mem_map_gtt(mem);
else
ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
if (ptr == NULL) {
err = CL_MAP_FAILURE;
goto error;
}
data->ptr = (char*)ptr + image->offset;
if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
row_pitch = image->slice_pitch;
else
row_pitch = image->row_pitch;
if(mem->flags & CL_MEM_USE_HOST_PTR) {
assert(mem->host_ptr);
if (!mem->is_userptr)
        //cl_mem_copy_image_region applies the src and dst offsets itself
cl_mem_copy_image_region(data->origin, data->region,
mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE);
}
}
error:
return err;
}
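/* Handler for EnqueueUnmapMemObject: find and clear the bookkeeping entry for
 * the mapped pointer, sync CL_MEM_USE_HOST_PTR data back into the device
 * mapping, unmap, and shrink the mapped-pointer array when more than half of
 * it is unused. */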
static cl_int
cl_enqueue_unmap_mem_object(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
int i, j;
size_t mapped_size = 0;
size_t origin[3], region[3];
void *v_ptr = NULL;
void *mapped_ptr = data->ptr;
cl_mem memobj = data->mem_obj;
size_t row_pitch = 0;
if (status != CL_COMPLETE)
return err;
assert(memobj->mapped_ptr_sz >= memobj->map_ref);
INVALID_VALUE_IF(!mapped_ptr);
for (i = 0; i < memobj->mapped_ptr_sz; i++) {
if (memobj->mapped_ptr[i].ptr == mapped_ptr) {
memobj->mapped_ptr[i].ptr = NULL;
mapped_size = memobj->mapped_ptr[i].size;
v_ptr = memobj->mapped_ptr[i].v_ptr;
for (j = 0; j < 3; j++) {
region[j] = memobj->mapped_ptr[i].region[j];
origin[j] = memobj->mapped_ptr[i].origin[j];
memobj->mapped_ptr[i].region[j] = 0;
memobj->mapped_ptr[i].origin[j] = 0;
}
memobj->mapped_ptr[i].size = 0;
memobj->mapped_ptr[i].v_ptr = NULL;
memobj->map_ref--;
break;
}
}
  /* No mapped address matched: invalid value. */
INVALID_VALUE_IF(i == memobj->mapped_ptr_sz);
if (memobj->flags & CL_MEM_USE_HOST_PTR) {
if (memobj->type == CL_MEM_BUFFER_TYPE ||
memobj->type == CL_MEM_SUBBUFFER_TYPE ||
memobj->type == CL_MEM_SVM_TYPE) {
assert(mapped_ptr >= memobj->host_ptr &&
mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
/* Sync the data. */
if (!memobj->is_userptr)
memcpy(v_ptr, mapped_ptr, mapped_size);
} else {
CHECK_IMAGE(memobj, image);
if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
row_pitch = image->slice_pitch;
else
row_pitch = image->row_pitch;
if (!memobj->is_userptr)
        //v_ptr already includes the offset; host_ptr does not.
cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch,
memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch,
image, CL_FALSE, CL_TRUE);
}
} else {
assert(v_ptr == mapped_ptr);
}
cl_mem_unmap_auto(memobj);
/* shrink the mapped slot. */
if (memobj->mapped_ptr_sz / 2 > memobj->map_ref) {
int j = 0;
cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz / 2));
if (!new_ptr) {
/* Just do nothing. */
goto error;
}
memset(new_ptr, 0, (memobj->mapped_ptr_sz / 2) * sizeof(cl_mapped_ptr));
for (i = 0; i < memobj->mapped_ptr_sz; i++) {
if (memobj->mapped_ptr[i].ptr) {
new_ptr[j] = memobj->mapped_ptr[i];
j++;
assert(j < memobj->mapped_ptr_sz / 2);
}
}
memobj->mapped_ptr_sz = memobj->mapped_ptr_sz / 2;
free(memobj->mapped_ptr);
memobj->mapped_ptr = new_ptr;
}
error:
return err;
}
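/* Handler for EnqueueNativeKernel: map every referenced buffer, patch the
 * copied argument block with the mapped addresses, run the user function,
 * then unmap everything. */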
static cl_int
cl_enqueue_native_kernel(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
cl_uint num_mem_objects = (cl_uint)data->offset;
const cl_mem *mem_list = data->mem_list;
const void **args_mem_loc = (const void **)data->const_ptr;
cl_uint i;
if (status != CL_COMPLETE)
return err;
for (i = 0; i < num_mem_objects; ++i) {
const cl_mem buffer = mem_list[i];
CHECK_MEM(buffer);
*((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer, 0);
}
data->user_func(data->ptr);
for (i = 0; i < num_mem_objects; ++i) {
cl_mem_unmap_auto(mem_list[i]);
}
error:
return err;
}
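/* Handler for EnqueueSVMFree: run the user's free callback if one was
 * registered, otherwise free each SVM pointer against the queue's context. */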
cl_int cl_enqueue_svm_free(enqueue_data *data, cl_int status) {
int i;
void **pointers = data->pointers;
uint num_svm_ptrs = data->size;
cl_int err = CL_SUCCESS;
if (status != CL_COMPLETE)
return err;
if(data->free_func) {
data->free_func(data->queue, num_svm_ptrs, pointers, data->ptr);
} else {
    for(i=0; i<num_svm_ptrs; i++)
      cl_mem_svm_delete(data->queue->ctx, pointers[i]);
}
free(pointers);
return CL_SUCCESS;
}
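/* Handler for EnqueueSVMMemCopy: translate either SVM pointer to its backing
 * cl_mem mapping when one exists, then copy byte by byte. */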
cl_int cl_enqueue_svm_mem_copy(enqueue_data *data, cl_int status) {
cl_mem mem;
size_t size = data->size;
const char* src_ptr = (const char *)data->const_ptr;
char *dst_ptr = (char *)data->ptr;
cl_int err = CL_SUCCESS;
int i;
if (status != CL_COMPLETE)
return err;
if((mem = cl_context_get_svm_from_ptr(data->queue->ctx, data->ptr)) != NULL) {
dst_ptr = (char *)cl_mem_map_auto(mem, 1);
}
if((mem = cl_context_get_svm_from_ptr(data->queue->ctx, data->const_ptr)) != NULL) {
src_ptr = (const char *)cl_mem_map_auto(mem, 0);
}
  for(i=0; i<size; i++) {
    dst_ptr[i] = src_ptr[i];
  }
  return CL_SUCCESS;
}

/* Handler for EnqueueSVMMemFill: map the destination through its backing
 * cl_mem when one exists, then tile the fill pattern across it. */
cl_int cl_enqueue_svm_mem_fill(enqueue_data *data, cl_int status) {
  cl_mem mem;
  size_t size = data->size;
size_t pattern_size = data->pattern_size;
const char* pattern = (const char *)data->const_ptr;
char *ptr = (char *)data->ptr;
cl_int err = CL_SUCCESS;
int i, j;
if (status != CL_COMPLETE)
return err;
if((mem = cl_context_get_svm_from_ptr(data->queue->ctx, data->ptr)) != NULL) {
ptr = (char *)cl_mem_map_auto(mem, 1);
}
  for(i=0; i<size; ) {
    for(j=0; j<pattern_size; j++) {
      ptr[i] = pattern[j];
      i++;
    }
  }
  return CL_SUCCESS;
}

/* Handler for the kernel-style commands (copies, fills, NDRange): flush the
 * gpgpu state on submit; on completion, sync on the batch buffer. */
static cl_int
cl_enqueue_ndrange(enqueue_data *data, cl_int status)
{
  cl_int err = CL_SUCCESS;
  if (status == CL_SUBMITTED) {
    err = cl_command_queue_flush_gpgpu(data->gpgpu);
    //If this is the last NDRange of a clEnqueue* API call,
    //parse the device-side enqueue result.
if (data->mid_event_of_enq == 0) {
assert(data->queue);
cl_device_enqueue_parse_result(data->queue, data->gpgpu);
}
} else if (status == CL_COMPLETE) {
void *batch_buf = cl_gpgpu_ref_batch_buf(data->gpgpu);
cl_gpgpu_sync(batch_buf);
cl_gpgpu_unref_batch_buf(batch_buf);
}
return err;
}
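/* Markers and barriers carry no work of their own; just report completion. */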
static cl_int
cl_enqueue_marker_or_barrier(enqueue_data *data, cl_int status)
{
return CL_COMPLETE;
}
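/* Release whatever an enqueue_data owns once it is no longer needed: the
 * gpgpu state for kernel-style commands, or the copies made for a native
 * kernel's argument block and memory list. */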
LOCAL void
cl_enqueue_delete(enqueue_data *data)
{
if (data == NULL)
return;
if (data->type == EnqueueCopyBufferRect ||
data->type == EnqueueCopyBuffer ||
data->type == EnqueueCopyImage ||
data->type == EnqueueCopyBufferToImage ||
data->type == EnqueueCopyImageToBuffer ||
data->type == EnqueueNDRangeKernel ||
data->type == EnqueueFillBuffer ||
data->type == EnqueueFillImage) {
if (data->gpgpu) {
cl_gpgpu_delete(data->gpgpu);
data->gpgpu = NULL;
}
return;
}
if (data->type == EnqueueNativeKernel) {
if (data->mem_list) {
cl_free((void*)data->mem_list);
data->mem_list = NULL;
}
if (data->ptr) {
cl_free((void*)data->ptr);
data->ptr = NULL;
}
if (data->const_ptr) {
cl_free((void*)data->const_ptr);
data->const_ptr = NULL;
}
}
}
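/* Central dispatcher: run the handler matching the command type for the given
 * execution status. Illustrative call sequence only (the real call sites live
 * in the event code, not shown here):
 *
 *   cl_enqueue_handle(&data, CL_SUBMITTED);  // e.g. hand out an unsync map
 *   cl_enqueue_handle(&data, CL_COMPLETE);   // perform the actual transfer
 */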
LOCAL cl_int
cl_enqueue_handle(enqueue_data *data, cl_int status)
{
switch (data->type) {
case EnqueueReturnSuccesss:
return CL_SUCCESS;
case EnqueueReadBuffer:
return cl_enqueue_read_buffer(data, status);
case EnqueueReadBufferRect:
return cl_enqueue_read_buffer_rect(data, status);
case EnqueueWriteBuffer:
return cl_enqueue_write_buffer(data, status);
case EnqueueWriteBufferRect:
return cl_enqueue_write_buffer_rect(data, status);
case EnqueueReadImage:
return cl_enqueue_read_image(data, status);
case EnqueueWriteImage:
return cl_enqueue_write_image(data, status);
case EnqueueMapBuffer:
return cl_enqueue_map_buffer(data, status);
case EnqueueMapImage:
return cl_enqueue_map_image(data, status);
case EnqueueUnmapMemObject:
return cl_enqueue_unmap_mem_object(data, status);
case EnqueueSVMFree:
return cl_enqueue_svm_free(data, status);
case EnqueueSVMMemCopy:
return cl_enqueue_svm_mem_copy(data, status);
case EnqueueSVMMemFill:
return cl_enqueue_svm_mem_fill(data, status);
case EnqueueMarker:
case EnqueueBarrier:
return cl_enqueue_marker_or_barrier(data, status);
case EnqueueCopyBufferRect:
case EnqueueCopyBuffer:
case EnqueueCopyImage:
case EnqueueCopyBufferToImage:
case EnqueueCopyImageToBuffer:
case EnqueueNDRangeKernel:
case EnqueueFillBuffer:
case EnqueueFillImage:
//return cl_event_flush(event);
return cl_enqueue_ndrange(data, status);
case EnqueueNativeKernel:
return cl_enqueue_native_kernel(data, status);
case EnqueueMigrateMemObj:
default:
return CL_SUCCESS;
}
}