/************************************************************************** * * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sub license, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * **************************************************************************/ #include #include #include #include #include #include #include #include "drm.h" #include "drmtest.h" #include "intel_batchbuffer.h" #include "intel_bufmgr.h" #include "intel_bufops.h" #include "intel_chipset.h" #include "intel_reg.h" #include "veboxcopy.h" #include "rendercopy.h" #include "media_fill.h" #include "ioctl_wrappers.h" #include "sw_sync.h" #include "i915/gem_mman.h" #include "media_spin.h" #include "gpgpu_fill.h" #include "igt_aux.h" #include "igt_rand.h" #include "i830_reg.h" #include "huc_copy.h" #include #include #define BCS_SWCTRL 0x22200 #define BCS_SRC_Y (1 << 0) #define BCS_DST_Y (1 << 1) /** * SECTION:intel_batchbuffer * @short_description: Batchbuffer and blitter support * @title: Batch Buffer * @include: igt.h * * This library provides some basic support for batchbuffers and using the * blitter engine based upon libdrm. A new batchbuffer is allocated with * intel_batchbuffer_alloc() and for simple blitter commands submitted with * intel_batchbuffer_flush(). * * It also provides some convenient macros to easily emit commands into * batchbuffers. All those macros presume that a pointer to a #intel_batchbuffer * structure called batch is in scope. The basic macros are #BEGIN_BATCH, * #OUT_BATCH, #OUT_RELOC and #ADVANCE_BATCH. * * Note that this library's header pulls in the [i-g-t core](igt-gpu-tools-i-g-t-core.html) * library as a dependency. */ /** * intel_batchbuffer_align: * @batch: batchbuffer object * @align: value in bytes to which we want to align * * Aligns the current in-batch offset to the given value. * * Returns: Batchbuffer offset aligned to the given value. */ uint32_t intel_batchbuffer_align(struct intel_batchbuffer *batch, uint32_t align) { uint32_t offset = batch->ptr - batch->buffer; offset = ALIGN(offset, align); batch->ptr = batch->buffer + offset; return offset; } /** * intel_batchbuffer_subdata_alloc: * @batch: batchbuffer object * @size: amount of bytes need to allocate * @align: value in bytes to which we want to align * * Verify if sufficient @size within @batch is available to deny overflow. * Then allocate @size bytes within @batch. * * Returns: Offset within @batch between allocated subdata and base of @batch. 
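 *
 * A minimal usage sketch (illustrative only; @batch is assumed to have been
 * created with intel_batchbuffer_alloc(), and the 64-byte size/alignment are
 * arbitrary example values):
 *
 * |[<!-- language="C" -->
 *	uint32_t *ss;
 *	uint32_t ss_offset;
 *
 *	// Reserve 64 zeroed bytes inside the batch, aligned to 64 bytes.
 *	ss = intel_batchbuffer_subdata_alloc(batch, 64, 64);
 *
 *	// Offset of that allocation from the start of the batch.
 *	ss_offset = intel_batchbuffer_subdata_offset(batch, ss);
 * ]|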
*/ void * intel_batchbuffer_subdata_alloc(struct intel_batchbuffer *batch, uint32_t size, uint32_t align) { uint32_t offset = intel_batchbuffer_align(batch, align); igt_assert(size <= intel_batchbuffer_space(batch)); batch->ptr += size; return memset(batch->buffer + offset, 0, size); } /** * intel_batchbuffer_subdata_offset: * @batch: batchbuffer object * @ptr: pointer to given data * * Returns: Offset within @batch between @ptr and base of @batch. */ uint32_t intel_batchbuffer_subdata_offset(struct intel_batchbuffer *batch, void *ptr) { return (uint8_t *)ptr - batch->buffer; } /** * intel_batchbuffer_reset: * @batch: batchbuffer object * * Resets @batch by allocating a new gem buffer object as backing storage. */ void intel_batchbuffer_reset(struct intel_batchbuffer *batch) { if (batch->bo != NULL) { drm_intel_bo_unreference(batch->bo); batch->bo = NULL; } batch->bo = drm_intel_bo_alloc(batch->bufmgr, "batchbuffer", BATCH_SZ, 4096); memset(batch->buffer, 0, sizeof(batch->buffer)); batch->ctx = NULL; batch->ptr = batch->buffer; batch->end = NULL; } /** * intel_batchbuffer_alloc: * @bufmgr: libdrm buffer manager * @devid: pci device id of the drm device * * Allocates a new batchbuffer object. @devid must be supplied since libdrm * doesn't expose it directly. * * Returns: The allocated and initialized batchbuffer object. */ struct intel_batchbuffer * intel_batchbuffer_alloc(drm_intel_bufmgr *bufmgr, uint32_t devid) { struct intel_batchbuffer *batch = calloc(sizeof(*batch), 1); batch->bufmgr = bufmgr; batch->devid = devid; batch->gen = intel_gen(devid); intel_batchbuffer_reset(batch); return batch; } /** * intel_batchbuffer_free: * @batch: batchbuffer object * * Releases all resource of the batchbuffer object @batch. */ void intel_batchbuffer_free(struct intel_batchbuffer *batch) { drm_intel_bo_unreference(batch->bo); batch->bo = NULL; free(batch); } #define CMD_POLY_STIPPLE_OFFSET 0x7906 static unsigned int flush_on_ring_common(struct intel_batchbuffer *batch, int ring) { unsigned int used = batch->ptr - batch->buffer; if (used == 0) return 0; if (IS_GEN5(batch->devid)) { /* emit gen5 w/a without batch space checks - we reserve that * already. */ *(uint32_t *) (batch->ptr) = CMD_POLY_STIPPLE_OFFSET << 16; batch->ptr += 4; *(uint32_t *) (batch->ptr) = 0; batch->ptr += 4; } /* Round batchbuffer usage to 2 DWORDs. */ if ((used & 4) == 0) { *(uint32_t *) (batch->ptr) = 0; /* noop */ batch->ptr += 4; } /* Mark the end of the buffer. */ *(uint32_t *)(batch->ptr) = MI_BATCH_BUFFER_END; /* noop */ batch->ptr += 4; return batch->ptr - batch->buffer; } /** * intel_batchbuffer_flush_on_ring: * @batch: batchbuffer object * @ring: execbuf ring flag * * Submits the batch for execution on @ring. */ void intel_batchbuffer_flush_on_ring(struct intel_batchbuffer *batch, int ring) { unsigned int used = flush_on_ring_common(batch, ring); drm_intel_context *ctx; if (used == 0) return; do_or_die(drm_intel_bo_subdata(batch->bo, 0, used, batch->buffer)); batch->ptr = NULL; /* XXX bad kernel API */ ctx = batch->ctx; if (ring != I915_EXEC_RENDER) ctx = NULL; do_or_die(drm_intel_gem_bo_context_exec(batch->bo, ctx, used, ring)); intel_batchbuffer_reset(batch); } void intel_batchbuffer_set_context(struct intel_batchbuffer *batch, drm_intel_context *context) { batch->ctx = context; } /** * intel_batchbuffer_flush_with_context: * @batch: batchbuffer object * @context: libdrm hardware context object * * Submits the batch for execution on the render engine with the supplied * hardware context. 
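 *
 * A rough usage sketch (assumptions: @batch was created with
 * intel_batchbuffer_alloc() and is in scope as "batch" so the emission
 * macros work, and "bufmgr" is the libdrm buffer manager):
 *
 * |[<!-- language="C" -->
 *	drm_intel_context *ctx = drm_intel_gem_context_create(bufmgr);
 *
 *	BEGIN_BATCH(1, 0);
 *	OUT_BATCH(MI_NOOP);
 *	ADVANCE_BATCH();
 *
 *	// Submit on the render ring with the private hardware context.
 *	intel_batchbuffer_flush_with_context(batch, ctx);
 *
 *	drm_intel_gem_context_destroy(ctx);
 * ]|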
*/ void intel_batchbuffer_flush_with_context(struct intel_batchbuffer *batch, drm_intel_context *context) { int ret; unsigned int used = flush_on_ring_common(batch, I915_EXEC_RENDER); if (used == 0) return; ret = drm_intel_bo_subdata(batch->bo, 0, used, batch->buffer); igt_assert(ret == 0); batch->ptr = NULL; ret = drm_intel_gem_bo_context_exec(batch->bo, context, used, I915_EXEC_RENDER); igt_assert(ret == 0); intel_batchbuffer_reset(batch); } /** * intel_batchbuffer_flush: * @batch: batchbuffer object * * Submits the batch for execution on the blitter engine, selecting the right * ring depending upon the hardware platform. */ void intel_batchbuffer_flush(struct intel_batchbuffer *batch) { int ring = 0; if (HAS_BLT_RING(batch->devid)) ring = I915_EXEC_BLT; intel_batchbuffer_flush_on_ring(batch, ring); } /** * intel_batchbuffer_emit_reloc: * @batch: batchbuffer object * @buffer: relocation target libdrm buffer object * @delta: delta value to add to @buffer's gpu address * @read_domains: gem domain bits for the relocation * @write_domain: gem domain bit for the relocation * @fenced: whether this gpu access requires fences * * Emits both a libdrm relocation entry pointing at @buffer and the pre-computed * DWORD of @batch's presumed gpu address plus the supplied @delta into @batch. * * Note that @fenced is only relevant if @buffer is actually tiled. * * This is the only way buffers get added to the validate list. */ void intel_batchbuffer_emit_reloc(struct intel_batchbuffer *batch, drm_intel_bo *buffer, uint64_t delta, uint32_t read_domains, uint32_t write_domain, int fenced) { uint64_t offset; int ret; if (batch->ptr - batch->buffer > BATCH_SZ) igt_info("bad relocation ptr %p map %p offset %d size %d\n", batch->ptr, batch->buffer, (int)(batch->ptr - batch->buffer), BATCH_SZ); if (fenced) ret = drm_intel_bo_emit_reloc_fence(batch->bo, batch->ptr - batch->buffer, buffer, delta, read_domains, write_domain); else ret = drm_intel_bo_emit_reloc(batch->bo, batch->ptr - batch->buffer, buffer, delta, read_domains, write_domain); offset = buffer->offset64; offset += delta; intel_batchbuffer_emit_dword(batch, offset); if (batch->gen >= 8) intel_batchbuffer_emit_dword(batch, offset >> 32); igt_assert(ret == 0); } /** * intel_batchbuffer_copy_data: * @batch: batchbuffer object * @data: pointer to the data to write into the batchbuffer * @bytes: number of bytes to write into the batchbuffer * @align: value in bytes to which we want to align * * This transfers the given @data into the batchbuffer. Note that the length * must be DWORD aligned, i.e. multiples of 32bits. The caller must * confirm that there is enough space in the batch for the data to be * copied. * * Returns: Offset of copied data. 
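 *
 * A small illustrative sketch (the payload values and alignment are
 * arbitrary; @batch is assumed to have enough free space, as noted above):
 *
 * |[<!-- language="C" -->
 *	static const uint32_t payload[] = { 0x12345678, 0x9abcdef0 };
 *	uint32_t offset;
 *
 *	// Copies 8 bytes into the batch at 8-byte alignment and returns
 *	// the offset of the copy within the batch.
 *	offset = intel_batchbuffer_copy_data(batch, payload,
 *					     sizeof(payload), 8);
 * ]|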
*/ uint32_t intel_batchbuffer_copy_data(struct intel_batchbuffer *batch, const void *data, unsigned int bytes, uint32_t align) { uint32_t *subdata; igt_assert((bytes & 3) == 0); subdata = intel_batchbuffer_subdata_alloc(batch, bytes, align); memcpy(subdata, data, bytes); return intel_batchbuffer_subdata_offset(batch, subdata); } #define CHECK_RANGE(x) do { \ igt_assert_lte(0, (x)); \ igt_assert_lt((x), (1 << 15)); \ } while (0) /** * intel_blt_copy: * @batch: batchbuffer object * @src_bo: source libdrm buffer object * @src_x1: source pixel x-coordination * @src_y1: source pixel y-coordination * @src_pitch: @src_bo's pitch in bytes * @dst_bo: destination libdrm buffer object * @dst_x1: destination pixel x-coordination * @dst_y1: destination pixel y-coordination * @dst_pitch: @dst_bo's pitch in bytes * @width: width of the copied rectangle * @height: height of the copied rectangle * @bpp: bits per pixel * * This emits a 2D copy operation using blitter commands into the supplied batch * buffer object. */ void intel_blt_copy(struct intel_batchbuffer *batch, drm_intel_bo *src_bo, int src_x1, int src_y1, int src_pitch, drm_intel_bo *dst_bo, int dst_x1, int dst_y1, int dst_pitch, int width, int height, int bpp) { const int gen = batch->gen; uint32_t src_tiling, dst_tiling, swizzle; uint32_t cmd_bits = 0; uint32_t br13_bits; igt_assert(bpp*(src_x1 + width) <= 8*src_pitch); igt_assert(bpp*(dst_x1 + width) <= 8*dst_pitch); igt_assert(src_pitch * (src_y1 + height) <= src_bo->size); igt_assert(dst_pitch * (dst_y1 + height) <= dst_bo->size); drm_intel_bo_get_tiling(src_bo, &src_tiling, &swizzle); drm_intel_bo_get_tiling(dst_bo, &dst_tiling, &swizzle); if (gen >= 4 && src_tiling != I915_TILING_NONE) { src_pitch /= 4; cmd_bits |= XY_SRC_COPY_BLT_SRC_TILED; } if (gen >= 4 && dst_tiling != I915_TILING_NONE) { dst_pitch /= 4; cmd_bits |= XY_SRC_COPY_BLT_DST_TILED; } CHECK_RANGE(src_x1); CHECK_RANGE(src_y1); CHECK_RANGE(dst_x1); CHECK_RANGE(dst_y1); CHECK_RANGE(width); CHECK_RANGE(height); CHECK_RANGE(src_x1 + width); CHECK_RANGE(src_y1 + height); CHECK_RANGE(dst_x1 + width); CHECK_RANGE(dst_y1 + height); CHECK_RANGE(src_pitch); CHECK_RANGE(dst_pitch); br13_bits = 0; switch (bpp) { case 8: break; case 16: /* supporting only RGB565, not ARGB1555 */ br13_bits |= 1 << 24; break; case 32: br13_bits |= 3 << 24; cmd_bits |= XY_SRC_COPY_BLT_WRITE_ALPHA | XY_SRC_COPY_BLT_WRITE_RGB; break; default: igt_fail(IGT_EXIT_FAILURE); } BLIT_COPY_BATCH_START(cmd_bits); OUT_BATCH((br13_bits) | (0xcc << 16) | /* copy ROP */ dst_pitch); OUT_BATCH((dst_y1 << 16) | dst_x1); /* dst x1,y1 */ OUT_BATCH(((dst_y1 + height) << 16) | (dst_x1 + width)); /* dst x2,y2 */ OUT_RELOC_FENCED(dst_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0); OUT_BATCH((src_y1 << 16) | src_x1); /* src x1,y1 */ OUT_BATCH(src_pitch); OUT_RELOC_FENCED(src_bo, I915_GEM_DOMAIN_RENDER, 0, 0); ADVANCE_BATCH(); #define CMD_POLY_STIPPLE_OFFSET 0x7906 if (gen == 5) { BEGIN_BATCH(2, 0); OUT_BATCH(CMD_POLY_STIPPLE_OFFSET << 16); OUT_BATCH(0); ADVANCE_BATCH(); } if (gen >= 6 && src_bo == dst_bo) { BEGIN_BATCH(3, 0); OUT_BATCH(XY_SETUP_CLIP_BLT_CMD); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } intel_batchbuffer_flush(batch); } /** * intel_copy_bo: * @batch: batchbuffer object * @src_bo: source libdrm buffer object * @dst_bo: destination libdrm buffer object * @size: size of the copy range in bytes * * This emits a copy operation using blitter commands into the supplied batch * buffer object. 
A total of @size bytes from the start of @src_bo is copied * over to @dst_bo. Note that @size must be page-aligned. */ void intel_copy_bo(struct intel_batchbuffer *batch, drm_intel_bo *dst_bo, drm_intel_bo *src_bo, long int size) { igt_assert(size % 4096 == 0); intel_blt_copy(batch, src_bo, 0, 0, 4096, dst_bo, 0, 0, 4096, 4096/4, size/4096, 32); } /** * igt_buf_width: * @buf: the i-g-t buffer object * * Computes the width in 32-bit pixels of the given buffer. * * Returns: * The width of the buffer. */ unsigned igt_buf_width(const struct igt_buf *buf) { return buf->surface[0].stride/(buf->bpp / 8); } /** * igt_buf_height: * @buf: the i-g-t buffer object * * Computes the height in 32-bit pixels of the given buffer. * * Returns: * The height of the buffer. */ unsigned igt_buf_height(const struct igt_buf *buf) { return buf->surface[0].size/buf->surface[0].stride; } /** * igt_buf_intel_ccs_width: * @buf: the Intel i-g-t buffer object * @gen: device generation * * Computes the width of ccs buffer when considered as Intel surface data. * * Returns: * The width of the ccs buffer data. */ unsigned int igt_buf_intel_ccs_width(int gen, const struct igt_buf *buf) { /* * GEN12+: The CCS unit size is 64 bytes mapping 4 main surface * tiles. Thus the width of the CCS unit is 4*32=128 pixels on the * main surface. */ if (gen >= 12) return DIV_ROUND_UP(igt_buf_width(buf), 128) * 64; return DIV_ROUND_UP(igt_buf_width(buf), 1024) * 128; } /** * igt_buf_intel_ccs_height: * @buf: the i-g-t buffer object * @gen: device generation * * Computes the height of ccs buffer when considered as Intel surface data. * * Returns: * The height of the ccs buffer data. */ unsigned int igt_buf_intel_ccs_height(int gen, const struct igt_buf *buf) { /* * GEN12+: The CCS unit size is 64 bytes mapping 4 main surface * tiles. Thus the height of the CCS unit is 32 pixel rows on the main * surface. 
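 *
 * Worked example (illustrative numbers): for a main surface that is 256
 * rows tall, the GEN12+ path below yields DIV_ROUND_UP(256, 32) = 8 CCS
 * rows, while the pre-GEN12 path yields DIV_ROUND_UP(256, 512) * 32 = 32
 * rows.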
*/ if (gen >= 12) return DIV_ROUND_UP(igt_buf_height(buf), 32); return DIV_ROUND_UP(igt_buf_height(buf), 512) * 32; } /* * pitches are in bytes if the surfaces are linear, number of dwords * otherwise */ static uint32_t fast_copy_pitch(unsigned int stride, unsigned int tiling) { if (tiling != I915_TILING_NONE) return stride / 4; else return stride; } static uint32_t fast_copy_dword0(unsigned int src_tiling, unsigned int dst_tiling) { uint32_t dword0 = 0; dword0 |= XY_FAST_COPY_BLT; switch (src_tiling) { case I915_TILING_X: dword0 |= XY_FAST_COPY_SRC_TILING_X; break; case I915_TILING_Y: case I915_TILING_Yf: dword0 |= XY_FAST_COPY_SRC_TILING_Yb_Yf; break; case I915_TILING_Ys: dword0 |= XY_FAST_COPY_SRC_TILING_Ys; break; case I915_TILING_NONE: default: break; } switch (dst_tiling) { case I915_TILING_X: dword0 |= XY_FAST_COPY_DST_TILING_X; break; case I915_TILING_Y: case I915_TILING_Yf: dword0 |= XY_FAST_COPY_DST_TILING_Yb_Yf; break; case I915_TILING_Ys: dword0 |= XY_FAST_COPY_DST_TILING_Ys; break; case I915_TILING_NONE: default: break; } return dword0; } static uint32_t fast_copy_dword1(unsigned int src_tiling, unsigned int dst_tiling, int bpp) { uint32_t dword1 = 0; if (src_tiling == I915_TILING_Yf) dword1 |= XY_FAST_COPY_SRC_TILING_Yf; if (dst_tiling == I915_TILING_Yf) dword1 |= XY_FAST_COPY_DST_TILING_Yf; switch (bpp) { case 8: dword1 |= XY_FAST_COPY_COLOR_DEPTH_8; break; case 16: dword1 |= XY_FAST_COPY_COLOR_DEPTH_16; break; case 32: dword1 |= XY_FAST_COPY_COLOR_DEPTH_32; break; case 64: dword1 |= XY_FAST_COPY_COLOR_DEPTH_64; break; case 128: dword1 |= XY_FAST_COPY_COLOR_DEPTH_128; break; default: igt_assert(0); } return dword1; } static void fill_relocation(struct drm_i915_gem_relocation_entry *reloc, uint32_t gem_handle, uint32_t delta, /* in bytes */ uint32_t offset, /* in dwords */ uint32_t read_domains, uint32_t write_domains) { reloc->target_handle = gem_handle; reloc->delta = delta; reloc->offset = offset * sizeof(uint32_t); reloc->presumed_offset = 0; reloc->read_domains = read_domains; reloc->write_domain = write_domains; } static void fill_object(struct drm_i915_gem_exec_object2 *obj, uint32_t gem_handle, struct drm_i915_gem_relocation_entry *relocs, uint32_t count) { memset(obj, 0, sizeof(*obj)); obj->handle = gem_handle; obj->relocation_count = count; obj->relocs_ptr = to_user_pointer(relocs); } static void exec_blit(int fd, struct drm_i915_gem_exec_object2 *objs, uint32_t count, int gen) { struct drm_i915_gem_execbuffer2 exec = { .buffers_ptr = to_user_pointer(objs), .buffer_count = count, .flags = gen >= 6 ? 
I915_EXEC_BLT : 0, }; gem_execbuf(fd, &exec); } static uint32_t src_copy_dword0(uint32_t src_tiling, uint32_t dst_tiling, uint32_t bpp, uint32_t device_gen) { uint32_t dword0 = 0; dword0 |= XY_SRC_COPY_BLT_CMD; if (bpp == 32) dword0 |= XY_SRC_COPY_BLT_WRITE_RGB | XY_SRC_COPY_BLT_WRITE_ALPHA; if (device_gen >= 4 && src_tiling) dword0 |= XY_SRC_COPY_BLT_SRC_TILED; if (device_gen >= 4 && dst_tiling) dword0 |= XY_SRC_COPY_BLT_DST_TILED; return dword0; } static uint32_t src_copy_dword1(uint32_t dst_pitch, uint32_t bpp) { uint32_t dword1 = 0; switch (bpp) { case 8: break; case 16: dword1 |= 1 << 24; /* Only support 565 color */ break; case 32: dword1 |= 3 << 24; break; default: igt_assert(0); } dword1 |= 0xcc << 16; dword1 |= dst_pitch; return dword1; } /** * igt_blitter_src_copy: * @fd: file descriptor of the i915 driver * @src_handle: GEM handle of the source buffer * @src_delta: offset into the source GEM bo, in bytes * @src_stride: Stride (in bytes) of the source buffer * @src_tiling: Tiling mode of the source buffer * @src_x: X coordinate of the source region to copy * @src_y: Y coordinate of the source region to copy * @width: Width of the region to copy * @height: Height of the region to copy * @bpp: source and destination bits per pixel * @dst_handle: GEM handle of the destination buffer * @dst_delta: offset into the destination GEM bo, in bytes * @dst_stride: Stride (in bytes) of the destination buffer * @dst_tiling: Tiling mode of the destination buffer * @dst_x: X coordinate of destination * @dst_y: Y coordinate of destination * * Copy @src into @dst using the XY_SRC blit command. */ void igt_blitter_src_copy(int fd, /* src */ uint32_t src_handle, uint32_t src_delta, uint32_t src_stride, uint32_t src_tiling, uint32_t src_x, uint32_t src_y, /* size */ uint32_t width, uint32_t height, /* bpp */ uint32_t bpp, /* dst */ uint32_t dst_handle, uint32_t dst_delta, uint32_t dst_stride, uint32_t dst_tiling, uint32_t dst_x, uint32_t dst_y) { uint32_t batch[32]; struct drm_i915_gem_exec_object2 objs[3]; struct drm_i915_gem_relocation_entry relocs[2]; uint32_t batch_handle; uint32_t src_pitch, dst_pitch; uint32_t dst_reloc_offset, src_reloc_offset; uint32_t gen = intel_gen(intel_get_drm_devid(fd)); const bool has_64b_reloc = gen >= 8; int i = 0; memset(batch, 0, sizeof(batch)); igt_assert((src_tiling == I915_TILING_NONE) || (src_tiling == I915_TILING_X) || (src_tiling == I915_TILING_Y)); igt_assert((dst_tiling == I915_TILING_NONE) || (dst_tiling == I915_TILING_X) || (dst_tiling == I915_TILING_Y)); src_pitch = (gen >= 4 && src_tiling) ? src_stride / 4 : src_stride; dst_pitch = (gen >= 4 && dst_tiling) ? 
dst_stride / 4 : dst_stride; if (bpp == 64) { bpp /= 2; width *= 2; } CHECK_RANGE(src_x); CHECK_RANGE(src_y); CHECK_RANGE(dst_x); CHECK_RANGE(dst_y); CHECK_RANGE(width); CHECK_RANGE(height); CHECK_RANGE(src_x + width); CHECK_RANGE(src_y + height); CHECK_RANGE(dst_x + width); CHECK_RANGE(dst_y + height); CHECK_RANGE(src_pitch); CHECK_RANGE(dst_pitch); if ((src_tiling | dst_tiling) >= I915_TILING_Y) { unsigned int mask; batch[i++] = MI_LOAD_REGISTER_IMM; batch[i++] = BCS_SWCTRL; mask = (BCS_SRC_Y | BCS_DST_Y) << 16; if (src_tiling == I915_TILING_Y) mask |= BCS_SRC_Y; if (dst_tiling == I915_TILING_Y) mask |= BCS_DST_Y; batch[i++] = mask; } batch[i] = src_copy_dword0(src_tiling, dst_tiling, bpp, gen); batch[i++] |= 6 + 2 * has_64b_reloc; batch[i++] = src_copy_dword1(dst_pitch, bpp); batch[i++] = (dst_y << 16) | dst_x; /* dst x1,y1 */ batch[i++] = ((dst_y + height) << 16) | (dst_x + width); /* dst x2,y2 */ dst_reloc_offset = i; batch[i++] = dst_delta; /* dst address lower bits */ if (has_64b_reloc) batch[i++] = 0; /* dst address upper bits */ batch[i++] = (src_y << 16) | src_x; /* src x1,y1 */ batch[i++] = src_pitch; src_reloc_offset = i; batch[i++] = src_delta; /* src address lower bits */ if (has_64b_reloc) batch[i++] = 0; /* src address upper bits */ if ((src_tiling | dst_tiling) >= I915_TILING_Y) { igt_assert(gen >= 6); batch[i++] = MI_FLUSH_DW | 2; batch[i++] = 0; batch[i++] = 0; batch[i++] = 0; batch[i++] = MI_LOAD_REGISTER_IMM; batch[i++] = BCS_SWCTRL; batch[i++] = (BCS_SRC_Y | BCS_DST_Y) << 16; } batch[i++] = MI_BATCH_BUFFER_END; batch[i++] = MI_NOOP; igt_assert(i <= ARRAY_SIZE(batch)); batch_handle = gem_create(fd, 4096); gem_write(fd, batch_handle, 0, batch, sizeof(batch)); fill_relocation(&relocs[0], dst_handle, dst_delta, dst_reloc_offset, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER); fill_relocation(&relocs[1], src_handle, src_delta, src_reloc_offset, I915_GEM_DOMAIN_RENDER, 0); fill_object(&objs[0], dst_handle, NULL, 0); fill_object(&objs[1], src_handle, NULL, 0); fill_object(&objs[2], batch_handle, relocs, 2); objs[0].flags |= EXEC_OBJECT_NEEDS_FENCE; objs[1].flags |= EXEC_OBJECT_NEEDS_FENCE; exec_blit(fd, objs, 3, gen); gem_close(fd, batch_handle); } /** * igt_blitter_fast_copy__raw: * @fd: file descriptor of the i915 driver * @src_handle: GEM handle of the source buffer * @src_delta: offset into the source GEM bo, in bytes * @src_stride: Stride (in bytes) of the source buffer * @src_tiling: Tiling mode of the source buffer * @src_x: X coordinate of the source region to copy * @src_y: Y coordinate of the source region to copy * @width: Width of the region to copy * @height: Height of the region to copy * @bpp: source and destination bits per pixel * @dst_handle: GEM handle of the destination buffer * @dst_delta: offset into the destination GEM bo, in bytes * @dst_stride: Stride (in bytes) of the destination buffer * @dst_tiling: Tiling mode of the destination buffer * @dst_x: X coordinate of destination * @dst_y: Y coordinate of destination * * Like igt_blitter_fast_copy(), but talking to the kernel directly. 
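 *
 * A hedged sketch of copying between two linear GEM buffers (the fd, buffer
 * sizes and 256-byte strides are assumptions made only for this example;
 * the fast copy blitter itself is a gen9+ feature):
 *
 * |[<!-- language="C" -->
 *	uint32_t src = gem_create(fd, 64 * 4096);
 *	uint32_t dst = gem_create(fd, 64 * 4096);
 *
 *	// Copy a 64x64, 32bpp rectangle from (0,0) to (0,0),
 *	// both surfaces linear with a 256-byte stride.
 *	igt_blitter_fast_copy__raw(fd,
 *				   src, 0, 256, I915_TILING_NONE, 0, 0,
 *				   64, 64, 32,
 *				   dst, 0, 256, I915_TILING_NONE, 0, 0);
 *
 *	gem_close(fd, src);
 *	gem_close(fd, dst);
 * ]|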
*/ void igt_blitter_fast_copy__raw(int fd, /* src */ uint32_t src_handle, unsigned int src_delta, unsigned int src_stride, unsigned int src_tiling, unsigned int src_x, unsigned src_y, /* size */ unsigned int width, unsigned int height, /* bpp */ int bpp, /* dst */ uint32_t dst_handle, unsigned dst_delta, unsigned int dst_stride, unsigned int dst_tiling, unsigned int dst_x, unsigned dst_y) { uint32_t batch[12]; struct drm_i915_gem_exec_object2 objs[3]; struct drm_i915_gem_relocation_entry relocs[2]; uint32_t batch_handle; uint32_t dword0, dword1; uint32_t src_pitch, dst_pitch; int i = 0; src_pitch = fast_copy_pitch(src_stride, src_tiling); dst_pitch = fast_copy_pitch(dst_stride, dst_tiling); dword0 = fast_copy_dword0(src_tiling, dst_tiling); dword1 = fast_copy_dword1(src_tiling, dst_tiling, bpp); CHECK_RANGE(src_x); CHECK_RANGE(src_y); CHECK_RANGE(dst_x); CHECK_RANGE(dst_y); CHECK_RANGE(width); CHECK_RANGE(height); CHECK_RANGE(src_x + width); CHECK_RANGE(src_y + height); CHECK_RANGE(dst_x + width); CHECK_RANGE(dst_y + height); CHECK_RANGE(src_pitch); CHECK_RANGE(dst_pitch); batch[i++] = dword0; batch[i++] = dword1 | dst_pitch; batch[i++] = (dst_y << 16) | dst_x; /* dst x1,y1 */ batch[i++] = ((dst_y + height) << 16) | (dst_x + width); /* dst x2,y2 */ batch[i++] = dst_delta; /* dst address lower bits */ batch[i++] = 0; /* dst address upper bits */ batch[i++] = (src_y << 16) | src_x; /* src x1,y1 */ batch[i++] = src_pitch; batch[i++] = src_delta; /* src address lower bits */ batch[i++] = 0; /* src address upper bits */ batch[i++] = MI_BATCH_BUFFER_END; batch[i++] = MI_NOOP; igt_assert(i == ARRAY_SIZE(batch)); batch_handle = gem_create(fd, 4096); gem_write(fd, batch_handle, 0, batch, sizeof(batch)); fill_relocation(&relocs[0], dst_handle, dst_delta, 4, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER); fill_relocation(&relocs[1], src_handle, src_delta, 8, I915_GEM_DOMAIN_RENDER, 0); fill_object(&objs[0], dst_handle, NULL, 0); fill_object(&objs[1], src_handle, NULL, 0); fill_object(&objs[2], batch_handle, relocs, 2); exec_blit(fd, objs, 3, intel_gen(intel_get_drm_devid(fd))); gem_close(fd, batch_handle); } /** * igt_blitter_fast_copy: * @batch: batchbuffer object * @src: source i-g-t buffer object * @src_delta: offset into the source i-g-t bo * @src_x: source pixel x-coordination * @src_y: source pixel y-coordination * @width: width of the copied rectangle * @height: height of the copied rectangle * @dst: destination i-g-t buffer object * @dst_delta: offset into the destination i-g-t bo * @dst_x: destination pixel x-coordination * @dst_y: destination pixel y-coordination * * Copy @src into @dst using the gen9 fast copy blitter command. * * The source and destination surfaces cannot overlap. 
*/ void igt_blitter_fast_copy(struct intel_batchbuffer *batch, const struct igt_buf *src, unsigned src_delta, unsigned src_x, unsigned src_y, unsigned width, unsigned height, int bpp, const struct igt_buf *dst, unsigned dst_delta, unsigned dst_x, unsigned dst_y) { uint32_t src_pitch, dst_pitch; uint32_t dword0, dword1; igt_assert(src->bpp == dst->bpp); src_pitch = fast_copy_pitch(src->surface[0].stride, src->tiling); dst_pitch = fast_copy_pitch(dst->surface[0].stride, src->tiling); dword0 = fast_copy_dword0(src->tiling, dst->tiling); dword1 = fast_copy_dword1(src->tiling, dst->tiling, dst->bpp); CHECK_RANGE(src_x); CHECK_RANGE(src_y); CHECK_RANGE(dst_x); CHECK_RANGE(dst_y); CHECK_RANGE(width); CHECK_RANGE(height); CHECK_RANGE(src_x + width); CHECK_RANGE(src_y + height); CHECK_RANGE(dst_x + width); CHECK_RANGE(dst_y + height); CHECK_RANGE(src_pitch); CHECK_RANGE(dst_pitch); BEGIN_BATCH(10, 2); OUT_BATCH(dword0); OUT_BATCH(dword1 | dst_pitch); OUT_BATCH((dst_y << 16) | dst_x); /* dst x1,y1 */ OUT_BATCH(((dst_y + height) << 16) | (dst_x + width)); /* dst x2,y2 */ OUT_RELOC(dst->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, dst_delta); OUT_BATCH(0); /* dst address upper bits */ OUT_BATCH((src_y << 16) | src_x); /* src x1,y1 */ OUT_BATCH(src_pitch); OUT_RELOC(src->bo, I915_GEM_DOMAIN_RENDER, 0, src_delta); OUT_BATCH(0); /* src address upper bits */ ADVANCE_BATCH(); intel_batchbuffer_flush(batch); } /** * igt_get_render_copyfunc: * @devid: pci device id * * Returns: * * The platform-specific render copy function pointer for the device * specified with @devid. Will return NULL when no render copy function is * implemented. */ igt_render_copyfunc_t igt_get_render_copyfunc(int devid) { igt_render_copyfunc_t copy = NULL; if (IS_GEN2(devid)) copy = gen2_render_copyfunc; else if (IS_GEN3(devid)) copy = gen3_render_copyfunc; else if (IS_GEN4(devid) || IS_GEN5(devid)) copy = gen4_render_copyfunc; else if (IS_GEN6(devid)) copy = gen6_render_copyfunc; else if (IS_GEN7(devid)) copy = gen7_render_copyfunc; else if (IS_GEN8(devid)) copy = gen8_render_copyfunc; else if (IS_GEN9(devid) || IS_GEN10(devid)) copy = gen9_render_copyfunc; else if (IS_GEN11(devid)) copy = gen11_render_copyfunc; else if (IS_GEN12(devid)) copy = gen12_render_copyfunc; return copy; } igt_vebox_copyfunc_t igt_get_vebox_copyfunc(int devid) { igt_vebox_copyfunc_t copy = NULL; if (IS_GEN12(devid)) copy = gen12_vebox_copyfunc; return copy; } /** * igt_get_media_fillfunc: * @devid: pci device id * * Returns: * * The platform-specific media fill function pointer for the device specified * with @devid. Will return NULL when no media fill function is implemented. */ igt_fillfunc_t igt_get_media_fillfunc(int devid) { igt_fillfunc_t fill = NULL; if (IS_GEN12(devid)) fill = gen12_media_fillfunc; else if (IS_GEN9(devid) || IS_GEN10(devid) || IS_GEN11(devid)) fill = gen9_media_fillfunc; else if (IS_GEN8(devid)) fill = gen8_media_fillfunc; else if (IS_GEN7(devid)) fill = gen7_media_fillfunc; return fill; } igt_vme_func_t igt_get_media_vme_func(int devid) { igt_vme_func_t fill = NULL; if (IS_GEN11(devid)) fill = gen11_media_vme_func; return fill; } /** * igt_get_gpgpu_fillfunc: * @devid: pci device id * * Returns: * * The platform-specific gpgpu fill function pointer for the device specified * with @devid. Will return NULL when no gpgpu fill function is implemented. 
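 *
 * Callers are expected to handle the NULL case themselves, for example
 * (sketch, assuming "fd" is an open i915 drm fd):
 *
 * |[<!-- language="C" -->
 *	igt_fillfunc_t fill = igt_get_gpgpu_fillfunc(intel_get_drm_devid(fd));
 *
 *	igt_require_f(fill, "no gpgpu fill implementation for this device\n");
 * ]|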
*/ igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid) { igt_fillfunc_t fill = NULL; if (IS_GEN7(devid)) fill = gen7_gpgpu_fillfunc; else if (IS_GEN8(devid)) fill = gen8_gpgpu_fillfunc; else if (IS_GEN9(devid) || IS_GEN10(devid)) fill = gen9_gpgpu_fillfunc; else if (IS_GEN11(devid)) fill = gen11_gpgpu_fillfunc; else if (IS_GEN12(devid)) fill = gen12_gpgpu_fillfunc; return fill; } /** * igt_get_media_spinfunc: * @devid: pci device id * * Returns: * * The platform-specific media spin function pointer for the device specified * with @devid. Will return NULL when no media spin function is implemented. */ igt_media_spinfunc_t igt_get_media_spinfunc(int devid) { igt_media_spinfunc_t spin = NULL; if (IS_GEN9(devid)) spin = gen9_media_spinfunc; else if (IS_GEN8(devid)) spin = gen8_media_spinfunc; return spin; } /* Intel batchbuffer v2 */ static bool intel_bb_debug_tree = false; /* * __reallocate_objects: * @ibb: pointer to intel_bb * * Increases number of objects if necessary. */ static void __reallocate_objects(struct intel_bb *ibb) { uint32_t num; if (ibb->num_objects == ibb->allocated_objects) { num = 4096 / sizeof(*ibb->objects); ibb->objects = realloc(ibb->objects, sizeof(*ibb->objects) * (num + ibb->allocated_objects)); igt_assert(ibb->objects); ibb->allocated_objects += num; memset(&ibb->objects[ibb->num_objects], 0, num * sizeof(*ibb->objects)); } } /* * gen8_canonical_addr * Used to convert any address into canonical form, i.e. [63:48] == [47]. * Based on kernel's sign_extend64 implementation. * @address - a virtual address */ #define GEN8_HIGH_ADDRESS_BIT 47 static uint64_t gen8_canonical_addr(uint64_t address) { int shift = 63 - GEN8_HIGH_ADDRESS_BIT; return (int64_t)(address << shift) >> shift; } static inline uint64_t __intel_bb_propose_offset(struct intel_bb *ibb) { uint64_t offset; if (ibb->enforce_relocs) return 0; /* randomize the address, we try to avoid relocations */ offset = hars_petruska_f54_1_random64(&ibb->prng); offset += 256 << 10; /* Keep the low 256k clear, for negative deltas */ offset &= ibb->gtt_size - 1; offset &= ~(ibb->alignment - 1); offset = gen8_canonical_addr(offset); return offset; } /** * intel_bb_create: * @i915: drm fd * @size: size of the batchbuffer * * Returns: * * Pointer the intel_bb, asserts on failure. */ static struct intel_bb * __intel_bb_create(int i915, uint32_t size, bool do_relocs) { struct intel_bb *ibb = calloc(1, sizeof(*ibb)); uint64_t gtt_size; igt_assert(ibb); ibb->i915 = i915; ibb->devid = intel_get_drm_devid(i915); ibb->gen = intel_gen(ibb->devid); ibb->enforce_relocs = do_relocs; ibb->handle = gem_create(i915, size); ibb->size = size; ibb->alignment = 4096; ibb->batch = calloc(1, size); igt_assert(ibb->batch); ibb->ptr = ibb->batch; ibb->prng = (uint32_t) to_user_pointer(ibb); ibb->fence = -1; gtt_size = gem_aperture_size(i915); if (!gem_uses_full_ppgtt(i915)) gtt_size /= 2; if ((gtt_size - 1) >> 32) { ibb->supports_48b_address = true; /* * Until we develop IGT address allocator we workaround * playing with canonical addresses with 47-bit set to 1 * just by limiting gtt size to 46-bit when gtt is 47 or 48 * bit size. Current interface doesn't pass bo size, so * limiting to 46 bit make us sure we won't enter to * addresses with 47-bit set (we use 32-bit size now so * still we fit 47-bit address space). 
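 *
 * Worked example (for illustration only): with a 48-bit ppGTT,
 * gtt_size == 1ull << 48, so (gtt_size & (3ull << 47)) is non-zero and the
 * size is clamped to 1ull << 46. Offsets proposed from that range never
 * have bit 47 set, so they never need sign-extension into bits [63:48].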
*/ if (gtt_size & (3ull << 47)) gtt_size = (1ull << 46); } ibb->gtt_size = gtt_size; ibb->batch_offset = __intel_bb_propose_offset(ibb); intel_bb_add_object(ibb, ibb->handle, ibb->batch_offset, false); ibb->refcount = 1; return ibb; } /** * intel_bb_create: * @i915: drm fd * @size: size of the batchbuffer * * Returns: * * Pointer the intel_bb, asserts on failure. */ struct intel_bb *intel_bb_create(int i915, uint32_t size) { return __intel_bb_create(i915, size, false); } /** * intel_bb_create_with_relocs: * @i915: drm fd * @size: size of the batchbuffer * * Disable passing or randomizing addresses. This will lead to relocations * when objects are not previously pinned. * * Returns: * * Pointer the intel_bb, asserts on failure. */ struct intel_bb *intel_bb_create_with_relocs(int i915, uint32_t size) { return __intel_bb_create(i915, size, true); } static void __intel_bb_destroy_relocations(struct intel_bb *ibb) { uint32_t i; /* Free relocations */ for (i = 0; i < ibb->num_objects; i++) { free(from_user_pointer(ibb->objects[i]->relocs_ptr)); ibb->objects[i]->relocs_ptr = to_user_pointer(NULL); ibb->objects[i]->relocation_count = 0; } ibb->relocs = NULL; ibb->num_relocs = 0; ibb->allocated_relocs = 0; } static void __intel_bb_destroy_objects(struct intel_bb *ibb) { free(ibb->objects); ibb->objects = NULL; tdestroy(ibb->current, free); ibb->current = NULL; ibb->num_objects = 0; ibb->allocated_objects = 0; } static void __intel_bb_destroy_cache(struct intel_bb *ibb) { tdestroy(ibb->root, free); ibb->root = NULL; } /** * intel_bb_destroy: * @ibb: pointer to intel_bb * * Frees all relocations / objects allocated during filling the batch. */ void intel_bb_destroy(struct intel_bb *ibb) { igt_assert(ibb); ibb->refcount--; igt_assert_f(ibb->refcount == 0, "Trying to destroy referenced bb!"); __intel_bb_destroy_relocations(ibb); __intel_bb_destroy_objects(ibb); __intel_bb_destroy_cache(ibb); gem_close(ibb->i915, ibb->handle); if (ibb->fence >= 0) close(ibb->fence); free(ibb->batch); free(ibb); } /* * intel_bb_reset: * @ibb: pointer to intel_bb * @purge_objects_cache: if true destroy internal execobj and relocs + cache * * Recreate batch bo when there's no additional reference. */ void intel_bb_reset(struct intel_bb *ibb, bool purge_objects_cache) { uint32_t i; if (purge_objects_cache && ibb->refcount > 1) igt_warn("Cannot purge objects cache on bb, refcount > 1!"); /* Someone keeps reference, just exit */ if (ibb->refcount > 1) return; /* * To avoid relocation objects previously pinned to high virtual * addresses should keep 48bit flag. Ensure we won't clear it * in the reset path. */ for (i = 0; i < ibb->num_objects; i++) ibb->objects[i]->flags &= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; __intel_bb_destroy_relocations(ibb); __intel_bb_destroy_objects(ibb); __reallocate_objects(ibb); if (purge_objects_cache) { __intel_bb_destroy_cache(ibb); ibb->batch_offset = __intel_bb_propose_offset(ibb); } else { struct drm_i915_gem_exec_object2 *object; object = intel_bb_find_object(ibb, ibb->handle); ibb->batch_offset = object ? object->offset : __intel_bb_propose_offset(ibb); } gem_close(ibb->i915, ibb->handle); ibb->handle = gem_create(ibb->i915, ibb->size); intel_bb_add_object(ibb, ibb->handle, ibb->batch_offset, false); ibb->ptr = ibb->batch; memset(ibb->batch, 0, ibb->size); } /* * intel_bb_sync: * @ibb: pointer to intel_bb * * Waits for bb completion. Returns 0 on success, otherwise errno. 
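 *
 * Typical usage sketch (assumes the batch was submitted asynchronously,
 * i.e. with sync == false):
 *
 *	intel_bb_exec(ibb, intel_bb_emit_bbe(ibb), I915_EXEC_DEFAULT, false);
 *	// ... do other work ...
 *	igt_assert_eq(intel_bb_sync(ibb), 0);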
*/ int intel_bb_sync(struct intel_bb *ibb) { int ret; if (ibb->fence < 0) return 0; ret = sync_fence_wait(ibb->fence, -1); if (ret == 0) { close(ibb->fence); ibb->fence = -1; } return ret; } /* * intel_bb_print: * @ibb: pointer to intel_bb * * Prints batch to stdout. */ void intel_bb_print(struct intel_bb *ibb) { igt_info("drm fd: %d, gen: %d, devid: %u, debug: %d\n", ibb->i915, ibb->gen, ibb->devid, ibb->debug); igt_info("handle: %u, size: %u, batch: %p, ptr: %p\n", ibb->handle, ibb->size, ibb->batch, ibb->ptr); igt_info("prng: %u, gtt_size: %" PRIu64 ", supports 48bit: %d\n", ibb->prng, ibb->gtt_size, ibb->supports_48b_address); igt_info("ctx: %u\n", ibb->ctx); igt_info("root: %p\n", ibb->root); igt_info("objects: %p, num_objects: %u, allocated obj: %u\n", ibb->objects, ibb->num_objects, ibb->allocated_objects); igt_info("relocs: %p, num_relocs: %u, allocated_relocs: %u\n----\n", ibb->relocs, ibb->num_relocs, ibb->allocated_relocs); } /* * intel_bb_dump: * @ibb: pointer to intel_bb * @filename: name to which write bb * * Dump batch bo to file. */ void intel_bb_dump(struct intel_bb *ibb, const char *filename) { FILE *out; void *ptr; ptr = gem_mmap__device_coherent(ibb->i915, ibb->handle, 0, ibb->size, PROT_READ); out = fopen(filename, "wb"); igt_assert(out); fwrite(ptr, ibb->size, 1, out); fclose(out); munmap(ptr, ibb->size); } /** * intel_bb_set_debug: * @ibb: pointer to intel_bb * @debug: true / false * * Sets debug to true / false. Execbuf is then called synchronously and * object/reloc arrays are printed after execution. */ void intel_bb_set_debug(struct intel_bb *ibb, bool debug) { ibb->debug = debug; } /** * intel_bb_set_dump_base64: * @ibb: pointer to intel_bb * @dump: true / false * * Do bb dump as base64 string before execbuf call. */ void intel_bb_set_dump_base64(struct intel_bb *ibb, bool dump) { ibb->dump_base64 = dump; } static int __compare_objects(const void *p1, const void *p2) { const struct drm_i915_gem_exec_object2 *o1 = p1, *o2 = p2; return (int) ((int64_t) o1->handle - (int64_t) o2->handle); } static struct drm_i915_gem_exec_object2 * __add_to_cache(struct intel_bb *ibb, uint32_t handle) { struct drm_i915_gem_exec_object2 **found, *object; object = malloc(sizeof(*object)); object->handle = handle; found = tsearch((void *) object, &ibb->root, __compare_objects); if (*found == object) { memset(object, 0, sizeof(*object)); object->handle = handle; object->alignment = ibb->alignment; } else { free(object); object = *found; } return object; } static int __compare_handles(const void *p1, const void *p2) { return (int) (*(int32_t *) p1 - *(int32_t *) p2); } static void __add_to_objects(struct intel_bb *ibb, struct drm_i915_gem_exec_object2 *object) { uint32_t i, **found, *handle; handle = malloc(sizeof(*handle)); *handle = object->handle; found = tsearch((void *) handle, &ibb->current, __compare_handles); if (*found == handle) { __reallocate_objects(ibb); i = ibb->num_objects++; ibb->objects[i] = object; } else { free(handle); } } /** * intel_bb_add_object: * @ibb: pointer to intel_bb * @handle: which handle to add to objects array * @offset: presumed offset of the object when no relocation is enforced * @write: does a handle is a render target * * Function adds or updates execobj slot in bb objects array and * in the object tree. When object is a render target it has to * be marked with EXEC_OBJECT_WRITE flag. 
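 *
 * Minimal sketch ("i915" is assumed to be the drm fd; passing
 * INTEL_BUF_INVALID_ADDRESS lets the library propose an offset):
 *
 * |[<!-- language="C" -->
 *	uint32_t target = gem_create(i915, 4096);
 *
 *	// Add as a render target (gets EXEC_OBJECT_WRITE) at a
 *	// library-chosen offset.
 *	intel_bb_add_object(ibb, target, INTEL_BUF_INVALID_ADDRESS, true);
 * ]|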
*/ struct drm_i915_gem_exec_object2 * intel_bb_add_object(struct intel_bb *ibb, uint32_t handle, uint64_t offset, bool write) { struct drm_i915_gem_exec_object2 *object; object = __add_to_cache(ibb, handle); __add_to_objects(ibb, object); /* Limit current offset to gtt size */ object->offset = offset; if (offset != INTEL_BUF_INVALID_ADDRESS) object->offset = gen8_canonical_addr(offset & (ibb->gtt_size - 1)); if (object->offset == INTEL_BUF_INVALID_ADDRESS) object->offset = __intel_bb_propose_offset(ibb); if (write) object->flags |= EXEC_OBJECT_WRITE; if (ibb->supports_48b_address) object->flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; return object; } struct drm_i915_gem_exec_object2 * intel_bb_add_intel_buf(struct intel_bb *ibb, struct intel_buf *buf, bool write) { struct drm_i915_gem_exec_object2 *obj; obj = intel_bb_add_object(ibb, buf->handle, buf->addr.offset, write); /* For compressed surfaces ensure address is aligned to 64KB */ if (ibb->gen >= 12 && buf->compression) { obj->offset &= ~(0x10000 - 1); obj->alignment = 0x10000; } /* For gen3 ensure tiled buffers are aligned to power of two size */ if (ibb->gen == 3 && buf->tiling) { uint64_t alignment = 1024 * 1024; while (alignment < buf->surface[0].size) alignment <<= 1; obj->offset &= ~(alignment - 1); obj->alignment = alignment; } /* Update address in intel_buf buffer */ buf->addr.offset = obj->offset; return obj; } struct drm_i915_gem_exec_object2 * intel_bb_find_object(struct intel_bb *ibb, uint32_t handle) { struct drm_i915_gem_exec_object2 object = { .handle = handle }; struct drm_i915_gem_exec_object2 **found; found = tfind((void *) &object, &ibb->root, __compare_objects); return *found; } bool intel_bb_object_set_flag(struct intel_bb *ibb, uint32_t handle, uint64_t flag) { struct drm_i915_gem_exec_object2 object = { .handle = handle }; struct drm_i915_gem_exec_object2 **found; found = tfind((void *) &object, &ibb->root, __compare_objects); if (!found) { igt_warn("Trying to set fence on not found handle: %u\n", handle); return false; } (*found)->flags |= flag; return true; } bool intel_bb_object_clear_flag(struct intel_bb *ibb, uint32_t handle, uint64_t flag) { struct drm_i915_gem_exec_object2 object = { .handle = handle }; struct drm_i915_gem_exec_object2 **found; found = tfind((void *) &object, &ibb->root, __compare_objects); if (!found) { igt_warn("Trying to set fence on not found handle: %u\n", handle); return false; } (*found)->flags &= ~flag; return true; } /* * intel_bb_add_reloc: * @ibb: pointer to intel_bb * @to_handle: object handle in which do relocation * @handle: object handle which address will be taken to patch the @to_handle * @read_domains: gem domain bits for the relocation * @write_domain: gem domain bit for the relocation * @delta: delta value to add to @buffer's gpu address * @offset: offset within bb to be patched * @presumed_offset: address of the object in address space. If -1 is passed * then final offset of the object will be randomized (for no-reloc bb) or * 0 (for reloc bb, in that case reloc.presumed_offset will be -1). In * case address is known it should passed in @presumed_offset (for no-reloc). * * Function allocates additional relocation slot in reloc array for a handle. * It also implicitly adds handle in the objects array if object doesn't * exists but doesn't mark it as a render target. 
*/ static uint64_t intel_bb_add_reloc(struct intel_bb *ibb, uint32_t to_handle, uint32_t handle, uint32_t read_domains, uint32_t write_domain, uint64_t delta, uint64_t offset, uint64_t presumed_offset) { struct drm_i915_gem_relocation_entry *relocs; struct drm_i915_gem_exec_object2 *object, *to_object; uint32_t i; object = intel_bb_add_object(ibb, handle, presumed_offset, false); /* For ibb we have relocs allocated in chunks */ if (to_handle == ibb->handle) { relocs = ibb->relocs; if (ibb->num_relocs == ibb->allocated_relocs) { ibb->allocated_relocs += 4096 / sizeof(*relocs); relocs = realloc(relocs, sizeof(*relocs) * ibb->allocated_relocs); igt_assert(relocs); ibb->relocs = relocs; } i = ibb->num_relocs++; } else { to_object = intel_bb_find_object(ibb, to_handle); igt_assert_f(to_object, "object has to be added to ibb first!\n"); i = to_object->relocation_count++; relocs = from_user_pointer(to_object->relocs_ptr); relocs = realloc(relocs, sizeof(*relocs) * to_object->relocation_count); to_object->relocs_ptr = to_user_pointer(relocs); igt_assert(relocs); } memset(&relocs[i], 0, sizeof(*relocs)); relocs[i].target_handle = handle; relocs[i].read_domains = read_domains; relocs[i].write_domain = write_domain; relocs[i].delta = delta; relocs[i].offset = offset; if (ibb->enforce_relocs) relocs[i].presumed_offset = -1; else relocs[i].presumed_offset = object->offset; igt_debug("add reloc: to_handle: %u, handle: %u, r/w: 0x%x/0x%x, " "delta: 0x%" PRIx64 ", " "offset: 0x%" PRIx64 ", " "poffset: %p\n", to_handle, handle, read_domains, write_domain, delta, offset, from_user_pointer(relocs[i].presumed_offset)); return object->offset; } static uint64_t __intel_bb_emit_reloc(struct intel_bb *ibb, uint32_t to_handle, uint32_t to_offset, uint32_t handle, uint32_t read_domains, uint32_t write_domain, uint64_t delta, uint64_t presumed_offset) { uint64_t address; igt_assert(ibb); address = intel_bb_add_reloc(ibb, to_handle, handle, read_domains, write_domain, delta, to_offset, presumed_offset); intel_bb_out(ibb, delta + address); if (ibb->gen >= 8) intel_bb_out(ibb, address >> 32); return address; } /** * intel_bb_emit_reloc: * @ibb: pointer to intel_bb * @handle: object handle which address will be taken to patch the bb * @read_domains: gem domain bits for the relocation * @write_domain: gem domain bit for the relocation * @delta: delta value to add to @buffer's gpu address * @presumed_offset: address of the object in address space. If -1 is passed * then final offset of the object will be randomized (for no-reloc bb) or * 0 (for reloc bb, in that case reloc.presumed_offset will be -1). In * case address is known it should passed in @presumed_offset (for no-reloc). * @write: does a handle is a render target * * Function prepares relocation (execobj if required + reloc) and emits * offset in bb. For I915_EXEC_NO_RELOC presumed_offset is a hint we already * have object in valid place and relocation step can be skipped in this case. * * Note: delta is value added to address, mostly used when some instructions * require modify-bit set to apply change. Which delta is valid depends * on instruction (see instruction specification). 
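 *
 * Sketch of emitting a write-target address while building an instruction
 * ("target_handle" is assumed to be an existing GEM buffer; -1 is passed
 * because its final address is not yet known):
 *
 * |[<!-- language="C" -->
 *	uint64_t addr;
 *
 *	addr = intel_bb_emit_reloc(ibb, target_handle,
 *				   I915_GEM_DOMAIN_RENDER,
 *				   I915_GEM_DOMAIN_RENDER,
 *				   0, -1);
 *	// addr is the address that was written into the batch
 *	// (one dword pre-gen8, two dwords on gen8+).
 * ]|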
*/ uint64_t intel_bb_emit_reloc(struct intel_bb *ibb, uint32_t handle, uint32_t read_domains, uint32_t write_domain, uint64_t delta, uint64_t presumed_offset) { igt_assert(ibb); return __intel_bb_emit_reloc(ibb, ibb->handle, intel_bb_offset(ibb), handle, read_domains, write_domain, delta, presumed_offset); } uint64_t intel_bb_emit_reloc_fenced(struct intel_bb *ibb, uint32_t handle, uint32_t read_domains, uint32_t write_domain, uint64_t delta, uint64_t presumed_offset) { uint64_t address; address = intel_bb_emit_reloc(ibb, handle, read_domains, write_domain, delta, presumed_offset); intel_bb_object_set_flag(ibb, handle, EXEC_OBJECT_NEEDS_FENCE); return address; } /** * intel_bb_offset_reloc: * @ibb: pointer to intel_bb * @handle: object handle which address will be taken to patch the bb * @read_domains: gem domain bits for the relocation * @write_domain: gem domain bit for the relocation * @offset: offset within bb to be patched * @presumed_offset: address of the object in address space. If -1 is passed * then final offset of the object will be randomized (for no-reloc bb) or * 0 (for reloc bb, in that case reloc.presumed_offset will be -1). In * case address is known it should passed in @presumed_offset (for no-reloc). * * Function prepares relocation (execobj if required + reloc). It it used * for editing batchbuffer via modifying structures. It means when we're * preparing batchbuffer it is more descriptive to edit the structure * than emitting dwords. But it require for some fields to point the * relocation. For that case @offset is passed by the user and it points * to the offset in bb where the relocation will be applied. */ uint64_t intel_bb_offset_reloc(struct intel_bb *ibb, uint32_t handle, uint32_t read_domains, uint32_t write_domain, uint32_t offset, uint64_t presumed_offset) { igt_assert(ibb); return intel_bb_add_reloc(ibb, ibb->handle, handle, read_domains, write_domain, 0, offset, presumed_offset); } uint64_t intel_bb_offset_reloc_with_delta(struct intel_bb *ibb, uint32_t handle, uint32_t read_domains, uint32_t write_domain, uint32_t delta, uint32_t offset, uint64_t presumed_offset) { igt_assert(ibb); return intel_bb_add_reloc(ibb, ibb->handle, handle, read_domains, write_domain, delta, offset, presumed_offset); } uint64_t intel_bb_offset_reloc_to_object(struct intel_bb *ibb, uint32_t to_handle, uint32_t handle, uint32_t read_domains, uint32_t write_domain, uint32_t delta, uint32_t offset, uint64_t presumed_offset) { igt_assert(ibb); return intel_bb_add_reloc(ibb, to_handle, handle, read_domains, write_domain, delta, offset, presumed_offset); } static void intel_bb_dump_execbuf(struct intel_bb *ibb, struct drm_i915_gem_execbuffer2 *execbuf) { struct drm_i915_gem_exec_object2 *objects; struct drm_i915_gem_relocation_entry *relocs, *reloc; int i, j; uint64_t address; igt_info("execbuf batch len: %u, start offset: 0x%x, " "DR1: 0x%x, DR4: 0x%x, " "num clip: %u, clipptr: 0x%llx, " "flags: 0x%llx, rsvd1: 0x%llx, rsvd2: 0x%llx\n", execbuf->batch_len, execbuf->batch_start_offset, execbuf->DR1, execbuf->DR4, execbuf->num_cliprects, execbuf->cliprects_ptr, execbuf->flags, execbuf->rsvd1, execbuf->rsvd2); igt_info("execbuf buffer_count: %d\n", execbuf->buffer_count); for (i = 0; i < execbuf->buffer_count; i++) { objects = &((struct drm_i915_gem_exec_object2 *) from_user_pointer(execbuf->buffers_ptr))[i]; relocs = from_user_pointer(objects->relocs_ptr); address = objects->offset; igt_info(" [%d] handle: %u, reloc_count: %d, reloc_ptr: %p, " "align: 0x%llx, offset: 0x%" PRIx64 ", flags: 
0x%llx, " "rsvd1: 0x%llx, rsvd2: 0x%llx\n", i, objects->handle, objects->relocation_count, relocs, objects->alignment, address, objects->flags, objects->rsvd1, objects->rsvd2); if (objects->relocation_count) { igt_info("\texecbuf relocs:\n"); for (j = 0; j < objects->relocation_count; j++) { reloc = &relocs[j]; address = reloc->presumed_offset; igt_info("\t [%d] target handle: %u, " "offset: 0x%llx, delta: 0x%x, " "presumed_offset: 0x%" PRIx64 ", " "read_domains: 0x%x, " "write_domain: 0x%x\n", j, reloc->target_handle, reloc->offset, reloc->delta, address, reloc->read_domains, reloc->write_domain); } } } } static void intel_bb_dump_base64(struct intel_bb *ibb, int linelen) { int outsize; gchar *str, *pos; igt_info("--- bb ---\n"); pos = str = g_base64_encode((const guchar *) ibb->batch, ibb->size); outsize = strlen(str); while (outsize > 0) { igt_info("%.*s\n", min(outsize, linelen), pos); pos += linelen; outsize -= linelen; } free(str); } static void print_node(const void *node, VISIT which, int depth) { const struct drm_i915_gem_exec_object2 *object = *(const struct drm_i915_gem_exec_object2 **) node; (void) depth; switch (which) { case preorder: case endorder: break; case postorder: case leaf: igt_info("\t handle: %u, offset: 0x%" PRIx64 "\n", object->handle, (uint64_t) object->offset); break; } } static struct drm_i915_gem_exec_object2 * create_objects_array(struct intel_bb *ibb) { struct drm_i915_gem_exec_object2 *objects; uint32_t i; objects = malloc(sizeof(*objects) * ibb->num_objects); igt_assert(objects); for (i = 0; i < ibb->num_objects; i++) objects[i] = *(ibb->objects[i]); return objects; } static void update_offsets(struct intel_bb *ibb, struct drm_i915_gem_exec_object2 *objects) { struct drm_i915_gem_exec_object2 *object; uint32_t i; for (i = 0; i < ibb->num_objects; i++) { object = intel_bb_find_object(ibb, objects[i].handle); igt_assert(object); object->offset = objects[i].offset; } } #define LINELEN 76 /* * @__intel_bb_exec: * @ibb: pointer to intel_bb * @end_offset: offset of the last instruction in the bb * @flags: flags passed directly to execbuf * @ctx: context * @sync: if true wait for execbuf completion, otherwise caller is responsible * to wait for completion * * Returns: 0 on success, otherwise errno. * * Note: In this step execobj for bb is allocated and inserted to the objects * array. 
*/ int __intel_bb_exec(struct intel_bb *ibb, uint32_t end_offset, uint32_t ctx, uint64_t flags, bool sync) { struct drm_i915_gem_execbuffer2 execbuf; struct drm_i915_gem_exec_object2 *objects; int ret, fence, new_fence; ibb->objects[0]->relocs_ptr = to_user_pointer(ibb->relocs); ibb->objects[0]->relocation_count = ibb->num_relocs; ibb->objects[0]->handle = ibb->handle; gem_write(ibb->i915, ibb->handle, 0, ibb->batch, ibb->size); memset(&execbuf, 0, sizeof(execbuf)); objects = create_objects_array(ibb); execbuf.buffers_ptr = (uintptr_t) objects; execbuf.buffer_count = ibb->num_objects; execbuf.batch_len = end_offset; execbuf.rsvd1 = ibb->ctx = ctx; execbuf.flags = flags | I915_EXEC_BATCH_FIRST | I915_EXEC_FENCE_OUT; if (ibb->enforce_relocs) execbuf.flags &= ~I915_EXEC_NO_RELOC; execbuf.rsvd2 = 0; if (ibb->dump_base64) intel_bb_dump_base64(ibb, LINELEN); ret = __gem_execbuf_wr(ibb->i915, &execbuf); if (ret) { intel_bb_dump_execbuf(ibb, &execbuf); free(objects); return ret; } /* Update addresses in the cache */ update_offsets(ibb, objects); /* Save/merge fences */ fence = execbuf.rsvd2 >> 32; if (ibb->fence < 0) { ibb->fence = fence; } else { new_fence = sync_fence_merge(ibb->fence, fence); close(ibb->fence); close(fence); ibb->fence = new_fence; } if (sync || ibb->debug) igt_assert(intel_bb_sync(ibb) == 0); if (ibb->debug) { intel_bb_dump_execbuf(ibb, &execbuf); if (intel_bb_debug_tree) { igt_info("\nTree:\n"); twalk(ibb->root, print_node); } } free(objects); return 0; } /** * intel_bb_exec: * @ibb: pointer to intel_bb * @end_offset: offset of the last instruction in the bb * @flags: flags passed directly to execbuf * @sync: if true wait for execbuf completion, otherwise caller is responsible * to wait for completion * * Do execbuf with default context. Asserts on failure. */ void intel_bb_exec(struct intel_bb *ibb, uint32_t end_offset, uint64_t flags, bool sync) { igt_assert_eq(__intel_bb_exec(ibb, end_offset, 0, flags, sync), 0); } /* * intel_bb_exec_with_context: * @ibb: pointer to intel_bb * @end_offset: offset of the last instruction in the bb * @flags: flags passed directly to execbuf * @ctx: context * @sync: if true wait for execbuf completion, otherwise caller is responsible * to wait for completion * * Do execbuf with context @context. */ void intel_bb_exec_with_context(struct intel_bb *ibb, uint32_t end_offset, uint32_t ctx, uint64_t flags, bool sync) { igt_assert_eq(__intel_bb_exec(ibb, end_offset, ctx, flags, sync), 0); } /** * intel_bb_get_object_address: * @ibb: pointer to intel_bb * @handle: object handle * * When objects addresses are previously pinned and we don't want to relocate * we need to acquire them from previous execbuf. Function returns previous * object offset for @handle or 0 if object is not found. */ uint64_t intel_bb_get_object_offset(struct intel_bb *ibb, uint32_t handle) { struct drm_i915_gem_exec_object2 object = { .handle = handle }; struct drm_i915_gem_exec_object2 **found; uint64_t address; igt_assert(ibb); found = tfind((void *)&object, &ibb->root, __compare_objects); if (!found) return INTEL_BUF_INVALID_ADDRESS; address = (*found)->offset; if (address == INTEL_BUF_INVALID_ADDRESS) return address; return address & (ibb->gtt_size - 1); } /** * intel_bb_object_offset_to_buf: * @ibb: pointer to intel_bb * @buf: buffer we want to store last exec offset and context id * * Copy object offset used in the batch to intel_buf to allow caller prepare * other batch likely without relocations. 
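 *
 * Sketch (assumes "buf" is an intel_buf that was previously added to @ibb
 * and that the batch has been executed at least once):
 *
 * |[<!-- language="C" -->
 *	if (intel_bb_object_offset_to_buf(ibb, buf))
 *		igt_debug("buf pinned at 0x%" PRIx64 "\n", buf->addr.offset);
 * ]|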
*/ bool intel_bb_object_offset_to_buf(struct intel_bb *ibb, struct intel_buf *buf) { struct drm_i915_gem_exec_object2 object = { .handle = buf->handle }; struct drm_i915_gem_exec_object2 **found; igt_assert(ibb); igt_assert(buf); found = tfind((void *)&object, &ibb->root, __compare_objects); if (!found) { buf->addr.offset = 0; buf->addr.ctx = 0; return false; } buf->addr.offset = (*found)->offset & (ibb->gtt_size - 1); buf->addr.ctx = ibb->ctx; return true; } /* * intel_bb_emit_bbe: * @ibb: batchbuffer * * Outputs MI_BATCH_BUFFER_END and ensures batch is properly aligned. */ uint32_t intel_bb_emit_bbe(struct intel_bb *ibb) { /* Mark the end of the buffer. */ intel_bb_out(ibb, MI_BATCH_BUFFER_END); intel_bb_ptr_align(ibb, 8); return intel_bb_offset(ibb); } /* * intel_bb_emit_flush_common: * @ibb: batchbuffer * * Emits instructions which completes batch buffer. * * Returns: offset in batch buffer where there's end of instructions. */ uint32_t intel_bb_emit_flush_common(struct intel_bb *ibb) { if (intel_bb_offset(ibb) == 0) return 0; if (ibb->gen == 5) { /* * emit gen5 w/a without batch space checks - we reserve that * already. */ intel_bb_out(ibb, CMD_POLY_STIPPLE_OFFSET << 16); intel_bb_out(ibb, 0); } /* Round batchbuffer usage to 2 DWORDs. */ if ((intel_bb_offset(ibb) & 4) == 0) intel_bb_out(ibb, 0); intel_bb_emit_bbe(ibb); return intel_bb_offset(ibb); } static void intel_bb_exec_with_context_ring(struct intel_bb *ibb, uint32_t ctx, uint32_t ring) { intel_bb_exec_with_context(ibb, intel_bb_offset(ibb), ctx, ring | I915_EXEC_NO_RELOC, false); intel_bb_reset(ibb, false); } /* * intel_bb_flush: * @ibb: batchbuffer * @ctx: context * @ring: ring * * If batch is not empty emit batch buffer end, execute on specified * context, ring then reset the batch. */ void intel_bb_flush(struct intel_bb *ibb, uint32_t ctx, uint32_t ring) { if (intel_bb_emit_flush_common(ibb) == 0) return; intel_bb_exec_with_context_ring(ibb, ctx, ring); } static void __intel_bb_flush_render_with_context(struct intel_bb *ibb, uint32_t ctx) { uint32_t ring = I915_EXEC_RENDER; if (intel_bb_emit_flush_common(ibb) == 0) return; intel_bb_exec_with_context_ring(ibb, ctx, ring); } /* * intel_bb_flush_render: * @ibb: batchbuffer * * If batch is not empty emit batch buffer end, execute on render ring * and reset the batch. * Context used to execute is previous batch context. */ void intel_bb_flush_render(struct intel_bb *ibb) { __intel_bb_flush_render_with_context(ibb, ibb->ctx); } /* * intel_bb_flush_render_with_context: * @ibb: batchbuffer * @ctx: context * * If batch is not empty emit batch buffer end, execute on render ring with @ctx * and reset the batch. */ void intel_bb_flush_render_with_context(struct intel_bb *ibb, uint32_t ctx) { __intel_bb_flush_render_with_context(ibb, ctx); } static void __intel_bb_flush_blit_with_context(struct intel_bb *ibb, uint32_t ctx) { uint32_t ring = I915_EXEC_DEFAULT; if (intel_bb_emit_flush_common(ibb) == 0) return; if (HAS_BLT_RING(ibb->devid)) ring = I915_EXEC_BLT; intel_bb_exec_with_context_ring(ibb, ctx, ring); } /* * intel_bb_flush_blit: * @ibb: batchbuffer * * If batch is not empty emit batch buffer end, execute on default/blit ring * (depends on gen) and reset the batch. * Context used to execute is previous batch context. 
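 *
 * Minimal sketch:
 *
 *	intel_bb_out(ibb, MI_NOOP);
 *	intel_bb_flush_blit(ibb); // appends BBE, submits, then resets ibb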
/*
 * intel_bb_flush_blit:
 * @ibb: batchbuffer
 *
 * If the batch is not empty, emit the batch buffer end, execute it on the
 * default/blit ring (depending on gen) and reset the batch. The context used
 * for execution is the previous batch context.
 */
void intel_bb_flush_blit(struct intel_bb *ibb)
{
	__intel_bb_flush_blit_with_context(ibb, ibb->ctx);
}

/*
 * intel_bb_flush_blit_with_context:
 * @ibb: batchbuffer
 * @ctx: context
 *
 * If the batch is not empty, emit the batch buffer end, execute it on the
 * default/blit ring (depending on gen) with @ctx and reset the batch.
 */
void intel_bb_flush_blit_with_context(struct intel_bb *ibb, uint32_t ctx)
{
	__intel_bb_flush_blit_with_context(ibb, ctx);
}

/*
 * intel_bb_copy_data:
 * @ibb: batchbuffer
 * @data: pointer to the data which should be copied into the batch
 * @bytes: number of bytes to copy, must be a multiple of 4 (full dwords)
 * @align: alignment within the batch
 *
 * Copies @bytes of data pointed to by @data into the batch buffer.
 *
 * Returns: offset within the batch at which the data was copied.
 */
uint32_t intel_bb_copy_data(struct intel_bb *ibb,
			    const void *data, unsigned int bytes,
			    uint32_t align)
{
	uint32_t *subdata, offset;

	igt_assert((bytes & 3) == 0);

	intel_bb_ptr_align(ibb, align);
	offset = intel_bb_offset(ibb);
	igt_assert(offset + bytes < ibb->size);

	subdata = intel_bb_ptr(ibb);
	memcpy(subdata, data, bytes);
	intel_bb_ptr_add(ibb, bytes);

	return offset;
}

/*
 * intel_bb_blit_start:
 * @ibb: batchbuffer
 * @flags: flags for the blit command
 *
 * Emits the XY_SRC_COPY_BLT command with a length field appropriate for the
 * gen.
 */
void intel_bb_blit_start(struct intel_bb *ibb, uint32_t flags)
{
	intel_bb_out(ibb,
		     XY_SRC_COPY_BLT_CMD |
		     XY_SRC_COPY_BLT_WRITE_ALPHA |
		     XY_SRC_COPY_BLT_WRITE_RGB |
		     flags |
		     (6 + 2 * (ibb->gen >= 8)));
}
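/*
 * Example (illustrative sketch, not part of the library): stashing a small
 * block of constant data in the batch with intel_bb_copy_data() and keeping
 * the returned in-batch offset for later use. The function name, the payload
 * contents and the 8-byte alignment are arbitrary choices for the example.
 */
static __attribute__((unused)) void
example_intel_bb_copy_data(struct intel_bb *ibb)
{
	static const uint32_t payload[4] = { 0x1, 0x2, 0x3, 0x4 };
	uint32_t offset;

	/* Size must be a multiple of 4 bytes; the in-batch offset of the
	 * copied data is returned. */
	offset = intel_bb_copy_data(ibb, payload, sizeof(payload), 8);
	igt_info("payload stored at batch offset %u\n", offset);
}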
/*
 * intel_bb_emit_blt_copy:
 * @ibb: batchbuffer
 * @src: source buffer (intel_buf)
 * @src_x1: source x1 position
 * @src_y1: source y1 position
 * @src_pitch: source pitch
 * @dst: destination buffer (intel_buf)
 * @dst_x1: destination x1 position
 * @dst_y1: destination y1 position
 * @dst_pitch: destination pitch
 * @width: width of the data to copy
 * @height: height of the data to copy
 * @bpp: bits per pixel
 *
 * Emits a complete blit command.
 */
void intel_bb_emit_blt_copy(struct intel_bb *ibb,
			    struct intel_buf *src,
			    int src_x1, int src_y1, int src_pitch,
			    struct intel_buf *dst,
			    int dst_x1, int dst_y1, int dst_pitch,
			    int width, int height, int bpp)
{
	const int gen = ibb->gen;
	uint32_t cmd_bits = 0;
	uint32_t br13_bits;
	uint32_t mask;

	igt_assert(bpp*(src_x1 + width) <= 8*src_pitch);
	igt_assert(bpp*(dst_x1 + width) <= 8*dst_pitch);
	igt_assert(src_pitch * (src_y1 + height) <= src->surface[0].size);
	igt_assert(dst_pitch * (dst_y1 + height) <= dst->surface[0].size);

	if (gen >= 4 && src->tiling != I915_TILING_NONE) {
		src_pitch /= 4;
		cmd_bits |= XY_SRC_COPY_BLT_SRC_TILED;
	}

	if (gen >= 4 && dst->tiling != I915_TILING_NONE) {
		dst_pitch /= 4;
		cmd_bits |= XY_SRC_COPY_BLT_DST_TILED;
	}

	CHECK_RANGE(src_x1);
	CHECK_RANGE(src_y1);
	CHECK_RANGE(dst_x1);
	CHECK_RANGE(dst_y1);
	CHECK_RANGE(width);
	CHECK_RANGE(height);
	CHECK_RANGE(src_x1 + width);
	CHECK_RANGE(src_y1 + height);
	CHECK_RANGE(dst_x1 + width);
	CHECK_RANGE(dst_y1 + height);
	CHECK_RANGE(src_pitch);
	CHECK_RANGE(dst_pitch);

	br13_bits = 0;
	switch (bpp) {
	case 8:
		break;
	case 16:	/* supporting only RGB565, not ARGB1555 */
		br13_bits |= 1 << 24;
		break;
	case 32:
		br13_bits |= 3 << 24;
		cmd_bits |= (XY_SRC_COPY_BLT_WRITE_ALPHA |
			     XY_SRC_COPY_BLT_WRITE_RGB);
		break;
	default:
		igt_fail(IGT_EXIT_FAILURE);
	}

	if ((src->tiling | dst->tiling) >= I915_TILING_Y) {
		intel_bb_out(ibb, MI_LOAD_REGISTER_IMM);
		intel_bb_out(ibb, BCS_SWCTRL);

		mask = (BCS_SRC_Y | BCS_DST_Y) << 16;
		if (src->tiling == I915_TILING_Y)
			mask |= BCS_SRC_Y;
		if (dst->tiling == I915_TILING_Y)
			mask |= BCS_DST_Y;
		intel_bb_out(ibb, mask);
	}

	intel_bb_add_intel_buf(ibb, src, false);
	intel_bb_add_intel_buf(ibb, dst, true);

	intel_bb_blit_start(ibb, cmd_bits);
	intel_bb_out(ibb, (br13_bits) |
		     (0xcc << 16) | /* copy ROP */
		     dst_pitch);
	intel_bb_out(ibb, (dst_y1 << 16) | dst_x1); /* dst x1,y1 */
	intel_bb_out(ibb, ((dst_y1 + height) << 16) | (dst_x1 + width)); /* dst x2,y2 */
	intel_bb_emit_reloc_fenced(ibb, dst->handle,
				   I915_GEM_DOMAIN_RENDER,
				   I915_GEM_DOMAIN_RENDER,
				   0, dst->addr.offset);
	intel_bb_out(ibb, (src_y1 << 16) | src_x1); /* src x1,y1 */
	intel_bb_out(ibb, src_pitch);
	intel_bb_emit_reloc_fenced(ibb, src->handle,
				   I915_GEM_DOMAIN_RENDER, 0,
				   0, src->addr.offset);

	if (gen >= 6 && src->handle == dst->handle) {
		intel_bb_out(ibb, XY_SETUP_CLIP_BLT_CMD);
		intel_bb_out(ibb, 0);
		intel_bb_out(ibb, 0);
	}

	if ((src->tiling | dst->tiling) >= I915_TILING_Y) {
		igt_assert(ibb->gen >= 6);
		intel_bb_out(ibb, MI_FLUSH_DW | 2);
		intel_bb_out(ibb, 0);
		intel_bb_out(ibb, 0);
		intel_bb_out(ibb, 0);

		intel_bb_out(ibb, MI_LOAD_REGISTER_IMM);
		intel_bb_out(ibb, BCS_SWCTRL);
		intel_bb_out(ibb, (BCS_SRC_Y | BCS_DST_Y) << 16);
	}
}

void intel_bb_blt_copy(struct intel_bb *ibb,
		       struct intel_buf *src,
		       int src_x1, int src_y1, int src_pitch,
		       struct intel_buf *dst,
		       int dst_x1, int dst_y1, int dst_pitch,
		       int width, int height, int bpp)
{
	intel_bb_emit_blt_copy(ibb, src, src_x1, src_y1, src_pitch,
			       dst, dst_x1, dst_y1, dst_pitch,
			       width, height, bpp);
	intel_bb_flush_blit(ibb);
}

/**
 * igt_get_huc_copyfunc:
 * @devid: pci device id
 *
 * Returns:
 *
 * The platform-specific huc copy function pointer for the device specified
 * with @devid. Will return NULL when no huc copy function is implemented.
 */
igt_huc_copyfunc_t igt_get_huc_copyfunc(int devid)
{
	igt_huc_copyfunc_t copy = NULL;

	if (IS_GEN12(devid) || IS_GEN11(devid) || IS_GEN9(devid))
		copy = gen9_huc_copyfunc;

	return copy;
}
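/*
 * Example (illustrative sketch, not part of the library): copying a 256x256
 * 32bpp surface with intel_bb_blt_copy(). The function name, the buffer
 * setup via buf_ops_create()/intel_buf_create(), the linear (untiled) layout
 * and the 4096 byte batch size are assumptions made for the example; real
 * tests normally obtain these from their own fixtures.
 */
static __attribute__((unused)) void
example_intel_bb_blt_copy(int i915)
{
	struct buf_ops *bops = buf_ops_create(i915);
	struct intel_bb *ibb = intel_bb_create(i915, 4096);
	struct intel_buf *src, *dst;

	src = intel_buf_create(bops, 256, 256, 32, 0, I915_TILING_NONE,
			       I915_COMPRESSION_NONE);
	dst = intel_buf_create(bops, 256, 256, 32, 0, I915_TILING_NONE,
			       I915_COMPRESSION_NONE);

	/* Emits the full XY_SRC_COPY_BLT sequence and flushes on the
	 * default/blit ring. */
	intel_bb_blt_copy(ibb,
			  src, 0, 0, src->surface[0].stride,
			  dst, 0, 0, dst->surface[0].stride,
			  256, 256, 32);

	/* Wait for the copy before tearing everything down. */
	intel_bb_sync(ibb);

	intel_buf_destroy(src);
	intel_buf_destroy(dst);
	intel_bb_destroy(ibb);
	buf_ops_destroy(bops);
}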