/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>
#include <time.h>
#include <sys/mman.h>

#include "igt_core.h"
#include "igt_dummyload.h"
#include "igt_gt.h"
#include "intel_chipset.h"
#include "intel_reg.h"
#include "ioctl_wrappers.h"
#include "sw_sync.h"

/**
 * SECTION:igt_dummyload
 * @short_description: Library for submitting GPU workloads
 * @title: Dummyload
 * @include: igt.h
 *
 * A lot of igt testcases need some GPU workload to make sure a race window is
 * big enough. Unfortunately having a fixed amount of workload leads to
 * spurious test failures or overly long runtimes on some fast/slow platforms.
 * This library contains functionality to submit GPU workloads that should
 * consume exactly a specific amount of time.
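 *
 * A rough usage sketch (assuming an already open i915 DRM file descriptor;
 * the engine flag and timeout value are purely illustrative):
 *
 * |[<!-- language="C" -->
 *	igt_spin_t *spin;
 *
 *	spin = igt_spin_batch_new(fd, 0, I915_EXEC_RENDER, 0);
 *	igt_spin_batch_set_timeout(spin, NSEC_PER_SEC);
 *
 *	// ... exercise the race while the render ring stays busy ...
 *
 *	igt_spin_batch_free(fd, spin);
 * ]|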
 */

#define LOCAL_I915_EXEC_BSD_SHIFT	(13)
#define LOCAL_I915_EXEC_BSD_MASK	(3 << LOCAL_I915_EXEC_BSD_SHIFT)

#define ENGINE_MASK  (I915_EXEC_RING_MASK | LOCAL_I915_EXEC_BSD_MASK)

#define MI_ARB_CHK (0x5 << 23)

static const int BATCH_SIZE = 4096;

static IGT_LIST(spin_list);
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
		       uint32_t gem_handle, uint32_t offset,
		       uint32_t read_domains, uint32_t write_domains)
{
	reloc->target_handle = gem_handle;
	reloc->offset = offset * sizeof(uint32_t);
	reloc->read_domains = read_domains;
	reloc->write_domain = write_domains;
}

static int emit_recursive_batch(igt_spin_t *spin,
				int fd, uint32_t ctx, unsigned engine,
				uint32_t dep, bool out_fence)
{
#define SCRATCH 0
#define BATCH 1
	const int gen = intel_gen(intel_get_drm_devid(fd));
	struct drm_i915_gem_exec_object2 obj[2];
	struct drm_i915_gem_relocation_entry relocs[2];
	struct drm_i915_gem_execbuffer2 execbuf;
	unsigned int engines[16];
	unsigned int nengine;
	int fence_fd = -1;
	uint32_t *batch;
	int i;

	nengine = 0;
	if (engine == -1) {
		for_each_engine(fd, engine)
			if (engine)
				engines[nengine++] = engine;
	} else {
		gem_require_ring(fd, engine);
		engines[nengine++] = engine;
	}
	igt_require(nengine);

	memset(&execbuf, 0, sizeof(execbuf));
	memset(obj, 0, sizeof(obj));
	memset(relocs, 0, sizeof(relocs));

	obj[BATCH].handle = gem_create(fd, BATCH_SIZE);
	batch = __gem_mmap__wc(fd, obj[BATCH].handle,
			       0, BATCH_SIZE, PROT_WRITE);
	if (!batch)
		batch = __gem_mmap__gtt(fd, obj[BATCH].handle,
					BATCH_SIZE, PROT_WRITE);
	gem_set_domain(fd, obj[BATCH].handle,
		       I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
	execbuf.buffer_count++;

	if (dep) {
		/* dummy write to dependency */
		obj[SCRATCH].handle = dep;
		fill_reloc(&relocs[obj[BATCH].relocation_count++],
			   dep, 1020,
			   I915_GEM_DOMAIN_RENDER,
			   I915_GEM_DOMAIN_RENDER);
		execbuf.buffer_count++;
	}

	spin->batch = batch;
	spin->handle = obj[BATCH].handle;

	/* Allow ourselves to be preempted */
	*batch++ = MI_ARB_CHK;

	/* Pad with a few nops so that we do not completely hog the system.
	 *
	 * Part of the attraction of using a recursive batch is that it is
	 * hard on the system (executing the "function" call is apparently
	 * quite expensive). However, the GPU may hog the entire system for
	 * a few minutes, preventing even NMI. Quite why this is so is unclear,
	 * but presumably it relates to the PM_INTRMSK workaround on gen6/gen7.
	 * If we give the system a break by having the GPU execute a few nops
	 * between function calls, that appears enough to keep SNB out of
	 * trouble.
	 * See https://bugs.freedesktop.org/show_bug.cgi?id=102262
	 */
	batch += 1000;

	/* recurse */
	fill_reloc(&relocs[obj[BATCH].relocation_count],
		   obj[BATCH].handle, (batch - spin->batch) + 1,
		   I915_GEM_DOMAIN_COMMAND, 0);
	if (gen >= 8) {
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = 0;
		*batch++ = 0;
	} else if (gen >= 6) {
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = 0;
	} else {
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch = 0;
		if (gen < 4) {
			*batch |= 1;
			relocs[obj[BATCH].relocation_count].delta = 1;
		}
		batch++;
	}
	obj[BATCH].relocation_count++;
	obj[BATCH].relocs_ptr = to_user_pointer(relocs);

	execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count));
	execbuf.rsvd1 = ctx;

	if (out_fence)
		execbuf.flags |= I915_EXEC_FENCE_OUT;

	for (i = 0; i < nengine; i++) {
		execbuf.flags &= ~ENGINE_MASK;
		execbuf.flags |= engines[i];
		gem_execbuf_wr(fd, &execbuf);
		if (out_fence) {
			int _fd = execbuf.rsvd2 >> 32;

			igt_assert(_fd >= 0);
			if (fence_fd == -1) {
				fence_fd = _fd;
			} else {
				int old_fd = fence_fd;

				fence_fd = sync_fence_merge(old_fd, _fd);
				close(old_fd);
				close(_fd);
			}
			igt_assert(fence_fd >= 0);
		}
	}

	return fence_fd;
}

static igt_spin_t *
___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep,
		      int out_fence)
{
	igt_spin_t *spin;

	spin = calloc(1, sizeof(struct igt_spin));
	igt_assert(spin);

	spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep,
					       out_fence);
	igt_assert(gem_bo_busy(fd, spin->handle));

	pthread_mutex_lock(&list_lock);
	igt_list_add(&spin->link, &spin_list);
	pthread_mutex_unlock(&list_lock);

	return spin;
}

igt_spin_t *
__igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
{
	return ___igt_spin_batch_new(fd, ctx, engine, dep, false);
}

/**
 * igt_spin_batch_new:
 * @fd: open i915 drm file descriptor
 * @ctx: GEM context to submit with, 0 for the default context
 * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
 *          than 0, execute on all available rings.
 * @dep: handle to a buffer object dependency. If greater than 0, add a
 *       relocation entry to this buffer within the batch.
 *
 * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
 * contains the batch's handle that can be waited upon. The returned structure
 * must be passed to igt_spin_batch_free() for post-processing.
 *
 * Returns:
 * Structure with helper internal state for igt_spin_batch_free().
 */
igt_spin_t *
igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep)
{
	igt_require_gem(fd);

	return __igt_spin_batch_new(fd, ctx, engine, dep);
}

igt_spin_t *
__igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
{
	return ___igt_spin_batch_new(fd, ctx, engine, 0, true);
}

/**
 * igt_spin_batch_new_fence:
 * @fd: open i915 drm file descriptor
 * @ctx: GEM context to submit with, 0 for the default context
 * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
 *          than 0, execute on all available rings.
 *
 * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
 * contains the batch's handle that can be waited upon. The returned structure
 * must be passed to igt_spin_batch_free() for post-processing.
 *
 * igt_spin_t will contain an output fence associated with this batch.
 *
 * Returns:
 * Structure with helper internal state for igt_spin_batch_free().
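 *
 * A rough sketch of how the output fence might be consumed (error handling
 * omitted; a sync_file fd becomes readable once the fence has signalled):
 *
 * |[<!-- language="C" -->
 *	igt_spin_t *spin = igt_spin_batch_new_fence(fd, 0, I915_EXEC_RENDER);
 *	struct pollfd pfd = { .fd = spin->out_fence, .events = POLLIN };
 *
 *	igt_spin_batch_end(spin);
 *	poll(&pfd, 1, -1);	// wait for the batch to complete
 *	igt_spin_batch_free(fd, spin);
 * ]|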
 */
igt_spin_t *
igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine)
{
	igt_require_gem(fd);
	igt_require(gem_has_exec_fence(fd));

	return __igt_spin_batch_new_fence(fd, ctx, engine);
}

static void notify(union sigval arg)
{
	igt_spin_t *spin = arg.sival_ptr;

	igt_spin_batch_end(spin);
}

/**
 * igt_spin_batch_set_timeout:
 * @spin: spin batch state from igt_spin_batch_new()
 * @ns: amount of time in nanoseconds the batch continues to execute
 *      before finishing.
 *
 * Specify a timeout. This ends the recursive batch associated with @spin after
 * the timeout has elapsed.
 */
void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns)
{
	timer_t timer;
	struct sigevent sev;
	struct itimerspec its;

	igt_assert(ns > 0);
	if (!spin)
		return;

	igt_assert(!spin->timer);

	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_THREAD;
	sev.sigev_value.sival_ptr = spin;
	sev.sigev_notify_function = notify;
	igt_assert(timer_create(CLOCK_MONOTONIC, &sev, &timer) == 0);
	igt_assert(timer);

	memset(&its, 0, sizeof(its));
	its.it_value.tv_sec = ns / NSEC_PER_SEC;
	its.it_value.tv_nsec = ns % NSEC_PER_SEC;
	igt_assert(timer_settime(timer, 0, &its, NULL) == 0);

	spin->timer = timer;
}

/**
 * igt_spin_batch_end:
 * @spin: spin batch state from igt_spin_batch_new()
 *
 * End the recursive batch associated with @spin manually.
 */
void igt_spin_batch_end(igt_spin_t *spin)
{
	if (!spin)
		return;

	*spin->batch = MI_BATCH_BUFFER_END;
	__sync_synchronize();
}

/**
 * igt_spin_batch_free:
 * @fd: open i915 drm file descriptor
 * @spin: spin batch state from igt_spin_batch_new()
 *
 * This function does the necessary post-processing after starting a recursive
 * batch with igt_spin_batch_new().
 */
void igt_spin_batch_free(int fd, igt_spin_t *spin)
{
	if (!spin)
		return;

	pthread_mutex_lock(&list_lock);
	igt_list_del(&spin->link);
	pthread_mutex_unlock(&list_lock);

	if (spin->timer)
		timer_delete(spin->timer);

	igt_spin_batch_end(spin);
	gem_munmap(spin->batch, BATCH_SIZE);

	gem_close(fd, spin->handle);

	if (spin->out_fence >= 0)
		close(spin->out_fence);

	free(spin);
}

/**
 * igt_terminate_spin_batches:
 *
 * End all currently tracked spin batches by writing MI_BATCH_BUFFER_END into
 * each of them, so that no recursive batch is left keeping the GPU busy.
 */
void igt_terminate_spin_batches(void)
{
	struct igt_spin *iter;

	pthread_mutex_lock(&list_lock);
	igt_list_for_each(iter, &spin_list, link)
		igt_spin_batch_end(iter);
	pthread_mutex_unlock(&list_lock);
}
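
/*
 * A minimal sketch of how igt_terminate_spin_batches() might be hooked into a
 * test's abort path (the abort_spins() wrapper below is purely illustrative
 * and not part of this library):
 *
 *	static void abort_spins(int sig)
 *	{
 *		igt_terminate_spin_batches();
 *	}
 *
 *	igt_install_exit_handler(abort_spins);
 */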